Merge pull request #8568 from ThomasWaldmann/remove-chunks-healthy

Item: remove .chunks_healthy, fixes #8559
This commit is contained in:
TW 2025-04-11 21:11:42 +02:00 committed by GitHub
commit e12b3bb2f6
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
20 changed files with 227 additions and 311 deletions

View file

@ -480,8 +480,8 @@ Refer to the *borg list* documentation for the available keys and their meaning.
Example (excerpt) of ``borg list --json-lines``::
{"type": "d", "mode": "drwxr-xr-x", "user": "user", "group": "user", "uid": 1000, "gid": 1000, "path": "linux", "healthy": true, "target": "", "flags": null, "mtime": "2017-02-27T12:27:20.023407", "size": 0}
{"type": "d", "mode": "drwxr-xr-x", "user": "user", "group": "user", "uid": 1000, "gid": 1000, "path": "linux/baz", "healthy": true, "target": "", "flags": null, "mtime": "2017-02-27T12:27:20.585407", "size": 0}
{"type": "d", "mode": "drwxr-xr-x", "user": "user", "group": "user", "uid": 1000, "gid": 1000, "path": "linux", "target": "", "flags": null, "mtime": "2017-02-27T12:27:20.023407", "size": 0}
{"type": "d", "mode": "drwxr-xr-x", "user": "user", "group": "user", "uid": 1000, "gid": 1000, "path": "linux/baz", "target": "", "flags": null, "mtime": "2017-02-27T12:27:20.585407", "size": 0}
Archive Differencing

View file

@ -273,14 +273,16 @@ class DownloadPipeline:
"""
self.hlids_preloaded = set()
unpacker = msgpack.Unpacker(use_list=False)
for data in self.fetch_many(ids, ro_type=ROBJ_ARCHIVE_STREAM):
for data in self.fetch_many(ids, ro_type=ROBJ_ARCHIVE_STREAM, replacement_chunk=False):
if data is None:
continue # archive stream chunk missing
unpacker.feed(data)
for _item in unpacker:
item = Item(internal_dict=_item)
if filter is None or filter(item):
if "chunks" in item:
item.chunks = [ChunkListEntry(*e) for e in item.chunks]
if "chunks_healthy" in item:
if "chunks_healthy" in item: # legacy
item.chunks_healthy = [ChunkListEntry(*e) for e in item.chunks_healthy]
yield item
@ -312,10 +314,32 @@ class DownloadPipeline:
self.repository.preload([c.id for c in item.chunks])
return preload_chunks
def fetch_many(self, ids, is_preloaded=False, ro_type=None):
def fetch_many(self, chunks, is_preloaded=False, ro_type=None, replacement_chunk=True):
assert ro_type is not None
for id_, cdata in zip(ids, self.repository.get_many(ids, is_preloaded=is_preloaded)):
_, data = self.repo_objs.parse(id_, cdata, ro_type=ro_type)
ids = []
sizes = []
if all(isinstance(chunk, ChunkListEntry) for chunk in chunks):
for chunk in chunks:
ids.append(chunk.id)
sizes.append(chunk.size)
elif all(isinstance(chunk, bytes) for chunk in chunks):
ids = list(chunks)
sizes = [None] * len(ids)
else:
raise TypeError(f"unsupported or mixed element types: {chunks}")
for id, size, cdata in zip(
ids, sizes, self.repository.get_many(ids, is_preloaded=is_preloaded, raise_missing=False)
):
if cdata is None:
if replacement_chunk and size is not None:
logger.error(f"repository object {bin_to_hex(id)} missing, returning {size} zero bytes.")
data = zeros[:size] # return an all-zero replacement chunk of correct size
else:
logger.error(f"repository object {bin_to_hex(id)} missing, returning None.")
data = None
else:
_, data = self.repo_objs.parse(id, cdata, ro_type=ro_type)
assert size is None or len(data) == size
yield data
@ -762,7 +786,6 @@ Duration: {0.duration}
# if a previous extraction was interrupted between setting the mtime and setting non-default flags.
return True
has_damaged_chunks = "chunks_healthy" in item
if dry_run or stdout:
with self.extract_helper(item, "", hlm, dry_run=dry_run or stdout) as hardlink_set:
if not hardlink_set:
@ -771,9 +794,7 @@ Duration: {0.duration}
# it would get stuck.
if "chunks" in item:
item_chunks_size = 0
for data in self.pipeline.fetch_many(
[c.id for c in item.chunks], is_preloaded=True, ro_type=ROBJ_FILE_STREAM
):
for data in self.pipeline.fetch_many(item.chunks, is_preloaded=True, ro_type=ROBJ_FILE_STREAM):
if pi:
pi.show(increase=len(data), info=[remove_surrogates(item.path)])
if stdout:
@ -789,8 +810,6 @@ Duration: {0.duration}
item_size, item_chunks_size
)
)
if has_damaged_chunks:
raise BackupError("File has damaged (all-zero) chunks. Try running borg check --repair.")
return
dest = self.cwd
@ -824,8 +843,7 @@ Duration: {0.duration}
with backup_io("open"):
fd = open(path, "wb")
with fd:
ids = [c.id for c in item.chunks]
for data in self.pipeline.fetch_many(ids, is_preloaded=True, ro_type=ROBJ_FILE_STREAM):
for data in self.pipeline.fetch_many(item.chunks, is_preloaded=True, ro_type=ROBJ_FILE_STREAM):
if pi:
pi.show(increase=len(data), info=[remove_surrogates(item.path)])
with backup_io("write"):
@ -845,8 +863,6 @@ Duration: {0.duration}
raise BackupError(
f"Size inconsistency detected: size {item_size}, chunks size {item_chunks_size}"
)
if has_damaged_chunks:
raise BackupError("File has damaged (all-zero) chunks. Try running borg check --repair.")
return
with backup_io:
# No repository access beyond this point.
@ -1010,8 +1026,8 @@ Duration: {0.duration}
path,
item1,
item2,
archive1.pipeline.fetch_many([c.id for c in item1.get("chunks", [])], ro_type=ROBJ_FILE_STREAM),
archive2.pipeline.fetch_many([c.id for c in item2.get("chunks", [])], ro_type=ROBJ_FILE_STREAM),
archive1.pipeline.fetch_many(item1.get("chunks", []), ro_type=ROBJ_FILE_STREAM),
archive2.pipeline.fetch_many(item2.get("chunks", []), ro_type=ROBJ_FILE_STREAM),
can_compare_chunk_ids=can_compare_chunk_ids,
)
@ -1159,10 +1175,6 @@ class ChunksProcessor:
return chunk_entry
item.chunks = []
# if we rechunkify, we'll get a fundamentally different chunks list, thus we need
# to get rid of .chunks_healthy, as it might not correspond to .chunks any more.
if self.rechunkify and "chunks_healthy" in item:
del item.chunks_healthy
for chunk in chunk_iter:
chunk_entry = chunk_processor(chunk)
item.chunks.append(chunk_entry)
@ -1779,13 +1791,10 @@ class ArchiveChecker:
if defect_chunks:
if self.repair:
# if we kill the defect chunk here, subsequent actions within this "borg check"
# run will find missing chunks and replace them with all-zero replacement
# chunks and flag the files as "repaired".
# if another backup is done later and the missing chunks get backed up again,
# a "borg check" afterwards can heal all files where this chunk was missing.
# run will find missing chunks.
logger.warning(
"Found defect chunks. They will be deleted now, so affected files can "
"get repaired now and maybe healed later."
"Found defect chunks and will delete them now. "
"Reading files referencing these chunks will result in an I/O error."
)
for defect_chunk in defect_chunks:
# remote repo (ssh): retry might help for strange network / NIC / RAM errors
@ -1805,10 +1814,7 @@ class ArchiveChecker:
else:
logger.warning("chunk %s not deleted, did not consistently fail.", bin_to_hex(defect_chunk))
else:
logger.warning(
"Found defect chunks. With --repair, they would get deleted, so affected "
"files could get repaired then and maybe healed later."
)
logger.warning("Found defect chunks. With --repair, they would get deleted.")
for defect_chunk in defect_chunks:
logger.debug("chunk %s is defect.", bin_to_hex(defect_chunk))
log = logger.error if errors else logger.info
@ -1919,80 +1925,18 @@ class ArchiveChecker:
self.repository.put(id_, cdata)
def verify_file_chunks(archive_name, item):
"""Verifies that all file chunks are present.
Missing file chunks will be replaced with new chunks of the same length containing all zeros.
If a previously missing file chunk re-appears, the replacement chunk is replaced by the correct one.
"""
def replacement_chunk(size):
chunk = Chunk(None, allocation=CH_ALLOC, size=size)
chunk_id, data = cached_hash(chunk, self.key.id_hash)
cdata = self.repo_objs.format(chunk_id, {}, data, ro_type=ROBJ_FILE_STREAM)
return chunk_id, size, cdata
"""Verifies that all file chunks are present. Missing file chunks will be logged."""
offset = 0
chunk_list = []
chunks_replaced = False
has_chunks_healthy = "chunks_healthy" in item
chunks_current = item.chunks
chunks_healthy = item.chunks_healthy if has_chunks_healthy else chunks_current
if has_chunks_healthy and len(chunks_current) != len(chunks_healthy):
# should never happen, but there was issue #3218.
logger.warning(f"{archive_name}: {item.path}: Invalid chunks_healthy metadata removed!")
del item.chunks_healthy
has_chunks_healthy = False
chunks_healthy = chunks_current
for chunk_current, chunk_healthy in zip(chunks_current, chunks_healthy):
chunk_id, size = chunk_healthy
for chunk in item.chunks:
chunk_id, size = chunk
if chunk_id not in self.chunks:
# a chunk of the healthy list is missing
if chunk_current == chunk_healthy:
logger.error(
"{}: {}: New missing file chunk detected (Byte {}-{}, Chunk {}). "
"Replacing with all-zero chunk.".format(
archive_name, item.path, offset, offset + size, bin_to_hex(chunk_id)
)
logger.error(
"{}: {}: Missing file chunk detected (Byte {}-{}, Chunk {}).".format(
archive_name, item.path, offset, offset + size, bin_to_hex(chunk_id)
)
self.error_found = chunks_replaced = True
chunk_id, size, cdata = replacement_chunk(size)
add_reference(chunk_id, size, cdata)
else:
logger.info(
"{}: {}: Previously missing file chunk is still missing (Byte {}-{}, Chunk {}). "
"It has an all-zero replacement chunk already.".format(
archive_name, item.path, offset, offset + size, bin_to_hex(chunk_id)
)
)
chunk_id, size = chunk_current
if chunk_id not in self.chunks:
logger.warning(
"{}: {}: Missing all-zero replacement chunk detected (Byte {}-{}, Chunk {}). "
"Generating new replacement chunk.".format(
archive_name, item.path, offset, offset + size, bin_to_hex(chunk_id)
)
)
self.error_found = chunks_replaced = True
chunk_id, size, cdata = replacement_chunk(size)
add_reference(chunk_id, size, cdata)
else:
if chunk_current == chunk_healthy:
pass # normal case, all fine.
else:
logger.info(
"{}: {}: Healed previously missing file chunk! (Byte {}-{}, Chunk {}).".format(
archive_name, item.path, offset, offset + size, bin_to_hex(chunk_id)
)
)
chunk_list.append([chunk_id, size]) # list-typed element as chunks_healthy is list-of-lists
)
self.error_found = True
offset += size
if chunks_replaced and not has_chunks_healthy:
# if this is first repair, remember the correct chunk IDs, so we can maybe heal the file later
item.chunks_healthy = item.chunks
if has_chunks_healthy and chunk_list == chunks_healthy:
logger.info(f"{archive_name}: {item.path}: Completely healed previously damaged file!")
del item.chunks_healthy
item.chunks = chunk_list
if "size" in item:
item_size = item.size
item_chunks_size = item.get_size(from_chunks=True)
@ -2270,7 +2214,7 @@ class ArchiveRecreater:
return chunk_entry
def iter_chunks(self, archive, target, chunks):
chunk_iterator = archive.pipeline.fetch_many([chunk_id for chunk_id, _ in chunks], ro_type=ROBJ_FILE_STREAM)
chunk_iterator = archive.pipeline.fetch_many(chunks, ro_type=ROBJ_FILE_STREAM)
if target.recreate_rechunkify:
# The target.chunker will read the file contents through ChunkIteratorFileWrapper chunk-by-chunk
# (does not load the entire file into memory)

View file

@ -168,28 +168,7 @@ class CheckMixIn:
2. When checking the consistency and correctness of archives, repair mode might
remove whole archives from the manifest if their archive metadata chunk is
corrupt or lost. On a chunk level (i.e. the contents of files), repair mode
will replace corrupt or lost chunks with a same-size replacement chunk of
zeroes. If a previously zeroed chunk reappears, repair mode will restore
this lost chunk using the new chunk.
Most steps taken by repair mode have a one-time effect on the repository, like
removing a lost archive from the repository. However, replacing a corrupt or
lost chunk with an all-zero replacement will have an ongoing effect on the
repository: When attempting to extract a file referencing an all-zero chunk,
the ``extract`` command will distinctly warn about it. The FUSE filesystem
created by the ``mount`` command will reject reading such a "zero-patched"
file unless a special mount option is given.
As mentioned earlier, Borg might be able to "heal" a "zero-patched" file in
repair mode, if all its previously lost chunks reappear (e.g. via a later
backup). This is achieved by Borg not only keeping track of the all-zero
replacement chunks, but also by keeping metadata about the lost chunks. In
repair mode Borg will check whether a previously lost chunk reappeared and will
replace the all-zero replacement chunk by the reappeared chunk. If all lost
chunks of a "zero-patched" file reappear, this effectively "heals" the file.
Consequently, if lost chunks were repaired earlier, it is advised to run
``--repair`` a second time after creating some new backups.
corrupt or lost. Borg will also report files that reference missing chunks.
If ``--repair --find-lost-archives`` is given, previously lost entries will
be recreated in the archive directory. This is only possible before

View file

@ -6,7 +6,7 @@ from ..archive import Archive
from ..cache import write_chunkindex_to_repo_cache, build_chunkindex_from_repo
from ..constants import * # NOQA
from ..hashindex import ChunkIndex, ChunkIndexEntry
from ..helpers import set_ec, EXIT_WARNING, EXIT_ERROR, format_file_size, bin_to_hex
from ..helpers import set_ec, EXIT_ERROR, format_file_size, bin_to_hex
from ..helpers import ProgressIndicatorPercent
from ..manifest import Manifest
from ..remote import RemoteRepository
@ -39,9 +39,7 @@ class ArchiveGarbageCollector:
logger.info("Starting compaction / garbage collection...")
self.chunks = self.get_repository_chunks()
logger.info("Computing object IDs used by archives...")
(self.missing_chunks, self.reappeared_chunks, self.total_files, self.total_size, self.archives_count) = (
self.analyze_archives()
)
(self.missing_chunks, self.total_files, self.total_size, self.archives_count) = self.analyze_archives()
self.report_and_delete()
self.save_chunk_index()
logger.info("Finished compaction / garbage collection...")
@ -73,28 +71,24 @@ class ArchiveGarbageCollector:
self.chunks.clear() # we already have updated the repo cache in get_repository_chunks
self.chunks = None # nothing there (cleared!)
def analyze_archives(self) -> Tuple[Set, Set, int, int, int]:
"""Iterate over all items in all archives, create the dicts id -> size of all used/wanted chunks."""
def analyze_archives(self) -> Tuple[Set, int, int, int]:
"""Iterate over all items in all archives, create the dicts id -> size of all used chunks."""
def use_it(id, *, wanted=False):
def use_it(id):
entry = self.chunks.get(id)
if entry is not None:
# the chunk is in the repo, mark it used.
self.chunks[id] = entry._replace(flags=entry.flags | ChunkIndex.F_USED)
if wanted:
# chunk id is from chunks_healthy list: a lost chunk has re-appeared!
reappeared_chunks.add(id)
else:
# with --stats: we do NOT have this chunk in the repository!
# without --stats: we do not have this chunk or the chunks index is incomplete.
missing_chunks.add(id)
missing_chunks: set[bytes] = set()
reappeared_chunks: set[bytes] = set()
archive_infos = self.manifest.archives.list(sort_by=["ts"])
num_archives = len(archive_infos)
pi = ProgressIndicatorPercent(
total=num_archives, msg="Computing used/wanted chunks %3.1f%%", step=0.1, msgid="compact.analyze_archives"
total=num_archives, msg="Computing used chunks %3.1f%%", step=0.1, msgid="compact.analyze_archives"
)
total_size, total_files = 0, 0
for i, info in enumerate(archive_infos):
@ -114,25 +108,14 @@ class ArchiveGarbageCollector:
for id, size in item.chunks:
total_size += size # original, uncompressed file content size
use_it(id)
if "chunks_healthy" in item:
# we also consider the chunks_healthy chunks as referenced - do not throw away
# anything that borg check --repair might still need.
for id, size in item.chunks_healthy:
use_it(id, wanted=True)
pi.finish()
return missing_chunks, reappeared_chunks, total_files, total_size, num_archives
return missing_chunks, total_files, total_size, num_archives
def report_and_delete(self):
run_repair = " Run borg check --repair!"
if self.missing_chunks:
logger.error(f"Repository has {len(self.missing_chunks)} missing objects." + run_repair)
logger.error(f"Repository has {len(self.missing_chunks)} missing objects!")
set_ec(EXIT_ERROR)
if self.reappeared_chunks:
logger.warning(f"{len(self.reappeared_chunks)} previously missing objects re-appeared!" + run_repair)
set_ec(EXIT_WARNING)
logger.info("Cleaning archives directory from soft-deleted archives...")
archive_infos = self.manifest.archives.list(sort_by=["ts"], deleted=True)
for archive_info in archive_infos:

View file

@ -104,9 +104,9 @@ class MountMixIn:
- ``versions``: when used with a repository mount, this gives a merged, versioned
view of the files in the archives. EXPERIMENTAL, layout may change in future.
- ``allow_damaged_files``: by default damaged files (where missing chunks were
replaced with runs of zeros by ``borg check --repair``) are not readable and
return EIO (I/O error). Set this option to read such files.
- ``allow_damaged_files``: by default damaged files (where chunks are missing)
will return EIO (I/O error) when trying to read the related parts of the file.
Set this option to replace the missing parts with all-zero bytes.
- ``ignore_permissions``: for security reasons the ``default_permissions`` mount
option is internally enforced by borg. ``ignore_permissions`` can be given to
not enforce ``default_permissions``.

View file

@ -95,16 +95,10 @@ class RecreateMixIn:
at least the entire deduplicated size of the archives using the previous
chunker params.
If you recently ran borg check --repair and it had to fix lost chunks with all-zero
replacement chunks, please first run another backup for the same data and re-run
borg check --repair afterwards to heal any archives that had lost chunks which are
still generated from the input data.
Important: running borg recreate to re-chunk will remove the chunks_healthy
metadata of all items with replacement chunks, so healing will not be possible
any more after re-chunking (it is also unlikely it would ever work: due to the
change of chunking parameters, the missing chunk likely will never be seen again
even if you still have the data that produced it).
If your most recent borg check found missing chunks, please first run another
backup for the same data before doing any rechunking. If you are lucky, that
will re-create the missing chunks. Optionally, do another borg check to see
if the chunks are still missing.
"""
)
subparser = subparsers.add_parser(

View file

@ -113,9 +113,7 @@ class TarMixIn:
"""
Return a file-like object that reads from the chunks of *item*.
"""
chunk_iterator = archive.pipeline.fetch_many(
[chunk_id for chunk_id, _ in item.chunks], is_preloaded=True, ro_type=ROBJ_FILE_STREAM
)
chunk_iterator = archive.pipeline.fetch_many(item.chunks, is_preloaded=True, ro_type=ROBJ_FILE_STREAM)
if pi:
info = [remove_surrogates(item.path)]
return ChunkIteratorFileWrapper(

View file

@ -9,6 +9,8 @@ from ..helpers import Error
from ..helpers import location_validator, Location, archivename_validator, comment_validator
from ..helpers import format_file_size, bin_to_hex
from ..manifest import Manifest
from ..legacyrepository import LegacyRepository
from ..repository import Repository
from ..logger import create_logger
@ -111,51 +113,64 @@ class TransferMixIn:
# so let's remove them from old archives also, considering there is no
# code any more that deals with them in special ways (e.g. to get stats right).
continue
if "chunks" in item:
if "chunks_healthy" in item: # legacy
other_chunks = item.chunks_healthy # chunks_healthy has the CORRECT chunks list, if present.
elif "chunks" in item:
other_chunks = item.chunks
else:
other_chunks = None
if other_chunks is not None:
chunks = []
for chunk_id, size in item.chunks:
for chunk_id, size in other_chunks:
chunk_present = cache.seen_chunk(chunk_id, size)
if not chunk_present: # target repo does not yet have this chunk
if not dry_run:
cdata = other_repository.get(chunk_id)
if args.recompress == "never":
# keep compressed payload same, verify via assert_id (that will
# decompress, but avoid needing to compress it again):
meta, data = other_manifest.repo_objs.parse(
chunk_id,
cdata,
decompress=True,
want_compressed=True,
ro_type=ROBJ_FILE_STREAM,
)
meta, data = upgrader.upgrade_compressed_chunk(meta, data)
chunk_entry = cache.add_chunk(
chunk_id,
meta,
data,
stats=archive.stats,
wait=False,
compress=False,
size=size,
ctype=meta["ctype"],
clevel=meta["clevel"],
ro_type=ROBJ_FILE_STREAM,
)
elif args.recompress == "always":
# always decompress and re-compress file data chunks
meta, data = other_manifest.repo_objs.parse(
chunk_id, cdata, ro_type=ROBJ_FILE_STREAM
)
chunk_entry = cache.add_chunk(
chunk_id,
meta,
data,
stats=archive.stats,
wait=False,
ro_type=ROBJ_FILE_STREAM,
)
try:
cdata = other_repository.get(chunk_id)
except (Repository.ObjectNotFound, LegacyRepository.ObjectNotFound):
# missing correct chunk in other_repository (source) will result in
# a missing chunk in repository (destination).
# we do NOT want to transfer all-zero replacement chunks from borg1 repos.
pass
else:
raise ValueError(f"unsupported recompress mode: {args.recompress}")
if args.recompress == "never":
# keep compressed payload same, verify via assert_id (that will
# decompress, but avoid needing to compress it again):
meta, data = other_manifest.repo_objs.parse(
chunk_id,
cdata,
decompress=True,
want_compressed=True,
ro_type=ROBJ_FILE_STREAM,
)
meta, data = upgrader.upgrade_compressed_chunk(meta, data)
chunk_entry = cache.add_chunk(
chunk_id,
meta,
data,
stats=archive.stats,
wait=False,
compress=False,
size=size,
ctype=meta["ctype"],
clevel=meta["clevel"],
ro_type=ROBJ_FILE_STREAM,
)
elif args.recompress == "always":
# always decompress and re-compress file data chunks
meta, data = other_manifest.repo_objs.parse(
chunk_id, cdata, ro_type=ROBJ_FILE_STREAM
)
chunk_entry = cache.add_chunk(
chunk_id,
meta,
data,
stats=archive.stats,
wait=False,
ro_type=ROBJ_FILE_STREAM,
)
else:
raise ValueError(f"unsupported recompress mode: {args.recompress}")
cache.repository.async_response(wait=False)
chunks.append(chunk_entry)
transfer_size += size
@ -165,7 +180,7 @@ class TransferMixIn:
chunks.append(chunk_entry)
present_size += size
if not dry_run:
item.chunks = chunks # TODO: overwrite? IDs and sizes are same.
item.chunks = chunks
archive.stats.nfiles += 1
if not dry_run:
item = upgrader.upgrade_item(item=item)

View file

@ -10,7 +10,7 @@ import time
from collections import defaultdict, Counter
from signal import SIGINT
from .constants import ROBJ_FILE_STREAM
from .constants import ROBJ_FILE_STREAM, zeros
from .fuse_impl import llfuse, has_pyfuse3
@ -46,6 +46,7 @@ from .helpers.lrucache import LRUCache
from .item import Item
from .platform import uid2user, gid2group
from .platformflags import is_darwin
from .repository import Repository
from .remote import RemoteRepository
@ -652,17 +653,6 @@ class FuseOperations(llfuse.Operations, FuseBackend):
@async_wrapper
def open(self, inode, flags, ctx=None):
if not self.allow_damaged_files:
item = self.get_item(inode)
if "chunks_healthy" in item:
# Processed archive items don't carry the path anymore; for converting the inode
# to the path we'd either have to store the inverse of the current structure,
# or search the entire archive. So we just don't print it. It's easy to correlate anyway.
logger.warning(
"File has damaged (all-zero) chunks. Try running borg check --repair. "
"Mount with allow_damaged_files to read damaged files."
)
raise llfuse.FUSEError(errno.EIO)
return llfuse.FileInfo(fh=inode) if has_pyfuse3 else inode
@async_wrapper
@ -699,7 +689,16 @@ class FuseOperations(llfuse.Operations, FuseBackend):
# evict fully read chunk from cache
del self.data_cache[id]
else:
_, data = self.repo_objs.parse(id, self.repository_uncached.get(id), ro_type=ROBJ_FILE_STREAM)
try:
cdata = self.repository_uncached.get(id)
except Repository.ObjectNotFound:
if self.allow_damaged_files:
data = zeros[:s]
assert len(data) == s
else:
raise llfuse.FUSEError(errno.EIO) from None
else:
_, data = self.repo_objs.parse(id, cdata, ro_type=ROBJ_FILE_STREAM)
if offset + n < len(data):
# chunk was only partially read, cache it
self.data_cache[id] = data

View file

@ -308,8 +308,8 @@ class HardLinkManager:
If we encounter the same hlid again later, we hardlink to the path of the already extracted content of same hlid.
C) When transferring from a borg1 archive, we need:
path -> chunks, chunks_healthy # for borg1_hl_targets
If we encounter a regular file item with source == path later, we reuse chunks and chunks_healthy
path -> chunks_correct # for borg1_hl_targets, chunks_correct must be either from .chunks_healthy or .chunks.
If we encounter a regular file item with source == path later, we reuse chunks_correct
and create the same hlid = hardlink_id_from_path(source).
D) When importing a tar file (simplified 1-pass way for now, not creating borg hardlink items):
@ -353,7 +353,7 @@ class HardLinkManager:
a hlid (new borg style) [bytes]
a (dev, inode) tuple (filesystem)
:param info: information to remember, could be:
chunks / chunks_healthy list
chunks list
hlid
"""
assert isinstance(id, self.id_type), f"id is {id!r}, not of type {self.id_type}"

View file

@ -124,7 +124,7 @@ class ChunkIteratorFileWrapper:
def open_item(archive, item):
"""Return file-like object for archived item (with chunks)."""
chunk_iterator = archive.pipeline.fetch_many([c.id for c in item.chunks], ro_type=ROBJ_FILE_STREAM)
chunk_iterator = archive.pipeline.fetch_many(item.chunks, ro_type=ROBJ_FILE_STREAM)
return ChunkIteratorFileWrapper(chunk_iterator)

View file

@ -827,7 +827,6 @@ class ItemFormatter(BaseFormatter):
"isoctime": "file change time (ISO 8601 format)",
"isoatime": "file access time (ISO 8601 format)",
"xxh64": "XXH64 checksum of this file (note: this is NOT a cryptographic hash!)",
"health": 'either "healthy" (file ok) or "broken" (if file has all-zero replacement chunks)',
"archiveid": "internal ID of the archive",
"archivename": "name of the archive",
}
@ -837,7 +836,6 @@ class ItemFormatter(BaseFormatter):
("mtime", "ctime", "atime", "isomtime", "isoctime", "isoatime"),
tuple(sorted(hash_algorithms)),
("archiveid", "archivename", "extra"),
("health",),
)
KEYS_REQUIRING_CACHE = ()
@ -894,10 +892,6 @@ class ItemFormatter(BaseFormatter):
item_data.update(text_to_json("user", item.get("user", str(item_data["uid"]))))
item_data.update(text_to_json("group", item.get("group", str(item_data["gid"]))))
if jsonline:
item_data["healthy"] = "chunks_healthy" not in item
else:
item_data["health"] = "broken" if "chunks_healthy" in item else "healthy"
item_data["flags"] = item.get("bsdflags") # int if flags known, else (if flags unknown) None
for key in self.used_call_keys:
item_data[key] = self.call_keys[key](item)
@ -917,7 +911,7 @@ class ItemFormatter(BaseFormatter):
hash = self.xxh64()
elif hash_function in self.hash_algorithms:
hash = hashlib.new(hash_function)
for data in self.archive.pipeline.fetch_many([c.id for c in item.chunks], ro_type=ROBJ_FILE_STREAM):
for data in self.archive.pipeline.fetch_many(item.chunks, ro_type=ROBJ_FILE_STREAM):
hash.update(data)
return hash.hexdigest()

View file

@ -1202,18 +1202,21 @@ class LegacyRepository:
self.index = self.open_index(self.get_transaction_id())
return [id_ for id_, _ in islice(self.index.iteritems(marker=marker), limit)]
def get(self, id, read_data=True):
def get(self, id, read_data=True, raise_missing=True):
if not self.index:
self.index = self.open_index(self.get_transaction_id())
try:
in_index = NSIndex1Entry(*(self.index[id][:2])) # legacy: index entries have no size element
return self.io.read(in_index.segment, in_index.offset, id, read_data=read_data)
except KeyError:
raise self.ObjectNotFound(id, self.path) from None
if raise_missing:
raise self.ObjectNotFound(id, self.path) from None
else:
return None
def get_many(self, ids, read_data=True, is_preloaded=False):
def get_many(self, ids, read_data=True, is_preloaded=False, raise_missing=True):
for id_ in ids:
yield self.get(id_, read_data=read_data)
yield self.get(id_, read_data=read_data, raise_missing=raise_missing)
def put(self, id, data, wait=True):
"""put a repo object

View file

@ -943,7 +943,9 @@ class RemoteRepository:
self.to_send.push_back(msgpack.packb({MSGID: self.msgid, MSG: cmd, ARGS: args}))
if not self.to_send and self.preload_ids:
chunk_id = self.preload_ids.pop(0)
args = {"id": chunk_id}
# for preloading chunks, the raise_missing behaviour is defined HERE,
# not in the get_many / fetch_many call that later fetches the preloaded chunks.
args = {"id": chunk_id, "raise_missing": False}
self.msgid += 1
self.chunkid_to_msgids.setdefault(chunk_id, []).append(self.msgid)
self.to_send.push_back(msgpack.packb({MSGID: self.msgid, MSG: "get", ARGS: args}))
@ -991,12 +993,16 @@ class RemoteRepository:
def list(self, limit=None, marker=None):
"""actual remoting is done via self.call in the @api decorator"""
def get(self, id, read_data=True):
for resp in self.get_many([id], read_data=read_data):
def get(self, id, read_data=True, raise_missing=True):
for resp in self.get_many([id], read_data=read_data, raise_missing=raise_missing):
return resp
def get_many(self, ids, read_data=True, is_preloaded=False):
yield from self.call_many("get", [{"id": id, "read_data": read_data} for id in ids], is_preloaded=is_preloaded)
def get_many(self, ids, read_data=True, is_preloaded=False, raise_missing=True):
yield from self.call_many(
"get",
[{"id": id, "read_data": read_data, "raise_missing": raise_missing} for id in ids],
is_preloaded=is_preloaded,
)
@api(since=parse_version("1.0.0"))
def put(self, id, data, wait=True):
@ -1098,11 +1104,11 @@ class RepositoryNoCache:
def __exit__(self, exc_type, exc_val, exc_tb):
self.close()
def get(self, key, read_data=True):
return next(self.get_many([key], read_data=read_data, cache=False))
def get(self, key, read_data=True, raise_missing=True):
return next(self.get_many([key], read_data=read_data, raise_missing=raise_missing, cache=False))
def get_many(self, keys, read_data=True, cache=True):
for key, data in zip(keys, self.repository.get_many(keys, read_data=read_data)):
def get_many(self, keys, read_data=True, raise_missing=True, cache=True):
for key, data in zip(keys, self.repository.get_many(keys, read_data=read_data, raise_missing=raise_missing)):
yield self.transform(key, data)
def log_instrumentation(self):
@ -1207,10 +1213,12 @@ class RepositoryCache(RepositoryNoCache):
self.cache.clear()
shutil.rmtree(self.basedir)
def get_many(self, keys, read_data=True, cache=True):
def get_many(self, keys, read_data=True, raise_missing=True, cache=True):
# It could use different cache keys depending on read_data and cache full vs. meta-only chunks.
unknown_keys = [key for key in keys if self.prefixed_key(key, complete=read_data) not in self.cache]
repository_iterator = zip(unknown_keys, self.repository.get_many(unknown_keys, read_data=read_data))
repository_iterator = zip(
unknown_keys, self.repository.get_many(unknown_keys, read_data=read_data, raise_missing=raise_missing)
)
for key in keys:
pkey = self.prefixed_key(key, complete=read_data)
if pkey in self.cache:
@ -1228,7 +1236,7 @@ class RepositoryCache(RepositoryNoCache):
else:
# slow path: eviction during this get_many removed this key from the cache
t0 = time.perf_counter()
data = self.repository.get(key, read_data=read_data)
data = self.repository.get(key, read_data=read_data, raise_missing=raise_missing)
self.slow_lat += time.perf_counter() - t0
transformed = self.add_entry(key, data, cache, complete=read_data)
self.slow_misses += 1

View file

@ -425,7 +425,7 @@ class Repository:
# note: do not collect the marker id
return result
def get(self, id, read_data=True):
def get(self, id, read_data=True, raise_missing=True):
self._lock_refresh()
id_hex = bin_to_hex(id)
key = "data/" + id_hex
@ -452,11 +452,14 @@ class Repository:
raise IntegrityError(f"Object too small [id {id_hex}]: expected {meta_size}, got {len(meta)} bytes")
return hdr + meta
except StoreObjectNotFound:
raise self.ObjectNotFound(id, str(self._location)) from None
if raise_missing:
raise self.ObjectNotFound(id, str(self._location)) from None
else:
return None
def get_many(self, ids, read_data=True, is_preloaded=False):
def get_many(self, ids, read_data=True, is_preloaded=False, raise_missing=True):
for id_ in ids:
yield self.get(id_, read_data=read_data)
yield self.get(id_, read_data=read_data, raise_missing=raise_missing)
def put(self, id, data, wait=True):
"""put a repo object

View file

@ -155,28 +155,19 @@ def test_missing_file_chunk(archivers, request):
else:
pytest.fail("should not happen") # convert 'fail'
cmd(archiver, "check", exit_code=1)
output = cmd(archiver, "check", exit_code=1)
assert "Missing file chunk detected" in output
output = cmd(archiver, "check", "--repair", exit_code=0)
assert "New missing file chunk detected" in output
assert "Missing file chunk detected" in output # repair is not changing anything, just reporting.
cmd(archiver, "check", exit_code=0)
output = cmd(archiver, "list", "archive1", "--format={health}#{path}{NL}", exit_code=0)
assert "broken#" in output
# check that the file in the old archives has now a different chunk list without the killed chunk.
# also check that the correct original chunks list is preserved in item.chunks_healthy.
# check does not modify the chunks list.
for archive_name in ("archive1", "archive2"):
archive, repository = open_archive(archiver.repository_path, archive_name)
with repository:
for item in archive.iter_items():
if item.path.endswith(src_file):
assert len(valid_chunks) == len(item.chunks)
assert killed_chunk not in item.chunks
assert valid_chunks != item.chunks
assert "chunks_healthy" in item
assert len(valid_chunks) == len(item.chunks_healthy)
assert killed_chunk in item.chunks_healthy
assert valid_chunks == item.chunks_healthy
assert valid_chunks == item.chunks
break
else:
pytest.fail("should not happen") # convert 'fail'
@ -185,32 +176,9 @@ def test_missing_file_chunk(archivers, request):
with patch.object(ChunkBuffer, "BUFFER_SIZE", 10):
create_src_archive(archiver, "archive3")
# check should be able to heal the file now:
# check should not complain anymore about missing chunks:
output = cmd(archiver, "check", "-v", "--repair", exit_code=0)
assert "Healed previously missing file chunk" in output
assert f"{src_file}: Completely healed previously damaged file!" in output
# check that the file in the old archives has the correct chunks again.
# also check that chunks_healthy list is removed as it is not needed any more.
for archive_name in ("archive1", "archive2"):
archive, repository = open_archive(archiver.repository_path, archive_name)
with repository:
for item in archive.iter_items():
if item.path.endswith(src_file):
assert valid_chunks == item.chunks
assert "chunks_healthy" not in item
break
else:
pytest.fail("should not happen")
# list is also all-healthy again
output = cmd(archiver, "list", "archive1", "--format={health}#{path}{NL}", exit_code=0)
assert "broken#" not in output
# check should be fine now (and not show it has healed anything).
output = cmd(archiver, "check", "-v", "--repair", exit_code=0)
assert "Healed previously missing file chunk" not in output
assert "testsuite/archiver.py: Completely healed previously damaged file!" not in output
assert "Missing file chunk detected" not in output
def test_missing_archive_item_chunk(archivers, request):
@ -425,13 +393,14 @@ def test_verify_data(archivers, request, init_args):
output = cmd(archiver, "check", "--archives-only", "--verify-data", exit_code=1)
assert f"{bin_to_hex(chunk.id)}, integrity error" in output
# repair (heal is tested in another test)
# repair will find the defect chunk and remove it
output = cmd(archiver, "check", "--repair", "--verify-data", exit_code=0)
assert f"{bin_to_hex(chunk.id)}, integrity error" in output
assert f"{src_file}: New missing file chunk detected" in output
assert f"{src_file}: Missing file chunk detected" in output
# run with --verify-data again, all fine now (file was patched with a replacement chunk).
cmd(archiver, "check", "--archives-only", "--verify-data", exit_code=0)
# run with --verify-data again, it will notice the missing chunk.
output = cmd(archiver, "check", "--archives-only", "--verify-data", exit_code=1)
assert f"{src_file}: Missing file chunk detected" in output
@pytest.mark.parametrize("init_args", [["--encryption=repokey-aes-ocb"], ["--encryption", "none"]])
@ -457,13 +426,15 @@ def test_corrupted_file_chunk(archivers, request, init_args):
output = cmd(archiver, "check", "--repository-only", exit_code=1)
assert f"{bin_to_hex(chunk.id)} is corrupted: data does not match checksum." in output
# repair (heal is tested in another test)
# repair: the defect chunk will be removed by repair.
output = cmd(archiver, "check", "--repair", exit_code=0)
assert f"{bin_to_hex(chunk.id)} is corrupted: data does not match checksum." in output
assert f"{src_file}: New missing file chunk detected" in output
assert f"{src_file}: Missing file chunk detected" in output
# run normal check again, all fine now (file was patched with a replacement chunk).
# run normal check again
cmd(archiver, "check", "--repository-only", exit_code=0)
output = cmd(archiver, "check", "--archives-only", exit_code=1)
assert f"{src_file}: Missing file chunk detected" in output
def test_empty_repository(archivers, request):

View file

@ -9,7 +9,7 @@ import pytest
from ... import xattr
from ...chunker import has_seek_hole
from ...constants import * # NOQA
from ...helpers import EXIT_WARNING, BackupPermissionError
from ...helpers import EXIT_WARNING, BackupPermissionError, bin_to_hex
from ...helpers import flags_noatime, flags_normal
from .. import changedir, same_ts_ns
from .. import are_symlinks_supported, are_hardlinks_supported, is_utime_fully_supported, is_birthtime_fully_supported
@ -24,6 +24,9 @@ from . import (
_extract_hardlinks_setup,
assert_creates_file,
generate_archiver_tests,
create_src_archive,
open_archive,
src_file,
)
pytest_generate_tests = lambda metafunc: generate_archiver_tests(metafunc, kinds="local,remote,binary") # NOQA
@ -737,3 +740,22 @@ def test_dry_run_extraction_flags(archivers, request):
print(output)
assert not os.listdir("output"), "Output directory should be empty after dry-run"
def test_extract_file_with_missing_chunk(archivers, request):
archiver = request.getfixturevalue(archivers)
cmd(archiver, "repo-create", RK_ENCRYPTION)
create_src_archive(archiver, "archive")
# Get rid of a chunk
archive, repository = open_archive(archiver.repository_path, "archive")
with repository:
for item in archive.iter_items():
if item.path.endswith(src_file):
chunk = item.chunks[-1]
repository.delete(chunk.id)
break
else:
assert False # missed the file
output = cmd(archiver, "extract", "archive")
# TODO: this is a bit dirty still: no warning/error rc, no filename output for the damaged file.
assert f"repository object {bin_to_hex(chunk.id)} missing, returning {chunk.size} zero bytes." in output

View file

@ -233,15 +233,19 @@ def test_fuse_allow_damaged_files(archivers, request):
break
else:
assert False # missed the file
cmd(archiver, "check", "--repair", exit_code=0)
mountpoint = os.path.join(archiver.tmpdir, "mountpoint")
with fuse_mount(archiver, mountpoint, "-a", "archive"):
with pytest.raises(OSError) as excinfo:
open(os.path.join(mountpoint, "archive", path))
assert excinfo.value.errno == errno.EIO
with open(os.path.join(mountpoint, "archive", path), "rb") as f:
with pytest.raises(OSError) as excinfo:
f.read()
assert excinfo.value.errno == errno.EIO
with fuse_mount(archiver, mountpoint, "-a", "archive", "-o", "allow_damaged_files"):
open(os.path.join(mountpoint, "archive", path)).close()
with open(os.path.join(mountpoint, "archive", path), "rb") as f:
# no exception raised, missing data will be all-zero
data = f.read()
assert data.endswith(b"\0\0")
@pytest.mark.skipif(not llfuse, reason="llfuse not installed")

View file

@ -164,7 +164,6 @@ def test_transfer_upgrade(archivers, request):
# fix expectation for size
e["size"] = g["size"]
# Note: size == 0 for all items without a size or chunks list (like e.g. directories)
# Note: healthy == True indicates the *absence* of the additional chunks_healthy list
del g["hlid"]
# borg 1 used "linktarget" and "source" for links, borg 2 uses "target" for symlinks.
@ -177,6 +176,9 @@ def test_transfer_upgrade(archivers, request):
# The S_IFBLK macro is broken on MINGW
del e["type"], g["type"]
del e["mode"], g["mode"]
del e["healthy"] # not supported anymore
assert g == e
if name == "archive1":

View file

@ -48,7 +48,7 @@ class UpgraderFrom12To20:
def new_archive(self, *, archive):
self.archive = archive
self.hlm = HardLinkManager(id_type=bytes, info_type=tuple) # hlid -> (chunks, chunks_healthy)
self.hlm = HardLinkManager(id_type=bytes, info_type=list) # hlid -> chunks_correct
def upgrade_item(self, *, item):
"""upgrade item as needed, get rid of legacy crap"""
@ -56,7 +56,6 @@ class UpgraderFrom12To20:
"path",
"rdev",
"chunks",
"chunks_healthy",
"hlid",
"mode",
"user",
@ -78,16 +77,14 @@ class UpgraderFrom12To20:
if self.hlm.borg1_hardlink_master(item):
item.hlid = hlid = self.hlm.hardlink_id_from_path(item.path)
self.hlm.remember(id=hlid, info=(item.get("chunks"), item.get("chunks_healthy")))
self.hlm.remember(id=hlid, info=item.get("chunks"))
elif self.hlm.borg1_hardlink_slave(item):
item.hlid = hlid = self.hlm.hardlink_id_from_path(item.source)
chunks, chunks_healthy = self.hlm.retrieve(id=hlid, default=(None, None))
chunks = self.hlm.retrieve(id=hlid)
if chunks is not None:
item.chunks = chunks
for chunk_id, chunk_size in chunks:
self.cache.reuse_chunk(chunk_id, chunk_size, self.archive.stats)
if chunks_healthy is not None:
item.chunks_healthy = chunks
del item.source # not used for hardlinks any more, replaced by hlid
# make sure we only have desired stuff in the new item. specifically, make sure to get rid of:
# - 'acl' remnants of bug in attic <= 0.13