diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4c2a5fde9..c0f36aa36 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -703,57 +703,3 @@ jobs: report_type: coverage env_vars: OS,python files: coverage.xml - - sha256_pack_id_tests: - name: sha256 pack-id (informational) - needs: [lint] - runs-on: ubuntu-24.04 - timeout-minutes: 90 - continue-on-error: true - concurrency: - group: sha256-pack-id-${{ github.head_ref || github.ref }} - cancel-in-progress: false - - steps: - - uses: actions/checkout@v6 - with: - fetch-depth: 0 - fetch-tags: true - - - name: Set up Python 3.12 - uses: actions/setup-python@v6 - with: - python-version: "3.12" - - - name: Cache pip - uses: actions/cache@v5 - with: - path: ~/.cache/pip - key: ${{ runner.os }}-${{ runner.arch }}-pip-sha256-pack-id-${{ hashFiles('requirements.d/development.lock.txt') }} - restore-keys: | - ${{ runner.os }}-${{ runner.arch }}-pip- - - - name: Cache tox environments - uses: actions/cache@v5 - with: - path: .tox - key: ${{ runner.os }}-${{ runner.arch }}-tox-sha256-pack-id-${{ hashFiles('requirements.d/development.lock.txt', 'pyproject.toml') }} - restore-keys: | - ${{ runner.os }}-${{ runner.arch }}-tox-sha256-pack-id- - - - name: Install Linux packages - run: | - sudo apt-get update - sudo apt-get install -y pkg-config build-essential - sudo apt-get install -y libssl-dev libacl1-dev liblz4-dev - - - name: Install Python requirements - run: | - python -m pip install --upgrade pip setuptools wheel - pip install -r requirements.d/development.lock.txt - - - name: Install borgbackup - run: pip install -ve ".[cockpit,s3,sftp,rclone]" - - - name: Run tests with sha256 pack-ids - run: tox -e sha256-pack-id diff --git a/docs/internals/packs.rst b/docs/internals/packs.rst index f5f555a3b..211e1cd93 100644 --- a/docs/internals/packs.rst +++ b/docs/internals/packs.rst @@ -71,12 +71,13 @@ Blobs follow one another contiguously with no padding:: Pack ID ~~~~~~~ -The pack ID equals the ``chunk_id`` of the blob it contains:: +The pack ID is the SHA-256 of the pack file's bytes:: - pack_id = chunk_id + pack_id = sha256(pack_bytes) -Since ``chunk_id`` is the ID hash of the plaintext, the filename commits to the -content. ``borg check`` can detect silent corruption without decrypting any blob. +Content-addressing the file by its own bytes makes the name commit to the +content, so borgstore can verify and cache it and ``borg check`` can detect +silent corruption of the stored file. Namespace ~~~~~~~~~ diff --git a/pyproject.toml b/pyproject.toml index c9e0ec8cc..683a46372 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -257,11 +257,6 @@ extras = ["pyfuse3", "sftp", "s3", "rclone"] set_env = {BORG_FUSE_IMPL = "mfusepy"} extras = ["mfusepy", "sftp", "s3", "rclone"] -# Informational env: forces sha256 pack_ids even with max_count=1 to expose -# code that still assumes pack_id == chunk_id. Run: tox -e sha256-pack-id -[tool.tox.env.sha256-pack-id] -set_env = {BORG_TESTONLY_SHA256_PACK_ID = "1"} - [tool.tox.env.ruff] skip_install = true deps = ["ruff"] diff --git a/src/borg/repository.py b/src/borg/repository.py index f44c5fd37..f6c10f731 100644 --- a/src/borg/repository.py +++ b/src/borg/repository.py @@ -115,9 +115,8 @@ class PackWriter: uses that repository's single, authoritative index (see the chunks property), so there is never a second copy to keep in sync. Unit tests pass an explicit index. - At max_count=1 (N=1 phase) each put() maps exactly one chunk to one pack. - Raising max_count later (N>1 phase) enables real packing without touching - this class's interface. + max_count bounds how many chunks a pack accumulates before flush() writes it. + Raising it produces larger packs without changing this class's interface. """ def __init__(self, store, *, max_count=1, chunks=None, repository=None): @@ -170,9 +169,8 @@ class PackWriter: # that incremental string concatenation would cause in Python). pack_data = b"".join(cdata for _, cdata in self._pieces) - # Name the pack by the hash of its bytes (content-addressing), independent of how many - # chunks it holds or what their ids are. This is why a single-chunk pack's name is not its - # chunk_id: the pack and the chunk are different objects with different identities. + # Name the pack by the SHA-256 of its bytes: the name commits to the stored content, + # so borgstore can verify and cache the file. pack_id = sha256(pack_data).digest() # Record (chunk_id, pack_id, obj_offset, obj_size) for every piece. @@ -659,9 +657,8 @@ class Repository: # add all existing objects to the index. # borg check: the index may have corrupted objects (we did not delete them) # borg check --repair: the index will only have non-corrupted objects. - # the pack file name is the pack_id (sha256(pack_bytes)), which is not the - # chunk_id, so recover each object's real (chunk_id, offset, size) from its - # on-disk header rather than assuming pack file name == chunk_id. + # the pack file name is the pack_id; each object's chunk_id, offset and size + # come from its on-disk header, so scan the headers to rebuild the index. pack_id = hex_to_bin(info.name) for chunk_id, obj_offset, obj_size in RepoObj.iter_object_headers(pack): chunks[chunk_id] = ChunkIndexEntry( diff --git a/src/borg/testsuite/repository_test.py b/src/borg/testsuite/repository_test.py index 3f7a683b3..e5e368c7e 100644 --- a/src/borg/testsuite/repository_test.py +++ b/src/borg/testsuite/repository_test.py @@ -223,7 +223,6 @@ def test_pack_writer_n1_flush(): stored_id, pack_id, obj_offset, obj_size = results[0] assert stored_id == chunk_id assert pack_id == sha256(cdata).digest() - assert pack_id != chunk_id assert obj_offset == 0 assert obj_size == len(cdata) @@ -343,7 +342,6 @@ def test_put_marks_id_in_chunk_index(tmp_path): entry = repository._chunks.get(id1) assert entry is not None assert entry.pack_id == sha256(fchunk(b"ZEROS")).digest() - assert entry.pack_id != id1 assert entry.size == 0 # uncompressed size filled in by cache layer