From 29ebdbadae91d4573ea30c878d5e540993fb244f Mon Sep 17 00:00:00 2001 From: Marian Beermann Date: Tue, 12 Apr 2016 00:10:44 +0200 Subject: [PATCH] refcounting: use uint32_t, protect against overflows, fix merging for BE --- borg/_hashindex.c | 22 ----- borg/archive.py | 3 +- borg/cache.py | 8 +- borg/hashindex.pyx | 162 +++++++++++++++++++++++++++++------- borg/testsuite/hashindex.py | 162 ++++++++++++++++++++++++++++++++++++ 5 files changed, 300 insertions(+), 57 deletions(-) diff --git a/borg/_hashindex.c b/borg/_hashindex.c index f244aeaa0..4599fe0fa 100644 --- a/borg/_hashindex.c +++ b/borg/_hashindex.c @@ -439,25 +439,3 @@ hashindex_get_size(HashIndex *index) { return index->num_entries; } - -static void -hashindex_add(HashIndex *index, const void *key, int32_t *other_values) -{ - int32_t *my_values = (int32_t *)hashindex_get(index, key); - if(my_values == NULL) { - hashindex_set(index, key, other_values); - } else { - *my_values += *other_values; - } -} - -static void -hashindex_merge(HashIndex *index, HashIndex *other) -{ - int32_t key_size = index->key_size; - void *key = NULL; - - while((key = hashindex_next_key(other, key))) { - hashindex_add(index, key, key + key_size); - } -} diff --git a/borg/archive.py b/borg/archive.py index 83308e66d..167cc4711 100644 --- a/borg/archive.py +++ b/borg/archive.py @@ -743,8 +743,7 @@ class ArchiveChecker: def add_reference(id_, size, csize, cdata=None): try: - count, _, _ = self.chunks[id_] - self.chunks[id_] = count + 1, size, csize + self.chunks.incref(id_) except KeyError: assert cdata is not None self.chunks[id_] = 1, size, csize diff --git a/borg/cache.py b/borg/cache.py index 843903f3c..27826b46e 100644 --- a/borg/cache.py +++ b/borg/cache.py @@ -379,21 +379,19 @@ Chunk index: {0.total_unique_chunks:20d} {0.total_chunks:20d}""" def chunk_incref(self, id, stats): if not self.txn_active: self.begin_txn() - count, size, csize = self.chunks[id] - self.chunks[id] = (count + 1, size, csize) + count, size, csize = self.chunks.incref(id) stats.update(size, csize, False) return id, size, csize def chunk_decref(self, id, stats): if not self.txn_active: self.begin_txn() - count, size, csize = self.chunks[id] - if count == 1: + count, size, csize = self.chunks.decref(id) + if count == 0: del self.chunks[id] self.repository.delete(id, wait=False) stats.update(-size, -csize, True) else: - self.chunks[id] = (count - 1, size, csize) stats.update(-size, -csize, False) def file_known_and_unchanged(self, path_hash, st, ignore_inode=False): diff --git a/borg/hashindex.pyx b/borg/hashindex.pyx index 890e83a3f..fd1297ad6 100644 --- a/borg/hashindex.pyx +++ b/borg/hashindex.pyx @@ -1,6 +1,9 @@ # -*- coding: utf-8 -*- import os +cimport cython +from libc.stdint cimport uint32_t, UINT32_MAX, uint64_t + API_VERSION = 2 @@ -19,13 +22,34 @@ cdef extern from "_hashindex.c": void *hashindex_next_key(HashIndex *index, void *key) int hashindex_delete(HashIndex *index, void *key) int hashindex_set(HashIndex *index, void *key, void *value) - int _htole32(int v) - int _le32toh(int v) + uint32_t _htole32(uint32_t v) + uint32_t _le32toh(uint32_t v) cdef _NoDefault = object() -cimport cython +""" +The HashIndex is *not* a general purpose data structure. The value size must be at least 4 bytes, and these +first bytes are used for in-band signalling in the data structure itself. + +The constant MAX_VALUE defines the valid range for these 4 bytes when interpreted as an uint32_t from 0 +to MAX_VALUE (inclusive). The following reserved values beyond MAX_VALUE are currently in use +(byte order is LE):: + + 0xffffffff marks empty entries in the hashtable + 0xfffffffe marks deleted entries in the hashtable + +None of the publicly available classes in this module will accept nor return a reserved value; +AssertionError is raised instead. +""" + +assert UINT32_MAX == 2**32-1 + +# module-level constant because cdef's in classes can't have default values +cdef uint32_t _MAX_VALUE = 2**32-1025 +MAX_VALUE = _MAX_VALUE + +assert _MAX_VALUE % 2 == 1 @cython.internal cdef class IndexBase: @@ -98,22 +122,30 @@ cdef class NSIndex(IndexBase): def __getitem__(self, key): assert len(key) == self.key_size - data = hashindex_get(self.index, key) + data = hashindex_get(self.index, key) if not data: - raise KeyError - return _le32toh(data[0]), _le32toh(data[1]) + raise KeyError(key) + cdef uint32_t segment = _le32toh(data[0]) + assert segment <= _MAX_VALUE, "maximum number of segments reached" + return segment, _le32toh(data[1]) def __setitem__(self, key, value): assert len(key) == self.key_size - cdef int[2] data - data[0] = _htole32(value[0]) + cdef uint32_t[2] data + cdef uint32_t segment = value[0] + assert segment <= _MAX_VALUE, "maximum number of segments reached" + data[0] = _htole32(segment) data[1] = _htole32(value[1]) if not hashindex_set(self.index, key, data): raise Exception('hashindex_set failed') def __contains__(self, key): + cdef uint32_t segment assert len(key) == self.key_size - data = hashindex_get(self.index, key) + data = hashindex_get(self.index, key) + if data != NULL: + segment = _le32toh(data[0]) + assert segment <= _MAX_VALUE, "maximum number of segments reached" return data != NULL def iteritems(self, marker=None): @@ -146,25 +178,46 @@ cdef class NSKeyIterator: self.key = hashindex_next_key(self.index, self.key) if not self.key: raise StopIteration - cdef int *value = (self.key + self.key_size) - return (self.key)[:self.key_size], (_le32toh(value[0]), _le32toh(value[1])) + cdef uint32_t *value = (self.key + self.key_size) + cdef uint32_t segment = _le32toh(value[0]) + assert segment <= _MAX_VALUE, "maximum number of segments reached" + return (self.key)[:self.key_size], (segment, _le32toh(value[1])) cdef class ChunkIndex(IndexBase): + """ + Mapping of 32 byte keys to (refcount, size, csize), which are all 32-bit unsigned. + + The reference count cannot overflow. If an overflow would occur, the refcount + is fixed to MAX_VALUE and will neither increase nor decrease by incref(), decref() + or add(). + + Prior signed 32-bit overflow is handled correctly for most cases: All values + from UINT32_MAX (2**32-1, inclusive) to MAX_VALUE (exclusive) are reserved and either + cause silent data loss (-1, -2) or will raise an AssertionError when accessed. + Other values are handled correctly. Note that previously the refcount could also reach + 0 by *increasing* it. + + Assigning refcounts in this reserved range is an invalid operation and raises AssertionError. + """ value_size = 12 def __getitem__(self, key): assert len(key) == self.key_size - data = hashindex_get(self.index, key) + data = hashindex_get(self.index, key) if not data: - raise KeyError - return _le32toh(data[0]), _le32toh(data[1]), _le32toh(data[2]) + raise KeyError(key) + cdef uint32_t refcount = _le32toh(data[0]) + assert refcount <= _MAX_VALUE + return refcount, _le32toh(data[1]), _le32toh(data[2]) def __setitem__(self, key, value): assert len(key) == self.key_size - cdef int[3] data - data[0] = _htole32(value[0]) + cdef uint32_t[3] data + cdef uint32_t refcount = value[0] + assert refcount <= _MAX_VALUE, "invalid reference count" + data[0] = _htole32(refcount) data[1] = _htole32(value[1]) data[2] = _htole32(value[2]) if not hashindex_set(self.index, key, data): @@ -172,9 +225,38 @@ cdef class ChunkIndex(IndexBase): def __contains__(self, key): assert len(key) == self.key_size - data = hashindex_get(self.index, key) + data = hashindex_get(self.index, key) + if data != NULL: + assert data[0] <= _MAX_VALUE return data != NULL + def incref(self, key): + """Increase refcount for 'key', return (refcount, size, csize)""" + assert len(key) == self.key_size + data = hashindex_get(self.index, key) + if not data: + raise KeyError(key) + cdef uint32_t refcount = _le32toh(data[0]) + assert refcount <= _MAX_VALUE, "invalid reference count" + if refcount != _MAX_VALUE: + refcount += 1 + data[0] = _htole32(refcount) + return refcount, _le32toh(data[1]), _le32toh(data[2]) + + def decref(self, key): + """Decrease refcount for 'key', return (refcount, size, csize)""" + assert len(key) == self.key_size + data = hashindex_get(self.index, key) + if not data: + raise KeyError(key) + cdef uint32_t refcount = _le32toh(data[0]) + # Never decrease a reference count of zero + assert 0 < refcount <= _MAX_VALUE, "invalid reference count" + if refcount != _MAX_VALUE: + refcount -= 1 + data[0] = _htole32(refcount) + return refcount, _le32toh(data[1]), _le32toh(data[2]) + def iteritems(self, marker=None): cdef const void *key iter = ChunkKeyIterator(self.key_size) @@ -188,8 +270,9 @@ cdef class ChunkIndex(IndexBase): return iter def summarize(self): - cdef long long size = 0, csize = 0, unique_size = 0, unique_csize = 0, chunks = 0, unique_chunks = 0 - cdef int *values + cdef uint64_t size = 0, csize = 0, unique_size = 0, unique_csize = 0, chunks = 0, unique_chunks = 0 + cdef uint32_t *values + cdef uint32_t refcount cdef void *key = NULL while True: @@ -197,25 +280,46 @@ cdef class ChunkIndex(IndexBase): if not key: break unique_chunks += 1 - values = (key + self.key_size) - chunks += _le32toh(values[0]) + values = (key + self.key_size) + refcount = _le32toh(values[0]) + assert refcount <= MAX_VALUE, "invalid reference count" + chunks += refcount unique_size += _le32toh(values[1]) unique_csize += _le32toh(values[2]) - size += _le32toh(values[1]) * _le32toh(values[0]) - csize += _le32toh(values[2]) * _le32toh(values[0]) + size += _le32toh(values[1]) * _le32toh(values[0]) + csize += _le32toh(values[2]) * _le32toh(values[0]) return size, csize, unique_size, unique_csize, unique_chunks, chunks def add(self, key, refs, size, csize): assert len(key) == self.key_size - cdef int[3] data + cdef uint32_t[3] data data[0] = _htole32(refs) data[1] = _htole32(size) data[2] = _htole32(csize) - hashindex_add(self.index, key, data) + self._add( key, data) + + cdef _add(self, void *key, uint32_t *data): + cdef uint64_t refcount1, refcount2, result64 + values = hashindex_get(self.index, key) + if values: + refcount1 = _le32toh(values[0]) + refcount2 = _le32toh(data[0]) + assert refcount1 <= _MAX_VALUE + assert refcount2 <= _MAX_VALUE + result64 = refcount1 + refcount2 + values[0] = _htole32(min(result64, _MAX_VALUE)) + else: + hashindex_set(self.index, key, data) def merge(self, ChunkIndex other): - hashindex_merge(self.index, other.index) + cdef void *key = NULL + + while True: + key = hashindex_next_key(other.index, key) + if not key: + break + self._add(key, (key + self.key_size)) cdef class ChunkKeyIterator: @@ -235,5 +339,7 @@ cdef class ChunkKeyIterator: self.key = hashindex_next_key(self.index, self.key) if not self.key: raise StopIteration - cdef int *value = (self.key + self.key_size) - return (self.key)[:self.key_size], (_le32toh(value[0]), _le32toh(value[1]), _le32toh(value[2])) + cdef uint32_t *value = (self.key + self.key_size) + cdef uint32_t refcount = _le32toh(value[0]) + assert refcount <= MAX_VALUE, "invalid reference count" + return (self.key)[:self.key_size], (refcount, _le32toh(value[1]), _le32toh(value[2])) diff --git a/borg/testsuite/hashindex.py b/borg/testsuite/hashindex.py index 019f7eea9..fba146952 100644 --- a/borg/testsuite/hashindex.py +++ b/borg/testsuite/hashindex.py @@ -1,8 +1,13 @@ +import base64 import hashlib import os +import struct import tempfile +import zlib +import pytest from ..hashindex import NSIndex, ChunkIndex +from .. import hashindex from . import BaseTestCase @@ -114,3 +119,160 @@ class HashIndexTestCase(BaseTestCase): assert unique_csize == 100 + 200 + 300 assert chunks == 1 + 2 + 3 assert unique_chunks == 3 + + +class HashIndexRefcountingTestCase(BaseTestCase): + def test_chunkindex_limit(self): + idx = ChunkIndex() + idx[H(1)] = hashindex.MAX_VALUE - 1, 1, 2 + + # 5 is arbitray, any number of incref/decrefs shouldn't move it once it's limited + for i in range(5): + # first incref to move it to the limit + refcount, *_ = idx.incref(H(1)) + assert refcount == hashindex.MAX_VALUE + for i in range(5): + refcount, *_ = idx.decref(H(1)) + assert refcount == hashindex.MAX_VALUE + + def _merge(self, refcounta, refcountb): + def merge(refcount1, refcount2): + idx1 = ChunkIndex() + idx1[H(1)] = refcount1, 1, 2 + idx2 = ChunkIndex() + idx2[H(1)] = refcount2, 1, 2 + idx1.merge(idx2) + refcount, *_ = idx1[H(1)] + return refcount + result = merge(refcounta, refcountb) + # check for commutativity + assert result == merge(refcountb, refcounta) + return result + + def test_chunkindex_merge_limit1(self): + # Check that it does *not* limit at MAX_VALUE - 1 + # (MAX_VALUE is odd) + half = hashindex.MAX_VALUE // 2 + assert self._merge(half, half) == hashindex.MAX_VALUE - 1 + + def test_chunkindex_merge_limit2(self): + # 3000000000 + 2000000000 > MAX_VALUE + assert self._merge(3000000000, 2000000000) == hashindex.MAX_VALUE + + def test_chunkindex_merge_limit3(self): + # Crossover point: both addition and limit semantics will yield the same result + half = hashindex.MAX_VALUE // 2 + assert self._merge(half + 1, half) == hashindex.MAX_VALUE + + def test_chunkindex_merge_limit4(self): + # Beyond crossover, result of addition would be 2**31 + half = hashindex.MAX_VALUE // 2 + assert self._merge(half + 2, half) == hashindex.MAX_VALUE + assert self._merge(half + 1, half + 1) == hashindex.MAX_VALUE + + def test_chunkindex_add(self): + idx1 = ChunkIndex() + idx1.add(H(1), 5, 6, 7) + assert idx1[H(1)] == (5, 6, 7) + idx1.add(H(1), 1, 0, 0) + assert idx1[H(1)] == (6, 6, 7) + + def test_incref_limit(self): + idx1 = ChunkIndex() + idx1[H(1)] = (hashindex.MAX_VALUE, 6, 7) + idx1.incref(H(1)) + refcount, *_ = idx1[H(1)] + assert refcount == hashindex.MAX_VALUE + + def test_decref_limit(self): + idx1 = ChunkIndex() + idx1[H(1)] = hashindex.MAX_VALUE, 6, 7 + idx1.decref(H(1)) + refcount, *_ = idx1[H(1)] + assert refcount == hashindex.MAX_VALUE + + def test_decref_zero(self): + idx1 = ChunkIndex() + idx1[H(1)] = 0, 0, 0 + with pytest.raises(AssertionError): + idx1.decref(H(1)) + + def test_incref_decref(self): + idx1 = ChunkIndex() + idx1.add(H(1), 5, 6, 7) + assert idx1[H(1)] == (5, 6, 7) + idx1.incref(H(1)) + assert idx1[H(1)] == (6, 6, 7) + idx1.decref(H(1)) + assert idx1[H(1)] == (5, 6, 7) + + def test_setitem_raises(self): + idx1 = ChunkIndex() + with pytest.raises(AssertionError): + idx1[H(1)] = hashindex.MAX_VALUE + 1, 0, 0 + + def test_keyerror(self): + idx = ChunkIndex() + with pytest.raises(KeyError): + idx.incref(H(1)) + with pytest.raises(KeyError): + idx.decref(H(1)) + with pytest.raises(KeyError): + idx[H(1)] + with pytest.raises(OverflowError): + idx.add(H(1), -1, 0, 0) + + +class HashIndexDataTestCase(BaseTestCase): + # This bytestring was created with 1.0-maint at c2f9533 + HASHINDEX = b'eJzt0L0NgmAUhtHLT0LDEI6AuAEhMVYmVnSuYefC7AB3Aj9KNedJbnfyFne6P67P27w0EdG1Eac+Cm1ZybAsy7Isy7Isy7Isy7I' \ + b'sy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7Isy7LsL9nhc+cqTZ' \ + b'3XlO2Ys++Du5fX+l1/YFmWZVmWZVmWZVmWZVmWZVmWZVmWZVmWZVmWZVmWZVmWZVmWZVmWZVmWZVmWZVmWZVn2/+0O2rYccw==' + + def _serialize_hashindex(self, idx): + with tempfile.TemporaryDirectory() as tempdir: + file = os.path.join(tempdir, 'idx') + idx.write(file) + with open(file, 'rb') as f: + return self._pack(f.read()) + + def _deserialize_hashindex(self, bytestring): + with tempfile.TemporaryDirectory() as tempdir: + file = os.path.join(tempdir, 'idx') + with open(file, 'wb') as f: + f.write(self._unpack(bytestring)) + return ChunkIndex.read(file) + + def _pack(self, bytestring): + return base64.b64encode(zlib.compress(bytestring)) + + def _unpack(self, bytestring): + return zlib.decompress(base64.b64decode(bytestring)) + + def test_identical_creation(self): + idx1 = ChunkIndex() + idx1[H(1)] = 1, 2, 3 + idx1[H(2)] = 2**31 - 1, 0, 0 + idx1[H(3)] = 4294962296, 0, 0 # 4294962296 is -5000 interpreted as an uint32_t + + assert self._serialize_hashindex(idx1) == self.HASHINDEX + + def test_read_known_good(self): + idx1 = self._deserialize_hashindex(self.HASHINDEX) + assert idx1[H(1)] == (1, 2, 3) + assert idx1[H(2)] == (2**31 - 1, 0, 0) + assert idx1[H(3)] == (4294962296, 0, 0) + + idx2 = ChunkIndex() + idx2[H(3)] = 2**32 - 123456, 6, 7 + idx1.merge(idx2) + assert idx1[H(3)] == (hashindex.MAX_VALUE, 0, 0) + + +def test_nsindex_segment_limit(): + idx = NSIndex() + with pytest.raises(AssertionError): + idx[H(1)] = hashindex.MAX_VALUE + 1, 0 + assert H(1) not in idx + idx[H(2)] = hashindex.MAX_VALUE, 0 + assert H(2) in idx