diff --git a/modules/vector-sets/hnsw.c b/modules/vector-sets/hnsw.c index 2b4ebc0e9..05b53dd03 100644 --- a/modules/vector-sets/hnsw.c +++ b/modules/vector-sets/hnsw.c @@ -64,8 +64,8 @@ #endif #if defined (HAVE_AVX512) -#define ATTRIBUTE_TARGET_AVX512 __attribute__((target("avx512f,fma"))) -#define VSET_USE_AVX512 (__builtin_cpu_supports("avx512f")) +#define ATTRIBUTE_TARGET_AVX512 __attribute__((target("avx512f,avx512bw,fma"))) +#define VSET_USE_AVX512 (__builtin_cpu_supports("avx512f") && __builtin_cpu_supports("avx512bw")) #else #define ATTRIBUTE_TARGET_AVX512 #define VSET_USE_AVX512 0 @@ -347,8 +347,155 @@ float vectors_distance_float(const float *x, const float *y, uint32_t dim) { } /* Q8 quants dotproduct. We do integer math and later fix it by range. */ +#if defined(HAVE_AVX512) +/* AVX512 optimized dot product for Q8 vectors */ +ATTRIBUTE_TARGET_AVX512 +float vectors_distance_q8_avx512(const int8_t *x, const int8_t *y, uint32_t dim, + float range_a, float range_b) { + // Handle zero vectors special case. + if (range_a == 0 || range_b == 0) { + return 1.0f; + } + + const float scale_product = (range_a/127) * (range_b/127); + __m512i sum = _mm512_setzero_si512(); + uint32_t i; + + /* Process 64 int8 elements at a time with AVX512 */ + for (i = 0; i + 63 < dim; i += 64) { + /* Load 64 int8 values */ + __m512i vx = _mm512_loadu_si512((__m512i*)&x[i]); + __m512i vy = _mm512_loadu_si512((__m512i*)&y[i]); + + /* Unpack and multiply-add in 32-bit precision + * This is done in two steps: lower 32 bytes and upper 32 bytes */ + + /* Process lower 32 bytes (256 bits) */ + __m256i vx_lo = _mm512_extracti64x4_epi64(vx, 0); + __m256i vy_lo = _mm512_extracti64x4_epi64(vy, 0); + + /* Extend int8 to int16 */ + __m512i vx_lo_16 = _mm512_cvtepi8_epi16(vx_lo); + __m512i vy_lo_16 = _mm512_cvtepi8_epi16(vy_lo); + + /* Multiply and accumulate to int32 */ + __m512i prod_lo = _mm512_madd_epi16(vx_lo_16, vy_lo_16); + sum = _mm512_add_epi32(sum, prod_lo); + + /* Process upper 32 bytes (256 bits) */ + __m256i vx_hi = _mm512_extracti64x4_epi64(vx, 1); + __m256i vy_hi = _mm512_extracti64x4_epi64(vy, 1); + + __m512i vx_hi_16 = _mm512_cvtepi8_epi16(vx_hi); + __m512i vy_hi_16 = _mm512_cvtepi8_epi16(vy_hi); + + __m512i prod_hi = _mm512_madd_epi16(vx_hi_16, vy_hi_16); + sum = _mm512_add_epi32(sum, prod_hi); + } + + /* Horizontal sum of the 16 int32 elements in sum */ + int32_t dot = _mm512_reduce_add_epi32(sum); + + /* Handle remaining elements */ + for (; i < dim; i++) { + dot += ((int32_t)x[i]) * ((int32_t)y[i]); + } + + /* Convert to original range */ + float dotf = dot * scale_product; + float distance = 1.0f - dotf; + + /* Clamp distance to [0, 2] */ + if (distance < 0) distance = 0; + else if (distance > 2) distance = 2; + return distance; +} +#endif /* HAVE_AVX512 */ + +#if defined(HAVE_AVX2) +/* AVX2 optimized dot product for Q8 vectors */ +ATTRIBUTE_TARGET_AVX2 +float vectors_distance_q8_avx2(const int8_t *x, const int8_t *y, uint32_t dim, + float range_a, float range_b) { + // Handle zero vectors special case. + if (range_a == 0 || range_b == 0) { + return 1.0f; + } + + const float scale_product = (range_a/127) * (range_b/127); + __m256i sum = _mm256_setzero_si256(); + uint32_t i; + + /* Process 32 int8 elements at a time with AVX2 */ + for (i = 0; i + 31 < dim; i += 32) { + /* Load 32 int8 values */ + __m256i vx = _mm256_loadu_si256((__m256i*)&x[i]); + __m256i vy = _mm256_loadu_si256((__m256i*)&y[i]); + + /* Split into lower and upper 16 bytes */ + __m128i vx_lo = _mm256_extracti128_si256(vx, 0); + __m128i vy_lo = _mm256_extracti128_si256(vy, 0); + __m128i vx_hi = _mm256_extracti128_si256(vx, 1); + __m128i vy_hi = _mm256_extracti128_si256(vy, 1); + + /* Extend int8 to int16 for lower half */ + __m256i vx_lo_16 = _mm256_cvtepi8_epi16(vx_lo); + __m256i vy_lo_16 = _mm256_cvtepi8_epi16(vy_lo); + + /* Multiply and accumulate (madd does multiply adjacent pairs and add) */ + __m256i prod_lo = _mm256_madd_epi16(vx_lo_16, vy_lo_16); + sum = _mm256_add_epi32(sum, prod_lo); + + /* Extend int8 to int16 for upper half */ + __m256i vx_hi_16 = _mm256_cvtepi8_epi16(vx_hi); + __m256i vy_hi_16 = _mm256_cvtepi8_epi16(vy_hi); + + __m256i prod_hi = _mm256_madd_epi16(vx_hi_16, vy_hi_16); + sum = _mm256_add_epi32(sum, prod_hi); + } + + /* Horizontal sum of the 8 int32 elements in sum */ + __m128i sum_hi = _mm256_extracti128_si256(sum, 1); + __m128i sum_lo = _mm256_castsi256_si128(sum); + __m128i sum_128 = _mm_add_epi32(sum_hi, sum_lo); + + sum_128 = _mm_hadd_epi32(sum_128, sum_128); + sum_128 = _mm_hadd_epi32(sum_128, sum_128); + + int32_t dot = _mm_cvtsi128_si32(sum_128); + + /* Handle remaining elements */ + for (; i < dim; i++) { + dot += ((int32_t)x[i]) * ((int32_t)y[i]); + } + + /* Convert to original range */ + float dotf = dot * scale_product; + float distance = 1.0f - dotf; + + /* Clamp distance to [0, 2] */ + if (distance < 0) distance = 0; + else if (distance > 2) distance = 2; + return distance; +} +#endif /* HAVE_AVX2 */ + +/* Q8 dot product: automatically selects best available implementation */ float vectors_distance_q8(const int8_t *x, const int8_t *y, uint32_t dim, float range_a, float range_b) { +#if defined(HAVE_AVX512) + if (dim >= 64 && VSET_USE_AVX512) { + return vectors_distance_q8_avx512(x, y, dim, range_a, range_b); + } +#endif + +#if defined(HAVE_AVX2) + if (dim >= 32 && VSET_USE_AVX2) { + return vectors_distance_q8_avx2(x, y, dim, range_a, range_b); + } +#endif + + /* Fallback to scalar implementation */ // Handle zero vectors special case. if (range_a == 0 || range_b == 0) { /* Zero vector distance from anything is 1.0 diff --git a/modules/vector-sets/tests/q8_similarity.py b/modules/vector-sets/tests/q8_similarity.py new file mode 100644 index 000000000..e0d532dae --- /dev/null +++ b/modules/vector-sets/tests/q8_similarity.py @@ -0,0 +1,71 @@ +from test import TestCase + +class Q8Similarity(TestCase): + def getname(self): + return "Q8 quantization: VSIM reported distance makes sense with 4D vectors" + + def test(self): + # Add two very similar vectors, one different + # Using same test vectors as basic_similarity.py for comparison + vec1 = [1, 0, 0, 0] + vec2 = [0.99, 0.01, 0, 0] + vec3 = [0.1, 1, -1, 0.5] + + # Add vectors using VALUES format with Q8 quantization + self.redis.execute_command('VADD', self.test_key, 'VALUES', 4, + *[str(x) for x in vec1], f'{self.test_key}:item:1', 'Q8') + self.redis.execute_command('VADD', self.test_key, 'VALUES', 4, + *[str(x) for x in vec2], f'{self.test_key}:item:2', 'Q8') + self.redis.execute_command('VADD', self.test_key, 'VALUES', 4, + *[str(x) for x in vec3], f'{self.test_key}:item:3', 'Q8') + + # Query similarity with vec1 + result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4, + *[str(x) for x in vec1], 'WITHSCORES') + + # Convert results to dictionary + results_dict = {} + for i in range(0, len(result), 2): + key = result[i].decode() + score = float(result[i+1]) + results_dict[key] = score + + # Verify results (same expectations as float32, allowing for quantization error) + assert results_dict[f'{self.test_key}:item:1'] > 0.99, "Self-similarity should be very high (Q8)" + assert results_dict[f'{self.test_key}:item:2'] > 0.99, "Similar vector should have high similarity (Q8)" + assert results_dict[f'{self.test_key}:item:3'] < 0.80, "Not very similar vector should have low similarity (Q8)" + + # Test extreme values with 512 dimensions to stress-test overflow safety + vec4 = [1.0] * 512 # All +127 after quantization + vec5 = [-1.0] * 512 # All -127 after quantization + vec6 = [1.0, -1.0] * 256 # Alternating +127, -127 + + # Add vectors using VALUES format with Q8 quantization + self.redis.execute_command('VADD', f'{self.test_key}:extreme', 'VALUES', 512, + *[str(x) for x in vec4], f'{self.test_key}:extreme:vec4', 'Q8') + self.redis.execute_command('VADD', f'{self.test_key}:extreme', 'VALUES', 512, + *[str(x) for x in vec5], f'{self.test_key}:extreme:vec5', 'Q8') + self.redis.execute_command('VADD', f'{self.test_key}:extreme', 'VALUES', 512, + *[str(x) for x in vec6], f'{self.test_key}:extreme:vec6', 'Q8') + + # Query vec4 against itself - worst-case positive accumulation (512 * 127 * 127 = 8,258,048) + result_vec4 = self.redis.execute_command('VSIM', f'{self.test_key}:extreme', 'VALUES', 512, + *[str(x) for x in vec4], 'WITHSCORES') + results_vec4 = {} + for i in range(0, len(result_vec4), 2): + key = result_vec4[i].decode() + score = float(result_vec4[i+1]) + results_vec4[key] = score + + # Verify extreme value handling + # VSIM returns similarity = 1.0 - distance/2.0, so: + # - Distance 0 (identical) → similarity 1.0 + # - Distance 2 (opposite) → similarity 0.0 + assert results_vec4[f'{self.test_key}:extreme:vec4'] > 0.999, \ + f"vec4 self-similarity should be very high, got {results_vec4[f'{self.test_key}:extreme:vec4']}" + assert results_vec4[f'{self.test_key}:extreme:vec5'] < 0.01, \ + f"vec4 vs vec5 (opposite extremes) should be near 0, got {results_vec4[f'{self.test_key}:extreme:vec5']}" + + # Alternating pattern should result in mid-range similarity (perpendicular) + assert 0.4 < results_vec4[f'{self.test_key}:extreme:vec6'] < 0.6, \ + f"vec4 vs vec6 (alternating) should be near 0.5, got {results_vec4[f'{self.test_key}:extreme:vec6']}" diff --git a/modules/vector-sets/tests/q8_vectorization.py b/modules/vector-sets/tests/q8_vectorization.py new file mode 100644 index 000000000..652c59cde --- /dev/null +++ b/modules/vector-sets/tests/q8_vectorization.py @@ -0,0 +1,85 @@ +from test import TestCase + +class Q8Vectorization(TestCase): + def getname(self): + return "Q8 quantization: verify vectorized vs scalar paths produce consistent results" + + def test(self): + # Test with different dimensions to exercise different code paths and boundaries: + # - dim=16: Scalar path (< 32) + # - dim=31: Largest scalar-only dimension (boundary) + # - dim=32: Smallest AVX2 dimension, no remainder (boundary) + # - dim=33: AVX2 with 1-element remainder + # - dim=63: AVX2 with 31-element remainder (largest AVX2-only) + # - dim=64: Smallest AVX512 dimension, no remainder (boundary) + # - dim=65: AVX512 with 1-element remainder + # - dim=128: AVX512 path with no remainder + # - dim=256, dim=512: Large dimensions to test overflow prevention + + test_dims = [16, 31, 32, 33, 63, 64, 65, 128, 256, 512] + + for dim in test_dims: + key = f'{self.test_key}:dim{dim}' + + # Test vectors with extreme values to verify overflow prevention: + # vec1: all +1.0 -> quantizes to +127 (max positive int8) + # vec2: all +0.99 -> quantizes to ~+126 (similar to vec1) + # vec3: all -1.0 -> quantizes to -127/-128 (max negative int8) + # vec4: alternating +1.0/-1.0 -> alternating +127/-127 (tests mixed signs) + vec1 = [1.0] * dim # All max positive + vec2 = [0.99] * dim # Similar to vec1 + vec3 = [-1.0] * dim # All max negative (opposite direction) + vec4 = [1.0 if i % 2 == 0 else -1.0 for i in range(dim)] # Alternating extreme values + + # Add vectors with Q8 quantization + self.redis.execute_command('VADD', key, 'VALUES', dim, + *[str(x) for x in vec1], f'{key}:item:1', 'Q8') + self.redis.execute_command('VADD', key, 'VALUES', dim, + *[str(x) for x in vec2], f'{key}:item:2', 'Q8') + self.redis.execute_command('VADD', key, 'VALUES', dim, + *[str(x) for x in vec3], f'{key}:item:3', 'Q8') + self.redis.execute_command('VADD', key, 'VALUES', dim, + *[str(x) for x in vec4], f'{key}:item:4', 'Q8') + + # Query similarity using vec1 (all max positive values) + # This exercises worst-case positive accumulation: dim * 127 * 127 + result = self.redis.execute_command('VSIM', key, 'VALUES', dim, + *[str(x) for x in vec1], 'WITHSCORES') + + # Convert results to dictionary + results_dict = {} + for i in range(0, len(result), 2): + k = result[i].decode() + score = float(result[i+1]) + results_dict[k] = score + + # Verify results - these would be wrong if overflow occurred + # Self-similarity should be ~1.0 (identical vectors) + assert results_dict[f'{key}:item:1'] > 0.99, \ + f"Dim {dim}: Self-similarity too low: {results_dict[f'{key}:item:1']}" + + # Similar vector should have high similarity + assert results_dict[f'{key}:item:2'] > 0.99, \ + f"Dim {dim}: Similar vector similarity too low: {results_dict[f'{key}:item:2']}" + + # Opposite vector should have very low similarity (~0.0) + # With overflow bug, this could give incorrect positive values + assert results_dict[f'{key}:item:3'] < 0.1, \ + f"Dim {dim}: Opposite vector similarity too high: {results_dict[f'{key}:item:3']}" + + # Alternating vector: dot product sums to ~0, so similarity ~0.5 + # (127*127) + (127*-127) + ... = 0, normalized gives ~0.5 + assert 0.4 < results_dict[f'{key}:item:4'] < 0.6, \ + f"Dim {dim}: Alternating vector similarity unexpected: {results_dict[f'{key}:item:4']}" + + # Also query with the alternating pattern to verify its self-similarity + result_alt = self.redis.execute_command('VSIM', key, 'VALUES', dim, + *[str(x) for x in vec4], 'WITHSCORES') + results_alt = {} + for i in range(0, len(result_alt), 2): + k = result_alt[i].decode() + score = float(result_alt[i+1]) + results_alt[k] = score + + assert results_alt[f'{key}:item:4'] > 0.99, \ + f"Dim {dim}: Alternating self-similarity too low: {results_alt[f'{key}:item:4']}"