Vectorized the quantized 8-bit vector distance calculation (#14474)

This pull request vectorizes the 8-bit (Q8) quantized vector-search
path in a similar way to the non-quantized path.
The assembly intrinsics are a bit more involved than in the
non-quantized path, since we are operating on 8-bit integers and have
to guard against overflow. After loading the 8-bit integers, they are
therefore sign-extended to 16 bits before being multiplied and
accumulated into 32-bit integers.
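
For reference, the scalar form of this computation looks roughly like
the sketch below. It is illustrative only: the helper name is made up,
and the actual scalar fallback is the one at the end of the diff (cut
off in this view). The sketch shows the overflow argument in numbers: a
single int8 product can reach 127 * 127 = 16,129, which does not fit in
8 bits, and a sum of just three such products already exceeds the int16
range, so the accumulator has to be 32-bit.

/* Scalar sketch (hypothetical helper, for illustration only). */
static float q8_dot_distance_scalar(const int8_t *x, const int8_t *y,
                                    uint32_t dim,
                                    float range_a, float range_b) {
    if (range_a == 0 || range_b == 0) return 1.0f; /* zero vector case */
    int32_t dot = 0;              /* 32-bit accumulator: |dot| <= dim * 16129 */
    for (uint32_t i = 0; i < dim; i++)
        dot += ((int32_t)x[i]) * ((int32_t)y[i]); /* widen, then multiply */
    /* Undo the per-vector quantization scale (values mapped to [-127, 127]). */
    float dotf = dot * (range_a / 127) * (range_b / 127);
    float distance = 1.0f - dotf; /* cosine distance, clamped to [0, 2] */
    if (distance < 0) distance = 0;
    else if (distance > 2) distance = 2;
    return distance;
}

The same bound applies per lane in the SIMD kernels: _mm256_madd_epi16 /
_mm512_madd_epi16 multiply 16-bit lanes and add adjacent pairs into
32-bit lanes, so each result lane is bounded by 2 * 127 * 127 = 32,258
per loop iteration, and overflowing a 32-bit accumulator lane would
require vectors with millions of dimensions.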

---------

Co-authored-by: debing.sun <debing.sun@redis.com>
Martin Dimitrov 2026-01-26 18:25:59 -07:00 committed by GitHub
parent 48aa1ce524
commit 37f685908e
3 changed files with 305 additions and 2 deletions


@@ -64,8 +64,8 @@
#endif
#if defined (HAVE_AVX512)
-#define ATTRIBUTE_TARGET_AVX512 __attribute__((target("avx512f,fma")))
-#define VSET_USE_AVX512 (__builtin_cpu_supports("avx512f"))
+#define ATTRIBUTE_TARGET_AVX512 __attribute__((target("avx512f,avx512bw,fma")))
+#define VSET_USE_AVX512 (__builtin_cpu_supports("avx512f") && __builtin_cpu_supports("avx512bw"))
#else
#define ATTRIBUTE_TARGET_AVX512
#define VSET_USE_AVX512 0
@@ -347,8 +347,155 @@ float vectors_distance_float(const float *x, const float *y, uint32_t dim) {
}
/* Q8 quants dotproduct. We do integer math and later fix it by range. */

#if defined(HAVE_AVX512)
/* AVX512 optimized dot product for Q8 vectors */
ATTRIBUTE_TARGET_AVX512
float vectors_distance_q8_avx512(const int8_t *x, const int8_t *y, uint32_t dim,
                                 float range_a, float range_b) {
    // Handle zero vectors special case.
    if (range_a == 0 || range_b == 0) {
        return 1.0f;
    }

    const float scale_product = (range_a/127) * (range_b/127);
    __m512i sum = _mm512_setzero_si512();
    uint32_t i;

    /* Process 64 int8 elements at a time with AVX512 */
    for (i = 0; i + 63 < dim; i += 64) {
        /* Load 64 int8 values */
        __m512i vx = _mm512_loadu_si512((__m512i*)&x[i]);
        __m512i vy = _mm512_loadu_si512((__m512i*)&y[i]);

        /* Unpack and multiply-add in 32-bit precision
         * This is done in two steps: lower 32 bytes and upper 32 bytes */

        /* Process lower 32 bytes (256 bits) */
        __m256i vx_lo = _mm512_extracti64x4_epi64(vx, 0);
        __m256i vy_lo = _mm512_extracti64x4_epi64(vy, 0);

        /* Extend int8 to int16 */
        __m512i vx_lo_16 = _mm512_cvtepi8_epi16(vx_lo);
        __m512i vy_lo_16 = _mm512_cvtepi8_epi16(vy_lo);

        /* Multiply and accumulate to int32 */
        __m512i prod_lo = _mm512_madd_epi16(vx_lo_16, vy_lo_16);
        sum = _mm512_add_epi32(sum, prod_lo);

        /* Process upper 32 bytes (256 bits) */
        __m256i vx_hi = _mm512_extracti64x4_epi64(vx, 1);
        __m256i vy_hi = _mm512_extracti64x4_epi64(vy, 1);
        __m512i vx_hi_16 = _mm512_cvtepi8_epi16(vx_hi);
        __m512i vy_hi_16 = _mm512_cvtepi8_epi16(vy_hi);
        __m512i prod_hi = _mm512_madd_epi16(vx_hi_16, vy_hi_16);
        sum = _mm512_add_epi32(sum, prod_hi);
    }

    /* Horizontal sum of the 16 int32 elements in sum */
    int32_t dot = _mm512_reduce_add_epi32(sum);

    /* Handle remaining elements */
    for (; i < dim; i++) {
        dot += ((int32_t)x[i]) * ((int32_t)y[i]);
    }

    /* Convert to original range */
    float dotf = dot * scale_product;
    float distance = 1.0f - dotf;

    /* Clamp distance to [0, 2] */
    if (distance < 0) distance = 0;
    else if (distance > 2) distance = 2;
    return distance;
}
#endif /* HAVE_AVX512 */

#if defined(HAVE_AVX2)
/* AVX2 optimized dot product for Q8 vectors */
ATTRIBUTE_TARGET_AVX2
float vectors_distance_q8_avx2(const int8_t *x, const int8_t *y, uint32_t dim,
                               float range_a, float range_b) {
    // Handle zero vectors special case.
    if (range_a == 0 || range_b == 0) {
        return 1.0f;
    }

    const float scale_product = (range_a/127) * (range_b/127);
    __m256i sum = _mm256_setzero_si256();
    uint32_t i;

    /* Process 32 int8 elements at a time with AVX2 */
    for (i = 0; i + 31 < dim; i += 32) {
        /* Load 32 int8 values */
        __m256i vx = _mm256_loadu_si256((__m256i*)&x[i]);
        __m256i vy = _mm256_loadu_si256((__m256i*)&y[i]);

        /* Split into lower and upper 16 bytes */
        __m128i vx_lo = _mm256_extracti128_si256(vx, 0);
        __m128i vy_lo = _mm256_extracti128_si256(vy, 0);
        __m128i vx_hi = _mm256_extracti128_si256(vx, 1);
        __m128i vy_hi = _mm256_extracti128_si256(vy, 1);

        /* Extend int8 to int16 for lower half */
        __m256i vx_lo_16 = _mm256_cvtepi8_epi16(vx_lo);
        __m256i vy_lo_16 = _mm256_cvtepi8_epi16(vy_lo);

        /* Multiply and accumulate (madd does multiply adjacent pairs and add) */
        __m256i prod_lo = _mm256_madd_epi16(vx_lo_16, vy_lo_16);
        sum = _mm256_add_epi32(sum, prod_lo);

        /* Extend int8 to int16 for upper half */
        __m256i vx_hi_16 = _mm256_cvtepi8_epi16(vx_hi);
        __m256i vy_hi_16 = _mm256_cvtepi8_epi16(vy_hi);
        __m256i prod_hi = _mm256_madd_epi16(vx_hi_16, vy_hi_16);
        sum = _mm256_add_epi32(sum, prod_hi);
    }

    /* Horizontal sum of the 8 int32 elements in sum */
    __m128i sum_hi = _mm256_extracti128_si256(sum, 1);
    __m128i sum_lo = _mm256_castsi256_si128(sum);
    __m128i sum_128 = _mm_add_epi32(sum_hi, sum_lo);
    sum_128 = _mm_hadd_epi32(sum_128, sum_128);
    sum_128 = _mm_hadd_epi32(sum_128, sum_128);
    int32_t dot = _mm_cvtsi128_si32(sum_128);

    /* Handle remaining elements */
    for (; i < dim; i++) {
        dot += ((int32_t)x[i]) * ((int32_t)y[i]);
    }

    /* Convert to original range */
    float dotf = dot * scale_product;
    float distance = 1.0f - dotf;

    /* Clamp distance to [0, 2] */
    if (distance < 0) distance = 0;
    else if (distance > 2) distance = 2;
    return distance;
}
#endif /* HAVE_AVX2 */

/* Q8 dot product: automatically selects best available implementation */
float vectors_distance_q8(const int8_t *x, const int8_t *y, uint32_t dim,
                          float range_a, float range_b) {
#if defined(HAVE_AVX512)
    if (dim >= 64 && VSET_USE_AVX512) {
        return vectors_distance_q8_avx512(x, y, dim, range_a, range_b);
    }
#endif
#if defined(HAVE_AVX2)
    if (dim >= 32 && VSET_USE_AVX2) {
        return vectors_distance_q8_avx2(x, y, dim, range_a, range_b);
    }
#endif
    /* Fallback to scalar implementation */
    // Handle zero vectors special case.
    if (range_a == 0 || range_b == 0) {
        /* Zero vector distance from anything is 1.0


@@ -0,0 +1,71 @@
from test import TestCase

class Q8Similarity(TestCase):
    def getname(self):
        return "Q8 quantization: VSIM reported distance makes sense with 4D vectors"

    def test(self):
        # Add two very similar vectors, one different
        # Using same test vectors as basic_similarity.py for comparison
        vec1 = [1, 0, 0, 0]
        vec2 = [0.99, 0.01, 0, 0]
        vec3 = [0.1, 1, -1, 0.5]

        # Add vectors using VALUES format with Q8 quantization
        self.redis.execute_command('VADD', self.test_key, 'VALUES', 4,
                                   *[str(x) for x in vec1], f'{self.test_key}:item:1', 'Q8')
        self.redis.execute_command('VADD', self.test_key, 'VALUES', 4,
                                   *[str(x) for x in vec2], f'{self.test_key}:item:2', 'Q8')
        self.redis.execute_command('VADD', self.test_key, 'VALUES', 4,
                                   *[str(x) for x in vec3], f'{self.test_key}:item:3', 'Q8')

        # Query similarity with vec1
        result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
                                            *[str(x) for x in vec1], 'WITHSCORES')

        # Convert results to dictionary
        results_dict = {}
        for i in range(0, len(result), 2):
            key = result[i].decode()
            score = float(result[i+1])
            results_dict[key] = score

        # Verify results (same expectations as float32, allowing for quantization error)
        assert results_dict[f'{self.test_key}:item:1'] > 0.99, "Self-similarity should be very high (Q8)"
        assert results_dict[f'{self.test_key}:item:2'] > 0.99, "Similar vector should have high similarity (Q8)"
        assert results_dict[f'{self.test_key}:item:3'] < 0.80, "Not very similar vector should have low similarity (Q8)"

        # Test extreme values with 512 dimensions to stress-test overflow safety
        vec4 = [1.0] * 512        # All +127 after quantization
        vec5 = [-1.0] * 512       # All -127 after quantization
        vec6 = [1.0, -1.0] * 256  # Alternating +127, -127

        # Add vectors using VALUES format with Q8 quantization
        self.redis.execute_command('VADD', f'{self.test_key}:extreme', 'VALUES', 512,
                                   *[str(x) for x in vec4], f'{self.test_key}:extreme:vec4', 'Q8')
        self.redis.execute_command('VADD', f'{self.test_key}:extreme', 'VALUES', 512,
                                   *[str(x) for x in vec5], f'{self.test_key}:extreme:vec5', 'Q8')
        self.redis.execute_command('VADD', f'{self.test_key}:extreme', 'VALUES', 512,
                                   *[str(x) for x in vec6], f'{self.test_key}:extreme:vec6', 'Q8')

        # Query vec4 against itself - worst-case positive accumulation (512 * 127 * 127 = 8,258,048)
        result_vec4 = self.redis.execute_command('VSIM', f'{self.test_key}:extreme', 'VALUES', 512,
                                                 *[str(x) for x in vec4], 'WITHSCORES')
        results_vec4 = {}
        for i in range(0, len(result_vec4), 2):
            key = result_vec4[i].decode()
            score = float(result_vec4[i+1])
            results_vec4[key] = score

        # Verify extreme value handling
        # VSIM returns similarity = 1.0 - distance/2.0, so:
        # - Distance 0 (identical) → similarity 1.0
        # - Distance 2 (opposite) → similarity 0.0
        assert results_vec4[f'{self.test_key}:extreme:vec4'] > 0.999, \
            f"vec4 self-similarity should be very high, got {results_vec4[f'{self.test_key}:extreme:vec4']}"
        assert results_vec4[f'{self.test_key}:extreme:vec5'] < 0.01, \
            f"vec4 vs vec5 (opposite extremes) should be near 0, got {results_vec4[f'{self.test_key}:extreme:vec5']}"
        # Alternating pattern should result in mid-range similarity (perpendicular)
        assert 0.4 < results_vec4[f'{self.test_key}:extreme:vec6'] < 0.6, \
            f"vec4 vs vec6 (alternating) should be near 0.5, got {results_vec4[f'{self.test_key}:extreme:vec6']}"


@@ -0,0 +1,85 @@
from test import TestCase

class Q8Vectorization(TestCase):
    def getname(self):
        return "Q8 quantization: verify vectorized vs scalar paths produce consistent results"

    def test(self):
        # Test with different dimensions to exercise different code paths and boundaries:
        # - dim=16:  Scalar path (< 32)
        # - dim=31:  Largest scalar-only dimension (boundary)
        # - dim=32:  Smallest AVX2 dimension, no remainder (boundary)
        # - dim=33:  AVX2 with 1-element remainder
        # - dim=63:  AVX2 with 31-element remainder (largest AVX2-only)
        # - dim=64:  Smallest AVX512 dimension, no remainder (boundary)
        # - dim=65:  AVX512 with 1-element remainder
        # - dim=128: AVX512 path with no remainder
        # - dim=256, dim=512: Large dimensions to test overflow prevention
        test_dims = [16, 31, 32, 33, 63, 64, 65, 128, 256, 512]

        for dim in test_dims:
            key = f'{self.test_key}:dim{dim}'

            # Test vectors with extreme values to verify overflow prevention:
            # vec1: all +1.0 -> quantizes to +127 (max positive int8)
            # vec2: all +0.99 -> quantizes to ~+126 (similar to vec1)
            # vec3: all -1.0 -> quantizes to -127/-128 (max negative int8)
            # vec4: alternating +1.0/-1.0 -> alternating +127/-127 (tests mixed signs)
            vec1 = [1.0] * dim   # All max positive
            vec2 = [0.99] * dim  # Similar to vec1
            vec3 = [-1.0] * dim  # All max negative (opposite direction)
            vec4 = [1.0 if i % 2 == 0 else -1.0 for i in range(dim)]  # Alternating extreme values

            # Add vectors with Q8 quantization
            self.redis.execute_command('VADD', key, 'VALUES', dim,
                                       *[str(x) for x in vec1], f'{key}:item:1', 'Q8')
            self.redis.execute_command('VADD', key, 'VALUES', dim,
                                       *[str(x) for x in vec2], f'{key}:item:2', 'Q8')
            self.redis.execute_command('VADD', key, 'VALUES', dim,
                                       *[str(x) for x in vec3], f'{key}:item:3', 'Q8')
            self.redis.execute_command('VADD', key, 'VALUES', dim,
                                       *[str(x) for x in vec4], f'{key}:item:4', 'Q8')

            # Query similarity using vec1 (all max positive values)
            # This exercises worst-case positive accumulation: dim * 127 * 127
            result = self.redis.execute_command('VSIM', key, 'VALUES', dim,
                                                *[str(x) for x in vec1], 'WITHSCORES')

            # Convert results to dictionary
            results_dict = {}
            for i in range(0, len(result), 2):
                k = result[i].decode()
                score = float(result[i+1])
                results_dict[k] = score

            # Verify results - these would be wrong if overflow occurred
            # Self-similarity should be ~1.0 (identical vectors)
            assert results_dict[f'{key}:item:1'] > 0.99, \
                f"Dim {dim}: Self-similarity too low: {results_dict[f'{key}:item:1']}"
            # Similar vector should have high similarity
            assert results_dict[f'{key}:item:2'] > 0.99, \
                f"Dim {dim}: Similar vector similarity too low: {results_dict[f'{key}:item:2']}"
            # Opposite vector should have very low similarity (~0.0)
            # With overflow bug, this could give incorrect positive values
            assert results_dict[f'{key}:item:3'] < 0.1, \
                f"Dim {dim}: Opposite vector similarity too high: {results_dict[f'{key}:item:3']}"
            # Alternating vector: dot product sums to ~0, so similarity ~0.5
            # (127*127) + (127*-127) + ... = 0, normalized gives ~0.5
            assert 0.4 < results_dict[f'{key}:item:4'] < 0.6, \
                f"Dim {dim}: Alternating vector similarity unexpected: {results_dict[f'{key}:item:4']}"

            # Also query with the alternating pattern to verify its self-similarity
            result_alt = self.redis.execute_command('VSIM', key, 'VALUES', dim,
                                                    *[str(x) for x in vec4], 'WITHSCORES')
            results_alt = {}
            for i in range(0, len(result_alt), 2):
                k = result_alt[i].decode()
                score = float(result_alt[i+1])
                results_alt[k] = score

            assert results_alt[f'{key}:item:4'] > 0.99, \
                f"Dim {dim}: Alternating self-similarity too low: {results_alt[f'{key}:item:4']}"