Vectorized the quantized 8-bit vector distance calculation (#14474)

This pull request vectorizes the 8-bit (Q8) quantized vector-search
path in a similar way to the non-quantized path.
The assembly intrinsics are a bit more involved than in the
non-quantized path, since we are operating on 8-bit integers and have
to guard against overflow. After loading the 8-bit integers, they are
therefore sign-extended to 16 bits before being multiplied and
accumulated into 32-bit integers.
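
For reference, the scalar form of this computation looks roughly like
the sketch below. It is illustrative only: the helper name is made up,
and the actual scalar fallback is the one at the end of the diff (cut
off in this view). The sketch shows the overflow argument in numbers: a
single int8 product can reach 127 * 127 = 16,129, which does not fit in
8 bits, and a sum of just three such products already exceeds the int16
range, so the accumulator has to be 32-bit.

/* Scalar sketch (hypothetical helper, for illustration only). */
static float q8_dot_distance_scalar(const int8_t *x, const int8_t *y,
                                    uint32_t dim,
                                    float range_a, float range_b) {
    if (range_a == 0 || range_b == 0) return 1.0f; /* zero vector case */
    int32_t dot = 0;              /* 32-bit accumulator: |dot| <= dim * 16129 */
    for (uint32_t i = 0; i < dim; i++)
        dot += ((int32_t)x[i]) * ((int32_t)y[i]); /* widen, then multiply */
    /* Undo the per-vector quantization scale (values mapped to [-127, 127]). */
    float dotf = dot * (range_a / 127) * (range_b / 127);
    float distance = 1.0f - dotf; /* cosine distance, clamped to [0, 2] */
    if (distance < 0) distance = 0;
    else if (distance > 2) distance = 2;
    return distance;
}

The same bound applies per lane in the SIMD kernels: _mm256_madd_epi16 /
_mm512_madd_epi16 multiply 16-bit lanes and add adjacent pairs into
32-bit lanes, so each result lane is bounded by 2 * 127 * 127 = 32,258
per loop iteration, and overflowing a 32-bit accumulator lane would
require vectors with millions of dimensions.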

---------

Co-authored-by: debing.sun <debing.sun@redis.com>
Martin Dimitrov 2026-01-26 18:25:59 -07:00 committed by GitHub
parent 48aa1ce524
commit 37f685908e
3 changed files with 305 additions and 2 deletions


@@ -64,8 +64,8 @@
#endif
#if defined (HAVE_AVX512)
-#define ATTRIBUTE_TARGET_AVX512 __attribute__((target("avx512f,fma")))
-#define VSET_USE_AVX512 (__builtin_cpu_supports("avx512f"))
+#define ATTRIBUTE_TARGET_AVX512 __attribute__((target("avx512f,avx512bw,fma")))
+#define VSET_USE_AVX512 (__builtin_cpu_supports("avx512f") && __builtin_cpu_supports("avx512bw"))
#else
#define ATTRIBUTE_TARGET_AVX512
#define VSET_USE_AVX512 0
@@ -347,8 +347,155 @@ float vectors_distance_float(const float *x, const float *y, uint32_t dim) {
}
/* Q8 quants dotproduct. We do integer math and later fix it by range. */

#if defined(HAVE_AVX512)
/* AVX512 optimized dot product for Q8 vectors */
ATTRIBUTE_TARGET_AVX512
float vectors_distance_q8_avx512(const int8_t *x, const int8_t *y, uint32_t dim,
                                 float range_a, float range_b) {
    // Handle zero vectors special case.
    if (range_a == 0 || range_b == 0) {
        return 1.0f;
    }

    const float scale_product = (range_a/127) * (range_b/127);
    __m512i sum = _mm512_setzero_si512();
    uint32_t i;

    /* Process 64 int8 elements at a time with AVX512 */
    for (i = 0; i + 63 < dim; i += 64) {
        /* Load 64 int8 values */
        __m512i vx = _mm512_loadu_si512((__m512i*)&x[i]);
        __m512i vy = _mm512_loadu_si512((__m512i*)&y[i]);

        /* Unpack and multiply-add in 32-bit precision
         * This is done in two steps: lower 32 bytes and upper 32 bytes */

        /* Process lower 32 bytes (256 bits) */
        __m256i vx_lo = _mm512_extracti64x4_epi64(vx, 0);
        __m256i vy_lo = _mm512_extracti64x4_epi64(vy, 0);

        /* Extend int8 to int16 */
        __m512i vx_lo_16 = _mm512_cvtepi8_epi16(vx_lo);
        __m512i vy_lo_16 = _mm512_cvtepi8_epi16(vy_lo);

        /* Multiply and accumulate to int32 */
        __m512i prod_lo = _mm512_madd_epi16(vx_lo_16, vy_lo_16);
        sum = _mm512_add_epi32(sum, prod_lo);

        /* Process upper 32 bytes (256 bits) */
        __m256i vx_hi = _mm512_extracti64x4_epi64(vx, 1);
        __m256i vy_hi = _mm512_extracti64x4_epi64(vy, 1);
        __m512i vx_hi_16 = _mm512_cvtepi8_epi16(vx_hi);
        __m512i vy_hi_16 = _mm512_cvtepi8_epi16(vy_hi);
        __m512i prod_hi = _mm512_madd_epi16(vx_hi_16, vy_hi_16);
        sum = _mm512_add_epi32(sum, prod_hi);
    }

    /* Horizontal sum of the 16 int32 elements in sum */
    int32_t dot = _mm512_reduce_add_epi32(sum);

    /* Handle remaining elements */
    for (; i < dim; i++) {
        dot += ((int32_t)x[i]) * ((int32_t)y[i]);
    }

    /* Convert to original range */
    float dotf = dot * scale_product;
    float distance = 1.0f - dotf;

    /* Clamp distance to [0, 2] */
    if (distance < 0) distance = 0;
    else if (distance > 2) distance = 2;
    return distance;
}
#endif /* HAVE_AVX512 */

#if defined(HAVE_AVX2)
/* AVX2 optimized dot product for Q8 vectors */
ATTRIBUTE_TARGET_AVX2
float vectors_distance_q8_avx2(const int8_t *x, const int8_t *y, uint32_t dim,
                               float range_a, float range_b) {
    // Handle zero vectors special case.
    if (range_a == 0 || range_b == 0) {
        return 1.0f;
    }

    const float scale_product = (range_a/127) * (range_b/127);
    __m256i sum = _mm256_setzero_si256();
    uint32_t i;

    /* Process 32 int8 elements at a time with AVX2 */
    for (i = 0; i + 31 < dim; i += 32) {
        /* Load 32 int8 values */
        __m256i vx = _mm256_loadu_si256((__m256i*)&x[i]);
        __m256i vy = _mm256_loadu_si256((__m256i*)&y[i]);

        /* Split into lower and upper 16 bytes */
        __m128i vx_lo = _mm256_extracti128_si256(vx, 0);
        __m128i vy_lo = _mm256_extracti128_si256(vy, 0);
        __m128i vx_hi = _mm256_extracti128_si256(vx, 1);
        __m128i vy_hi = _mm256_extracti128_si256(vy, 1);

        /* Extend int8 to int16 for lower half */
        __m256i vx_lo_16 = _mm256_cvtepi8_epi16(vx_lo);
        __m256i vy_lo_16 = _mm256_cvtepi8_epi16(vy_lo);

        /* Multiply and accumulate (madd does multiply adjacent pairs and add) */
        __m256i prod_lo = _mm256_madd_epi16(vx_lo_16, vy_lo_16);
        sum = _mm256_add_epi32(sum, prod_lo);

        /* Extend int8 to int16 for upper half */
        __m256i vx_hi_16 = _mm256_cvtepi8_epi16(vx_hi);
        __m256i vy_hi_16 = _mm256_cvtepi8_epi16(vy_hi);
        __m256i prod_hi = _mm256_madd_epi16(vx_hi_16, vy_hi_16);
        sum = _mm256_add_epi32(sum, prod_hi);
    }

    /* Horizontal sum of the 8 int32 elements in sum */
    __m128i sum_hi = _mm256_extracti128_si256(sum, 1);
    __m128i sum_lo = _mm256_castsi256_si128(sum);
    __m128i sum_128 = _mm_add_epi32(sum_hi, sum_lo);
    sum_128 = _mm_hadd_epi32(sum_128, sum_128);
    sum_128 = _mm_hadd_epi32(sum_128, sum_128);
    int32_t dot = _mm_cvtsi128_si32(sum_128);

    /* Handle remaining elements */
    for (; i < dim; i++) {
        dot += ((int32_t)x[i]) * ((int32_t)y[i]);
    }

    /* Convert to original range */
    float dotf = dot * scale_product;
    float distance = 1.0f - dotf;

    /* Clamp distance to [0, 2] */
    if (distance < 0) distance = 0;
    else if (distance > 2) distance = 2;
    return distance;
}
#endif /* HAVE_AVX2 */

/* Q8 dot product: automatically selects best available implementation */
float vectors_distance_q8(const int8_t *x, const int8_t *y, uint32_t dim,
                          float range_a, float range_b) {
#if defined(HAVE_AVX512)
    if (dim >= 64 && VSET_USE_AVX512) {
        return vectors_distance_q8_avx512(x, y, dim, range_a, range_b);
    }
#endif
#if defined(HAVE_AVX2)
    if (dim >= 32 && VSET_USE_AVX2) {
        return vectors_distance_q8_avx2(x, y, dim, range_a, range_b);
    }
#endif
    /* Fallback to scalar implementation */
    // Handle zero vectors special case.
    if (range_a == 0 || range_b == 0) {
        /* Zero vector distance from anything is 1.0


@@ -0,0 +1,71 @@
from test import TestCase

class Q8Similarity(TestCase):
    def getname(self):
        return "Q8 quantization: VSIM reported distance makes sense with 4D vectors"

    def test(self):
        # Add two very similar vectors, one different
        # Using same test vectors as basic_similarity.py for comparison
        vec1 = [1, 0, 0, 0]
        vec2 = [0.99, 0.01, 0, 0]
        vec3 = [0.1, 1, -1, 0.5]

        # Add vectors using VALUES format with Q8 quantization
        self.redis.execute_command('VADD', self.test_key, 'VALUES', 4,
                                   *[str(x) for x in vec1], f'{self.test_key}:item:1', 'Q8')
        self.redis.execute_command('VADD', self.test_key, 'VALUES', 4,
                                   *[str(x) for x in vec2], f'{self.test_key}:item:2', 'Q8')
        self.redis.execute_command('VADD', self.test_key, 'VALUES', 4,
                                   *[str(x) for x in vec3], f'{self.test_key}:item:3', 'Q8')

        # Query similarity with vec1
        result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
                                            *[str(x) for x in vec1], 'WITHSCORES')

        # Convert results to dictionary
        results_dict = {}
        for i in range(0, len(result), 2):
            key = result[i].decode()
            score = float(result[i+1])
            results_dict[key] = score

        # Verify results (same expectations as float32, allowing for quantization error)
        assert results_dict[f'{self.test_key}:item:1'] > 0.99, "Self-similarity should be very high (Q8)"
        assert results_dict[f'{self.test_key}:item:2'] > 0.99, "Similar vector should have high similarity (Q8)"
        assert results_dict[f'{self.test_key}:item:3'] < 0.80, "Not very similar vector should have low similarity (Q8)"

        # Test extreme values with 512 dimensions to stress-test overflow safety
        vec4 = [1.0] * 512        # All +127 after quantization
        vec5 = [-1.0] * 512       # All -127 after quantization
        vec6 = [1.0, -1.0] * 256  # Alternating +127, -127

        # Add vectors using VALUES format with Q8 quantization
        self.redis.execute_command('VADD', f'{self.test_key}:extreme', 'VALUES', 512,
                                   *[str(x) for x in vec4], f'{self.test_key}:extreme:vec4', 'Q8')
        self.redis.execute_command('VADD', f'{self.test_key}:extreme', 'VALUES', 512,
                                   *[str(x) for x in vec5], f'{self.test_key}:extreme:vec5', 'Q8')
        self.redis.execute_command('VADD', f'{self.test_key}:extreme', 'VALUES', 512,
                                   *[str(x) for x in vec6], f'{self.test_key}:extreme:vec6', 'Q8')

        # Query vec4 against itself - worst-case positive accumulation (512 * 127 * 127 = 8,258,048)
        result_vec4 = self.redis.execute_command('VSIM', f'{self.test_key}:extreme', 'VALUES', 512,
                                                 *[str(x) for x in vec4], 'WITHSCORES')
        results_vec4 = {}
        for i in range(0, len(result_vec4), 2):
            key = result_vec4[i].decode()
            score = float(result_vec4[i+1])
            results_vec4[key] = score

        # Verify extreme value handling
        # VSIM returns similarity = 1.0 - distance/2.0, so:
        # - Distance 0 (identical) → similarity 1.0
        # - Distance 2 (opposite) → similarity 0.0
        assert results_vec4[f'{self.test_key}:extreme:vec4'] > 0.999, \
            f"vec4 self-similarity should be very high, got {results_vec4[f'{self.test_key}:extreme:vec4']}"
        assert results_vec4[f'{self.test_key}:extreme:vec5'] < 0.01, \
            f"vec4 vs vec5 (opposite extremes) should be near 0, got {results_vec4[f'{self.test_key}:extreme:vec5']}"
        # Alternating pattern should result in mid-range similarity (perpendicular)
        assert 0.4 < results_vec4[f'{self.test_key}:extreme:vec6'] < 0.6, \
            f"vec4 vs vec6 (alternating) should be near 0.5, got {results_vec4[f'{self.test_key}:extreme:vec6']}"


@@ -0,0 +1,85 @@
from test import TestCase

class Q8Vectorization(TestCase):
    def getname(self):
        return "Q8 quantization: verify vectorized vs scalar paths produce consistent results"

    def test(self):
        # Test with different dimensions to exercise different code paths and boundaries:
        # - dim=16:  Scalar path (< 32)
        # - dim=31:  Largest scalar-only dimension (boundary)
        # - dim=32:  Smallest AVX2 dimension, no remainder (boundary)
        # - dim=33:  AVX2 with 1-element remainder
        # - dim=63:  AVX2 with 31-element remainder (largest AVX2-only)
        # - dim=64:  Smallest AVX512 dimension, no remainder (boundary)
        # - dim=65:  AVX512 with 1-element remainder
        # - dim=128: AVX512 path with no remainder
        # - dim=256, dim=512: Large dimensions to test overflow prevention
        test_dims = [16, 31, 32, 33, 63, 64, 65, 128, 256, 512]

        for dim in test_dims:
            key = f'{self.test_key}:dim{dim}'

            # Test vectors with extreme values to verify overflow prevention:
            # vec1: all +1.0 -> quantizes to +127 (max positive int8)
            # vec2: all +0.99 -> quantizes to ~+126 (similar to vec1)
            # vec3: all -1.0 -> quantizes to -127/-128 (max negative int8)
            # vec4: alternating +1.0/-1.0 -> alternating +127/-127 (tests mixed signs)
            vec1 = [1.0] * dim   # All max positive
            vec2 = [0.99] * dim  # Similar to vec1
            vec3 = [-1.0] * dim  # All max negative (opposite direction)
            vec4 = [1.0 if i % 2 == 0 else -1.0 for i in range(dim)]  # Alternating extreme values

            # Add vectors with Q8 quantization
            self.redis.execute_command('VADD', key, 'VALUES', dim,
                                       *[str(x) for x in vec1], f'{key}:item:1', 'Q8')
            self.redis.execute_command('VADD', key, 'VALUES', dim,
                                       *[str(x) for x in vec2], f'{key}:item:2', 'Q8')
            self.redis.execute_command('VADD', key, 'VALUES', dim,
                                       *[str(x) for x in vec3], f'{key}:item:3', 'Q8')
            self.redis.execute_command('VADD', key, 'VALUES', dim,
                                       *[str(x) for x in vec4], f'{key}:item:4', 'Q8')

            # Query similarity using vec1 (all max positive values)
            # This exercises worst-case positive accumulation: dim * 127 * 127
            result = self.redis.execute_command('VSIM', key, 'VALUES', dim,
                                                *[str(x) for x in vec1], 'WITHSCORES')

            # Convert results to dictionary
            results_dict = {}
            for i in range(0, len(result), 2):
                k = result[i].decode()
                score = float(result[i+1])
                results_dict[k] = score

            # Verify results - these would be wrong if overflow occurred
            # Self-similarity should be ~1.0 (identical vectors)
            assert results_dict[f'{key}:item:1'] > 0.99, \
                f"Dim {dim}: Self-similarity too low: {results_dict[f'{key}:item:1']}"
            # Similar vector should have high similarity
            assert results_dict[f'{key}:item:2'] > 0.99, \
                f"Dim {dim}: Similar vector similarity too low: {results_dict[f'{key}:item:2']}"
            # Opposite vector should have very low similarity (~0.0)
            # With overflow bug, this could give incorrect positive values
            assert results_dict[f'{key}:item:3'] < 0.1, \
                f"Dim {dim}: Opposite vector similarity too high: {results_dict[f'{key}:item:3']}"
            # Alternating vector: dot product sums to ~0, so similarity ~0.5
            # (127*127) + (127*-127) + ... = 0, normalized gives ~0.5
            assert 0.4 < results_dict[f'{key}:item:4'] < 0.6, \
                f"Dim {dim}: Alternating vector similarity unexpected: {results_dict[f'{key}:item:4']}"

            # Also query with the alternating pattern to verify its self-similarity
            result_alt = self.redis.execute_command('VSIM', key, 'VALUES', dim,
                                                    *[str(x) for x in vec4], 'WITHSCORES')
            results_alt = {}
            for i in range(0, len(result_alt), 2):
                k = result_alt[i].decode()
                score = float(result_alt[i+1])
                results_alt[k] = score

            assert results_alt[f'{key}:item:4'] > 0.99, \
                f"Dim {dim}: Alternating self-similarity too low: {results_alt[f'{key}:item:4']}"