Mirror of https://github.com/redis/redis.git, synced 2026-02-03 20:39:54 -05:00
Vectorized the quantized 8-bit vector distance calculation (#14474)
This pull request vectorizes the 8-bit quantized vector-search path in a similar way to the non-quantized path. The assembly intrinsics are a bit more complicated than in the non-quantized path, since we are operating on 8-bit integers and need to worry about preventing overflow. Thus, after loading the 8-bit integers, they are sign-extended to 16 bits before being multiplied and accumulated into 32-bit integers.

Co-authored-by: debing.sun <debing.sun@redis.com>
Parent: 48aa1ce524
Commit: 37f685908e
3 changed files with 305 additions and 2 deletions
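For context, here is a minimal scalar sketch of the Q8 distance the SIMD kernels in this diff compute. It is not code from the patch (the helper name is made up); it reuses the scaling and clamping that appear in the diff below, and shows why the math has to be widened: each int8 product needs 16 bits, and the running sum needs 32 bits.

#include <stdint.h>

/* Illustrative scalar sketch of the Q8 (int8-quantized) cosine-style distance.
 * Components are int8 in roughly [-127, 127]; range_a/range_b undo the
 * per-vector quantization scale, exactly as in the patch below. */
float q8_distance_scalar(const int8_t *x, const int8_t *y, uint32_t dim,
                         float range_a, float range_b) {
    /* Each int8*int8 product can reach 127*127 = 16129, so it needs 16 bits;
     * the sum over the whole vector needs 32 bits, e.g. dim = 512 gives a
     * worst case of 512 * 16129 = 8,258,048, far beyond int16 range. */
    int32_t dot = 0;
    for (uint32_t i = 0; i < dim; i++)
        dot += (int32_t)x[i] * (int32_t)y[i];

    /* Undo the quantization scale, then map the dot product to a distance
     * in [0, 2] (0 = same direction, 2 = opposite direction). */
    float dotf = dot * (range_a / 127.0f) * (range_b / 127.0f);
    float distance = 1.0f - dotf;
    if (distance < 0) distance = 0;
    else if (distance > 2) distance = 2;
    return distance;
}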
@@ -64,8 +64,8 @@
#endif

#if defined (HAVE_AVX512)
-#define ATTRIBUTE_TARGET_AVX512 __attribute__((target("avx512f,fma")))
-#define VSET_USE_AVX512 (__builtin_cpu_supports("avx512f"))
+#define ATTRIBUTE_TARGET_AVX512 __attribute__((target("avx512f,avx512bw,fma")))
+#define VSET_USE_AVX512 (__builtin_cpu_supports("avx512f") && __builtin_cpu_supports("avx512bw"))
#else
#define ATTRIBUTE_TARGET_AVX512
#define VSET_USE_AVX512 0
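A note on the hunk above: the new Q8 kernel uses 512-bit byte/word integer intrinsics (_mm512_cvtepi8_epi16, _mm512_madd_epi16), which are AVX512BW instructions rather than plain AVX512F, hence both the extra target attribute and the extra runtime check. A tiny standalone sketch of that runtime dispatch pattern (illustrative only; it mirrors the macros above and the selection logic further down):

#include <stdio.h>

int main(void) {
    /* GCC/Clang expose runtime CPU feature detection via __builtin_cpu_supports,
     * which is exactly what the VSET_USE_* macros wrap. */
    if (__builtin_cpu_supports("avx512f") && __builtin_cpu_supports("avx512bw"))
        puts("Q8 AVX512 kernel eligible");
    else if (__builtin_cpu_supports("avx2"))
        puts("Q8 AVX2 kernel eligible");
    else
        puts("Q8 scalar fallback");
    return 0;
}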
@@ -347,8 +347,155 @@ float vectors_distance_float(const float *x, const float *y, uint32_t dim) {
}

/* Q8 quants dotproduct. We do integer math and later fix it by range. */
#if defined(HAVE_AVX512)
/* AVX512 optimized dot product for Q8 vectors */
ATTRIBUTE_TARGET_AVX512
float vectors_distance_q8_avx512(const int8_t *x, const int8_t *y, uint32_t dim,
                                 float range_a, float range_b) {
    // Handle zero vectors special case.
    if (range_a == 0 || range_b == 0) {
        return 1.0f;
    }

    const float scale_product = (range_a/127) * (range_b/127);
    __m512i sum = _mm512_setzero_si512();
    uint32_t i;

    /* Process 64 int8 elements at a time with AVX512 */
    for (i = 0; i + 63 < dim; i += 64) {
        /* Load 64 int8 values */
        __m512i vx = _mm512_loadu_si512((__m512i*)&x[i]);
        __m512i vy = _mm512_loadu_si512((__m512i*)&y[i]);

        /* Unpack and multiply-add in 32-bit precision
         * This is done in two steps: lower 32 bytes and upper 32 bytes */

        /* Process lower 32 bytes (256 bits) */
        __m256i vx_lo = _mm512_extracti64x4_epi64(vx, 0);
        __m256i vy_lo = _mm512_extracti64x4_epi64(vy, 0);

        /* Extend int8 to int16 */
        __m512i vx_lo_16 = _mm512_cvtepi8_epi16(vx_lo);
        __m512i vy_lo_16 = _mm512_cvtepi8_epi16(vy_lo);

        /* Multiply and accumulate to int32 */
        __m512i prod_lo = _mm512_madd_epi16(vx_lo_16, vy_lo_16);
        sum = _mm512_add_epi32(sum, prod_lo);

        /* Process upper 32 bytes (256 bits) */
        __m256i vx_hi = _mm512_extracti64x4_epi64(vx, 1);
        __m256i vy_hi = _mm512_extracti64x4_epi64(vy, 1);

        __m512i vx_hi_16 = _mm512_cvtepi8_epi16(vx_hi);
        __m512i vy_hi_16 = _mm512_cvtepi8_epi16(vy_hi);

        __m512i prod_hi = _mm512_madd_epi16(vx_hi_16, vy_hi_16);
        sum = _mm512_add_epi32(sum, prod_hi);
    }

    /* Horizontal sum of the 16 int32 elements in sum */
    int32_t dot = _mm512_reduce_add_epi32(sum);

    /* Handle remaining elements */
    for (; i < dim; i++) {
        dot += ((int32_t)x[i]) * ((int32_t)y[i]);
    }

    /* Convert to original range */
    float dotf = dot * scale_product;
    float distance = 1.0f - dotf;

    /* Clamp distance to [0, 2] */
    if (distance < 0) distance = 0;
    else if (distance > 2) distance = 2;
    return distance;
}
#endif /* HAVE_AVX512 */

#if defined(HAVE_AVX2)
/* AVX2 optimized dot product for Q8 vectors */
ATTRIBUTE_TARGET_AVX2
float vectors_distance_q8_avx2(const int8_t *x, const int8_t *y, uint32_t dim,
                               float range_a, float range_b) {
    // Handle zero vectors special case.
    if (range_a == 0 || range_b == 0) {
        return 1.0f;
    }

    const float scale_product = (range_a/127) * (range_b/127);
    __m256i sum = _mm256_setzero_si256();
    uint32_t i;

    /* Process 32 int8 elements at a time with AVX2 */
    for (i = 0; i + 31 < dim; i += 32) {
        /* Load 32 int8 values */
        __m256i vx = _mm256_loadu_si256((__m256i*)&x[i]);
        __m256i vy = _mm256_loadu_si256((__m256i*)&y[i]);

        /* Split into lower and upper 16 bytes */
        __m128i vx_lo = _mm256_extracti128_si256(vx, 0);
        __m128i vy_lo = _mm256_extracti128_si256(vy, 0);
        __m128i vx_hi = _mm256_extracti128_si256(vx, 1);
        __m128i vy_hi = _mm256_extracti128_si256(vy, 1);

        /* Extend int8 to int16 for lower half */
        __m256i vx_lo_16 = _mm256_cvtepi8_epi16(vx_lo);
        __m256i vy_lo_16 = _mm256_cvtepi8_epi16(vy_lo);

        /* Multiply and accumulate (madd does multiply adjacent pairs and add) */
        __m256i prod_lo = _mm256_madd_epi16(vx_lo_16, vy_lo_16);
        sum = _mm256_add_epi32(sum, prod_lo);

        /* Extend int8 to int16 for upper half */
        __m256i vx_hi_16 = _mm256_cvtepi8_epi16(vx_hi);
        __m256i vy_hi_16 = _mm256_cvtepi8_epi16(vy_hi);

        __m256i prod_hi = _mm256_madd_epi16(vx_hi_16, vy_hi_16);
        sum = _mm256_add_epi32(sum, prod_hi);
    }

    /* Horizontal sum of the 8 int32 elements in sum */
    __m128i sum_hi = _mm256_extracti128_si256(sum, 1);
    __m128i sum_lo = _mm256_castsi256_si128(sum);
    __m128i sum_128 = _mm_add_epi32(sum_hi, sum_lo);

    sum_128 = _mm_hadd_epi32(sum_128, sum_128);
    sum_128 = _mm_hadd_epi32(sum_128, sum_128);

    int32_t dot = _mm_cvtsi128_si32(sum_128);

    /* Handle remaining elements */
    for (; i < dim; i++) {
        dot += ((int32_t)x[i]) * ((int32_t)y[i]);
    }

    /* Convert to original range */
    float dotf = dot * scale_product;
    float distance = 1.0f - dotf;

    /* Clamp distance to [0, 2] */
    if (distance < 0) distance = 0;
    else if (distance > 2) distance = 2;
    return distance;
}
#endif /* HAVE_AVX2 */

/* Q8 dot product: automatically selects best available implementation */
float vectors_distance_q8(const int8_t *x, const int8_t *y, uint32_t dim,
                          float range_a, float range_b) {
#if defined(HAVE_AVX512)
    if (dim >= 64 && VSET_USE_AVX512) {
        return vectors_distance_q8_avx512(x, y, dim, range_a, range_b);
    }
#endif

#if defined(HAVE_AVX2)
    if (dim >= 32 && VSET_USE_AVX2) {
        return vectors_distance_q8_avx2(x, y, dim, range_a, range_b);
    }
#endif

    /* Fallback to scalar implementation */
    // Handle zero vectors special case.
    if (range_a == 0 || range_b == 0) {
        /* Zero vector distance from anything is 1.0
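One thing the diff does not spell out is why the 32-bit accumulator lanes themselves cannot overflow. In the AVX512 kernel above, each 32-bit lane of `sum` absorbs two _mm512_madd_epi16 results per 64-element iteration, and each madd lane is already the sum of two int16 products, so at most four products of 127*127 land in a lane per iteration. A quick back-of-the-envelope check (not part of the patch):

#include <stdint.h>
#include <stdio.h>

int main(void) {
    const int64_t max_product = 127 * 127;        /* extreme Q8 component product */
    const int64_t products_per_lane_per_iter = 4; /* two madd results, two pairs each */
    for (uint32_t dim = 64; dim <= (1u << 20); dim <<= 1) {
        int64_t worst_lane = (int64_t)(dim / 64) * products_per_lane_per_iter * max_product;
        printf("dim=%8u  worst per-lane sum=%12lld  (INT32_MAX=%d)\n",
               (unsigned)dim, (long long)worst_lane, INT32_MAX);
    }
    return 0;
}

Even at a million dimensions the worst per-lane sum is about 1.06e9, comfortably below INT32_MAX, which is why a plain _mm512_add_epi32 accumulator is safe here.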
modules/vector-sets/tests/q8_similarity.py (new file, 71 lines)
from test import TestCase

class Q8Similarity(TestCase):
    def getname(self):
        return "Q8 quantization: VSIM reported distance makes sense with 4D vectors"

    def test(self):
        # Add two very similar vectors, one different
        # Using same test vectors as basic_similarity.py for comparison
        vec1 = [1, 0, 0, 0]
        vec2 = [0.99, 0.01, 0, 0]
        vec3 = [0.1, 1, -1, 0.5]

        # Add vectors using VALUES format with Q8 quantization
        self.redis.execute_command('VADD', self.test_key, 'VALUES', 4,
                                   *[str(x) for x in vec1], f'{self.test_key}:item:1', 'Q8')
        self.redis.execute_command('VADD', self.test_key, 'VALUES', 4,
                                   *[str(x) for x in vec2], f'{self.test_key}:item:2', 'Q8')
        self.redis.execute_command('VADD', self.test_key, 'VALUES', 4,
                                   *[str(x) for x in vec3], f'{self.test_key}:item:3', 'Q8')

        # Query similarity with vec1
        result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
                                            *[str(x) for x in vec1], 'WITHSCORES')

        # Convert results to dictionary
        results_dict = {}
        for i in range(0, len(result), 2):
            key = result[i].decode()
            score = float(result[i+1])
            results_dict[key] = score

        # Verify results (same expectations as float32, allowing for quantization error)
        assert results_dict[f'{self.test_key}:item:1'] > 0.99, "Self-similarity should be very high (Q8)"
        assert results_dict[f'{self.test_key}:item:2'] > 0.99, "Similar vector should have high similarity (Q8)"
        assert results_dict[f'{self.test_key}:item:3'] < 0.80, "Not very similar vector should have low similarity (Q8)"

        # Test extreme values with 512 dimensions to stress-test overflow safety
        vec4 = [1.0] * 512        # All +127 after quantization
        vec5 = [-1.0] * 512       # All -127 after quantization
        vec6 = [1.0, -1.0] * 256  # Alternating +127, -127

        # Add vectors using VALUES format with Q8 quantization
        self.redis.execute_command('VADD', f'{self.test_key}:extreme', 'VALUES', 512,
                                   *[str(x) for x in vec4], f'{self.test_key}:extreme:vec4', 'Q8')
        self.redis.execute_command('VADD', f'{self.test_key}:extreme', 'VALUES', 512,
                                   *[str(x) for x in vec5], f'{self.test_key}:extreme:vec5', 'Q8')
        self.redis.execute_command('VADD', f'{self.test_key}:extreme', 'VALUES', 512,
                                   *[str(x) for x in vec6], f'{self.test_key}:extreme:vec6', 'Q8')

        # Query vec4 against itself - worst-case positive accumulation (512 * 127 * 127 = 8,258,048)
        result_vec4 = self.redis.execute_command('VSIM', f'{self.test_key}:extreme', 'VALUES', 512,
                                                 *[str(x) for x in vec4], 'WITHSCORES')
        results_vec4 = {}
        for i in range(0, len(result_vec4), 2):
            key = result_vec4[i].decode()
            score = float(result_vec4[i+1])
            results_vec4[key] = score

        # Verify extreme value handling
        # VSIM returns similarity = 1.0 - distance/2.0, so:
        # - Distance 0 (identical) → similarity 1.0
        # - Distance 2 (opposite) → similarity 0.0
        assert results_vec4[f'{self.test_key}:extreme:vec4'] > 0.999, \
            f"vec4 self-similarity should be very high, got {results_vec4[f'{self.test_key}:extreme:vec4']}"
        assert results_vec4[f'{self.test_key}:extreme:vec5'] < 0.01, \
            f"vec4 vs vec5 (opposite extremes) should be near 0, got {results_vec4[f'{self.test_key}:extreme:vec5']}"

        # Alternating pattern should result in mid-range similarity (perpendicular)
        assert 0.4 < results_vec4[f'{self.test_key}:extreme:vec6'] < 0.6, \
            f"vec4 vs vec6 (alternating) should be near 0.5, got {results_vec4[f'{self.test_key}:extreme:vec6']}"
modules/vector-sets/tests/q8_vectorization.py (new file, 85 lines)
from test import TestCase

class Q8Vectorization(TestCase):
    def getname(self):
        return "Q8 quantization: verify vectorized vs scalar paths produce consistent results"

    def test(self):
        # Test with different dimensions to exercise different code paths and boundaries:
        # - dim=16: Scalar path (< 32)
        # - dim=31: Largest scalar-only dimension (boundary)
        # - dim=32: Smallest AVX2 dimension, no remainder (boundary)
        # - dim=33: AVX2 with 1-element remainder
        # - dim=63: AVX2 with 31-element remainder (largest AVX2-only)
        # - dim=64: Smallest AVX512 dimension, no remainder (boundary)
        # - dim=65: AVX512 with 1-element remainder
        # - dim=128: AVX512 path with no remainder
        # - dim=256, dim=512: Large dimensions to test overflow prevention

        test_dims = [16, 31, 32, 33, 63, 64, 65, 128, 256, 512]

        for dim in test_dims:
            key = f'{self.test_key}:dim{dim}'

            # Test vectors with extreme values to verify overflow prevention:
            # vec1: all +1.0 -> quantizes to +127 (max positive int8)
            # vec2: all +0.99 -> quantizes to ~+126 (similar to vec1)
            # vec3: all -1.0 -> quantizes to -127/-128 (max negative int8)
            # vec4: alternating +1.0/-1.0 -> alternating +127/-127 (tests mixed signs)
            vec1 = [1.0] * dim   # All max positive
            vec2 = [0.99] * dim  # Similar to vec1
            vec3 = [-1.0] * dim  # All max negative (opposite direction)
            vec4 = [1.0 if i % 2 == 0 else -1.0 for i in range(dim)]  # Alternating extreme values

            # Add vectors with Q8 quantization
            self.redis.execute_command('VADD', key, 'VALUES', dim,
                                       *[str(x) for x in vec1], f'{key}:item:1', 'Q8')
            self.redis.execute_command('VADD', key, 'VALUES', dim,
                                       *[str(x) for x in vec2], f'{key}:item:2', 'Q8')
            self.redis.execute_command('VADD', key, 'VALUES', dim,
                                       *[str(x) for x in vec3], f'{key}:item:3', 'Q8')
            self.redis.execute_command('VADD', key, 'VALUES', dim,
                                       *[str(x) for x in vec4], f'{key}:item:4', 'Q8')

            # Query similarity using vec1 (all max positive values)
            # This exercises worst-case positive accumulation: dim * 127 * 127
            result = self.redis.execute_command('VSIM', key, 'VALUES', dim,
                                                *[str(x) for x in vec1], 'WITHSCORES')

            # Convert results to dictionary
            results_dict = {}
            for i in range(0, len(result), 2):
                k = result[i].decode()
                score = float(result[i+1])
                results_dict[k] = score

            # Verify results - these would be wrong if overflow occurred
            # Self-similarity should be ~1.0 (identical vectors)
            assert results_dict[f'{key}:item:1'] > 0.99, \
                f"Dim {dim}: Self-similarity too low: {results_dict[f'{key}:item:1']}"

            # Similar vector should have high similarity
            assert results_dict[f'{key}:item:2'] > 0.99, \
                f"Dim {dim}: Similar vector similarity too low: {results_dict[f'{key}:item:2']}"

            # Opposite vector should have very low similarity (~0.0)
            # With overflow bug, this could give incorrect positive values
            assert results_dict[f'{key}:item:3'] < 0.1, \
                f"Dim {dim}: Opposite vector similarity too high: {results_dict[f'{key}:item:3']}"

            # Alternating vector: dot product sums to ~0, so similarity ~0.5
            # (127*127) + (127*-127) + ... = 0, normalized gives ~0.5
            assert 0.4 < results_dict[f'{key}:item:4'] < 0.6, \
                f"Dim {dim}: Alternating vector similarity unexpected: {results_dict[f'{key}:item:4']}"

            # Also query with the alternating pattern to verify its self-similarity
            result_alt = self.redis.execute_command('VSIM', key, 'VALUES', dim,
                                                    *[str(x) for x in vec4], 'WITHSCORES')
            results_alt = {}
            for i in range(0, len(result_alt), 2):
                k = result_alt[i].decode()
                score = float(result_alt[i+1])
                results_alt[k] = score

            assert results_alt[f'{key}:item:4'] > 0.99, \
                f"Dim {dim}: Alternating self-similarity too low: {results_alt[f'{key}:item:4']}"