Vectorize binary quantization path for vectorsets distance calculation (#14492)
Some checks failed
CI / test-ubuntu-latest (push) Waiting to run
CI / test-sanitizer-address (push) Waiting to run
CI / build-debian-old (push) Waiting to run
CI / build-macos-latest (push) Waiting to run
CI / build-32bit (push) Waiting to run
CI / build-libc-malloc (push) Waiting to run
CI / build-centos-jemalloc (push) Waiting to run
CI / build-old-chain-jemalloc (push) Waiting to run
Codecov / code-coverage (push) Waiting to run
External Server Tests / test-external-standalone (push) Waiting to run
External Server Tests / test-external-cluster (push) Waiting to run
External Server Tests / test-external-nodebug (push) Waiting to run
Spellcheck / Spellcheck (push) Waiting to run
Reply-schemas linter / reply-schemas-linter (push) Has been cancelled

This PR adds SIMD vectorization for binary quantization distance
calculation, similar to PR #14474.

---------

Co-authored-by: debing.sun <debing.sun@redis.com>
This commit is contained in:
Martin Dimitrov 2026-01-29 04:59:48 -07:00 committed by GitHub
parent ca681f997e
commit 0024d5dfde
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 168 additions and 16 deletions

View file

@@ -54,7 +54,7 @@ performed in the background, while the command is executed in the main thread.
`NOQUANT` forces the vector to be created (in the first VADD call to a given key) without integer 8 quantization, which is otherwise the default.
`BIN` forces the vector to use binary quantization instead of int8. This is much faster and uses less memory, but has impacts on the recall quality.
`BIN` forces the vector to use binary quantization instead of int8. This is much faster and uses less memory, but has impacts on the recall quality. The distance is computed as normalized Hamming distance (`hamming_bits * 2 / dim`), yielding values in [0, 2] consistent with cosine distance semantics, not raw Hamming bit counts.
`Q8` forces the vector to use signed 8 bit quantization. This is the default, and the option only exists in order to make sure to check at insertion time if the vector set is of the same format.

View file

@@ -47,40 +47,45 @@
#include "hnsw.h"
#include "mixer.h"
/* NOTE(review): this section appears to contain BOTH the old and the new
 * variant of the POPCNT feature-detection macros (diff residue): the first
 * block below defines HAVE_POPCNT/ATTRIBUTE_TARGET_POPCNT, and the second,
 * extended block redefines them alongside HAVE_AVX2/HAVE_AVX512. Keeping
 * both would cause macro redefinitions — confirm against the real source
 * that only the second block is meant to remain. */
/* Define HAVE_POPCNT if the compiler supports the target("popcnt") attribute */
#if defined(__x86_64__) && ((defined(__GNUC__) && __GNUC__ >= 5) || (defined(__clang__)))
#if defined(__has_attribute) && __has_attribute(target)
#define HAVE_POPCNT
#define ATTRIBUTE_TARGET_POPCNT __attribute__((target("popcnt")))
#else
#define ATTRIBUTE_TARGET_POPCNT
#endif
#else
#define ATTRIBUTE_TARGET_POPCNT
#endif
/* Check if we can compile SIMD code with function attributes */
/* Check if we can compile SIMD code with function attributes.
 * This defines HAVE_AVX2, HAVE_AVX512, and HAVE_POPCNT when the compiler
 * supports the target() attribute for runtime CPU feature dispatch. */
#if defined(__x86_64__) && ((defined(__GNUC__) && __GNUC__ >= 5) || (defined(__clang__) && __clang_major__ >= 4))
#if defined(__has_attribute) && __has_attribute(target)
#define HAVE_AVX2
#define HAVE_AVX512
#define HAVE_POPCNT
#endif
#endif
/* POPCNT: compile-time support gates the attribute; VSET_USE_POPCNT is the
 * runtime check (evaluated via __builtin_cpu_supports on capable builds). */
#if defined(HAVE_POPCNT)
#define ATTRIBUTE_TARGET_POPCNT __attribute__((target("popcnt")))
#define VSET_USE_POPCNT __builtin_cpu_supports("popcnt")
#else
#define ATTRIBUTE_TARGET_POPCNT
#define VSET_USE_POPCNT 0
#endif
/* AVX2: two attribute flavors, with and without popcnt added to the target. */
#if defined(HAVE_AVX2)
#define ATTRIBUTE_TARGET_AVX2 __attribute__((target("avx2,fma")))
#define ATTRIBUTE_TARGET_AVX2_POPCNT __attribute__((target("avx2,fma,popcnt")))
#define VSET_USE_AVX2 (__builtin_cpu_supports("avx2") && __builtin_cpu_supports("fma"))
#else
#define ATTRIBUTE_TARGET_AVX2
#define ATTRIBUTE_TARGET_AVX2_POPCNT
#define VSET_USE_AVX2 0
#endif
/* AVX-512: the VPOPCNT flavor additionally requires the AVX512VPOPCNTDQ
 * extension at runtime (checked separately by VSET_USE_AVX512_VPOPCNT). */
#if defined (HAVE_AVX512)
#define ATTRIBUTE_TARGET_AVX512 __attribute__((target("avx512f,avx512bw,fma")))
#define ATTRIBUTE_TARGET_AVX512_VPOPCNT __attribute__((target("avx512f,fma,avx512vpopcntdq,popcnt")))
#define VSET_USE_AVX512 (__builtin_cpu_supports("avx512f") && __builtin_cpu_supports("avx512bw"))
#define VSET_USE_AVX512_VPOPCNT (__builtin_cpu_supports("avx512f") && __builtin_cpu_supports("avx512vpopcntdq"))
#else
#define ATTRIBUTE_TARGET_AVX512
#define ATTRIBUTE_TARGET_AVX512_VPOPCNT
#define VSET_USE_AVX512 0
#define VSET_USE_AVX512_VPOPCNT 0
#endif
/* Include SIMD headers when supported */
@@ -606,9 +611,102 @@ float vectors_distance_q8(const int8_t *x, const int8_t *y, uint32_t dim,
return distance;
}
/* Binary vectors distance. */
#if defined(HAVE_AVX512) && defined(HAVE_POPCNT)
/* Binary (1-bit) distance using AVX-512 with the VPOPCNTDQ extension.
 *
 * Each iteration XORs 512 bits (8 uint64_t words) of the two bit-vectors
 * and counts the differing bits with the hardware _mm512_popcnt_epi64
 * per-lane popcount. Words that do not fill a complete 512-bit lane are
 * handled with scalar popcount at the end.
 *
 * Returns the normalized Hamming distance: differing_bits * 2 / dim. */
ATTRIBUTE_TARGET_AVX512_VPOPCNT
static float vectors_distance_bin_avx512_vpopcnt(const uint64_t *x, const uint64_t *y, uint32_t dim) {
    uint32_t words = (dim+63)/64;   /* 64-bit words holding the dim bits. */
    uint32_t diffbits = 0;          /* Count of bits that differ. */
    uint32_t i = 0;

    /* Vector part: 8 words (512 bits) per step, accumulating per-lane
     * popcounts into 8 parallel 64-bit counters. */
    __m512i acc = _mm512_setzero_si512();
    for (; i + 8 <= words; i += 8) {
        __m512i a = _mm512_loadu_si512((__m512i*)&x[i]);
        __m512i b = _mm512_loadu_si512((__m512i*)&y[i]);
        /* Hardware popcount of each XORed lane (AVX512VPOPCNTDQ). */
        acc = _mm512_add_epi64(acc, _mm512_popcnt_epi64(_mm512_xor_si512(a, b)));
    }
    /* Fold the 8 lane counters into one scalar (zero if no full lane ran). */
    diffbits = _mm512_reduce_add_epi64(acc);

    /* Scalar tail for the remaining words. */
    while (i < words) {
        diffbits += __builtin_popcountll(x[i] ^ y[i]);
        i++;
    }

    /* Normalize to [0, 2], consistent with cosine distance semantics. */
    return (float)diffbits * 2.0f / dim;
}
#endif
#if defined(HAVE_AVX2) && defined(HAVE_POPCNT)
/* Binary (1-bit) distance using AVX2: XOR 256 bits (4 uint64_t words) per
 * iteration, spill the XORed lanes to memory, and count differing bits with
 * the scalar POPCNT instruction (AVX2 has no vector popcount).
 *
 * Returns the normalized Hamming distance: differing_bits * 2 / dim. */
ATTRIBUTE_TARGET_AVX2_POPCNT
static float vectors_distance_bin_avx2(const uint64_t *x, const uint64_t *y, uint32_t dim) {
    uint32_t words = (dim+63)/64;   /* 64-bit words holding the dim bits. */
    uint32_t diffbits = 0;          /* Count of bits that differ. */
    uint32_t i = 0;

    /* Vector part: 4 words (256 bits) per step. */
    for (; i + 4 <= words; i += 4) {
        __m256i a = _mm256_loadu_si256((__m256i*)&x[i]);
        __m256i b = _mm256_loadu_si256((__m256i*)&y[i]);
        uint64_t lanes[4];
        _mm256_storeu_si256((__m256i*)lanes, _mm256_xor_si256(a, b));
        /* Scalar hardware popcount on each XORed lane. */
        diffbits += __builtin_popcountll(lanes[0]) +
                    __builtin_popcountll(lanes[1]) +
                    __builtin_popcountll(lanes[2]) +
                    __builtin_popcountll(lanes[3]);
    }

    /* Scalar tail for the remaining words. */
    while (i < words) {
        diffbits += __builtin_popcountll(x[i] ^ y[i]);
        i++;
    }

    /* Normalize to [0, 2], consistent with cosine distance semantics. */
    return (float)diffbits * 2.0f / dim;
}
#endif
/* Binary vectors distance: dispatch to the fastest implementation that both
 * the build and the running CPU support, in decreasing order of preference
 * (AVX-512+VPOPCNTDQ, then AVX2+POPCNT, then the scalar fallback).
 * The dim thresholds keep SIMD setup cost off small vectors. */
ATTRIBUTE_TARGET_POPCNT
float vectors_distance_bin(const uint64_t *x, const uint64_t *y, uint32_t dim) {
#if defined(HAVE_AVX512) && defined(HAVE_POPCNT)
    /* Large vectors: AVX-512 with the VPOPCNTDQ extension. */
    if (dim >= 512 && VSET_USE_AVX512_VPOPCNT)
        return vectors_distance_bin_avx512_vpopcnt(x, y, dim);
#endif
#if defined(HAVE_AVX2) && defined(HAVE_POPCNT)
    /* Medium vectors: AVX2, 4 uint64_t (256 bits) per iteration. */
    if (dim >= 256 && VSET_USE_AVX2 && VSET_USE_POPCNT)
        return vectors_distance_bin_avx2(x, y, dim);
#endif
    /* Scalar fallback with its own runtime POPCNT detection. */
    return hnsw_vectors_distance_bin(x, y, dim);
}

View file

@@ -0,0 +1,54 @@
from test import TestCase
class BinVectorization(TestCase):
    """Checks that the SIMD and scalar binary-distance paths agree by
    exercising dimensions that route to each dispatch branch."""

    def getname(self):
        return "Binary quantization: verify vectorized vs scalar paths produce consistent results"

    def test(self):
        # Dimensions chosen to hit each dispatch path of the binary
        # distance code:
        #   1, 64, 128 -> scalar path (dim < 256); 64 is the exact
        #                 one-uint64_t alignment boundary, 1 the minimum.
        #   384        -> AVX2 path when available (256 <= dim < 512)
        #   768        -> AVX-512 path when available (dim >= 512)
        # dim=0 is skipped: invalid input (division by zero).
        for dim in [1, 64, 128, 384, 768]:
            key = f'{self.test_key}:dim{dim}'
            ones = [1.0] * dim       # reference vector
            near = [0.99] * dim      # almost identical to the reference
            inverse = [-1.0] * dim   # opposite direction, low similarity

            # Insert the three vectors with binary quantization enabled.
            for idx, vec in enumerate((ones, near, inverse), start=1):
                self.redis.execute_command(
                    'VADD', key, 'VALUES', dim,
                    *[str(x) for x in vec], f'{key}:item:{idx}', 'BIN')

            # Query by the reference vector and collect name -> score.
            reply = self.redis.execute_command(
                'VSIM', key, 'VALUES', dim,
                *[str(x) for x in ones], 'WITHSCORES')
            scores = {reply[i].decode(): float(reply[i + 1])
                      for i in range(0, len(reply), 2)}

            # Self-similarity must be near-perfect even under binary
            # quantization's precision loss.
            assert scores[f'{key}:item:1'] > 0.99, \
                f"Dim {dim}: Self-similarity too low: {scores[f'{key}:item:1']}"
            # The nearly identical vector should still score very high.
            assert scores[f'{key}:item:2'] > 0.95, \
                f"Dim {dim}: Similar vector similarity too low: {scores[f'{key}:item:2']}"
            # The opposite vector should score near zero.
            assert scores[f'{key}:item:3'] < 0.1, \
                f"Dim {dim}: Opposite vector similarity too high: {scores[f'{key}:item:3']}"