Add 'modules/vector-sets/' from commit 'c6db0a7c20ff5638f3a0c9ce9c106303daeb2f67'

git-subtree-dir: modules/vector-sets
git-subtree-mainline: 8ea8f4220c
git-subtree-split: c6db0a7c20
This commit is contained in:
YaacovHazan 2025-04-02 16:34:28 +03:00
commit 78e0d87177
41 changed files with 14928 additions and 0 deletions

11
modules/vector-sets/.gitignore vendored Normal file
@@ -0,0 +1,11 @@
__pycache__
misc
*.so
*.xo
*.o
.DS_Store
w2v
word2vec.bin
TODO
*.txt
*.rdb

@@ -0,0 +1,2 @@
This code is Copyright (c) 2024-Present, Redis Ltd.
All Rights Reserved.

@@ -0,0 +1,84 @@
# Compiler settings
CC = cc
ifdef SANITIZER
ifeq ($(SANITIZER),address)
SAN=-fsanitize=address
else
ifeq ($(SANITIZER),undefined)
SAN=-fsanitize=undefined
else
ifeq ($(SANITIZER),thread)
SAN=-fsanitize=thread
else
$(error "unknown sanitizer=${SANITIZER}")
endif
endif
endif
endif
CFLAGS = -O2 -Wall -Wextra -g $(SAN) -std=c11
LDFLAGS = -lm $(SAN)
# Detect OS
uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not')
uname_M := $(shell sh -c 'uname -m 2>/dev/null || echo not')
# Shared library compile flags for linux / osx
ifeq ($(uname_S),Linux)
SHOBJ_CFLAGS ?= -W -Wall -fno-common -g -ggdb -std=c11 -O2
SHOBJ_LDFLAGS ?= -shared
ifneq (,$(findstring armv,$(uname_M)))
SHOBJ_LDFLAGS += -latomic
endif
ifneq (,$(findstring aarch64,$(uname_M)))
SHOBJ_LDFLAGS += -latomic
endif
else
SHOBJ_CFLAGS ?= -W -Wall -dynamic -fno-common -g -ggdb -std=c11 -O3
SHOBJ_LDFLAGS ?= -bundle -undefined dynamic_lookup
endif
# OS X 11.x doesn't have /usr/lib/libSystem.dylib and needs an explicit setting.
ifeq ($(uname_S),Darwin)
ifeq ("$(wildcard /usr/lib/libSystem.dylib)","")
LIBS = -L /Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/lib -lsystem
endif
endif
.SUFFIXES: .c .so .xo .o
all: vset.so

.c.xo:
	$(CC) -I. $(CFLAGS) $(SHOBJ_CFLAGS) -fPIC -c $< -o $@

vset.xo: redismodule.h expr.c

vset.so: vset.xo hnsw.xo cJSON.xo
	$(CC) -o $@ $^ $(SHOBJ_LDFLAGS) $(LIBS) $(SAN) -lc

# Example sources / objects
SRCS = hnsw.c w2v.c
OBJS = $(SRCS:.c=.o)
TARGET = w2v
MODULE = vset.so

# Default target
all: $(TARGET) $(MODULE)

# Example linking rule
$(TARGET): $(OBJS)
	$(CC) $(OBJS) $(LDFLAGS) -o $(TARGET)

# Compilation rule for object files
%.o: %.c
	$(CC) $(CFLAGS) -c $< -o $@

# Clean rule
clean:
	rm -f $(TARGET) $(OBJS) *.xo *.so

# Declare phony targets
.PHONY: all clean

@@ -0,0 +1,633 @@
This module implements Vector Sets for Redis, a new Redis data type similar
to Sorted Sets but having string elements associated with a vector instead of
a score. The fundamental goal of Vector Sets is to make it possible to add items,
and later retrieve a subset of the added items that are the most similar to a
specified vector (often a learned embedding), or the most similar to the vector
of an element that is already part of the Vector Set.
Moreover, Vector Sets implement optional filtered search capabilities: it is possible to associate attributes with all or a subset of the elements in the set, and then, using the `FILTER` option of the `VSIM` command, ask for items similar to a given vector that also pass a filter specified as a simple mathematical expression (like `".year > 1950"`). This means that **you can have vector similarity and scalar filters at the same time**.
## Installation
Build with:
make
Then load the module with the following command line, or by inserting the needed directives in the `redis.conf` file.
./redis-server --loadmodule vset.so
To run tests, I suggest using this:
./redis-server --save "" --enable-debug-command yes
Then execute the tests with:
./test.py
## Reference of available commands
**VADD: add items into a vector set**
VADD key [REDUCE dim] FP32|VALUES vector element [CAS] [NOQUANT | Q8 | BIN]
[EF build-exploration-factor] [SETATTR <attributes>] [M <numlinks>]
Add a new element into the vector set specified by the key.
The vector can be provided as an FP32 blob of values, or as floating point
numbers as strings, prefixed by the number of elements (3 in the example):
VADD mykey VALUES 3 0.1 1.2 0.5 my-element
Meaning of the options:
`REDUCE` implements random projection, in order to reduce the
dimensionality of the vector. The projection matrix is saved and reloaded
along with the vector set. **Please note that** the `REDUCE` option must be passed immediately before the vector, like in `REDUCE 50 VALUES ...`.
`CAS` performs the operation partially using threads, in a
check-and-set style. The neighbor candidates collection, which is slow, is
performed in the background, while the command is executed in the main thread.
`NOQUANT` forces the vector to be created (in the first VADD call to a given key) without int8 quantization, which is otherwise the default.
`BIN` forces the vector to use binary quantization instead of int8. This is much faster and uses less memory, but impacts the recall quality.
`Q8` forces the vector to use signed 8-bit quantization. This is the default, and the option exists only to assert, at insertion time, that the vector set uses the same format.
`EF` plays a role in the effort made to find good candidates when connecting the new node to the existing HNSW graph. The default is 200. Using a larger value may help to achieve a better recall. To improve the recall it is also possible to increase `EF` during `VSIM` searches.
`SETATTR` associates attributes with the newly created entry, or updates the attributes if the entry already exists. It is the same as calling the `VSETATTR` command separately, so please check the documentation of that command in the filtered search section of this documentation.
`M` defaults to 16 and is the famous HNSW `M` parameter. It is the maximum number of connections that each node of the graph has with other nodes: more connections mean more memory, but a better ability to explore the graph. Nodes at layer zero (every node exists at least at layer zero) have `M*2` connections, while the other layers only have `M` connections. This means that, for instance, an `M` of 64 will use at least 1024 bytes of memory for each node! That is, `64 links * 2 times * 8 bytes pointers`, and even more, since on average each node has something like 1.33 layers (but the other layers have just `M` connections, instead of `M*2`). If you don't have a recall quality problem, the default is fine, and uses a limited amount of memory.
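As a minimal sketch of calling `VADD` from a client (assuming redis-py; the key names and attribute values are just examples):
```
import redis

r = redis.Redis(decode_responses=True)

vec = [0.1, 1.2, 0.5]
# VALUES form: the dimension first, then the components.
r.execute_command("VADD", "mykey", "VALUES", len(vec), *vec, "my-element")

# Same insertion with some of the options described above.
r.execute_command(
    "VADD", "mykey2",
    "REDUCE", 2,                  # random projection down to 2 dimensions
    "VALUES", len(vec), *vec,
    "my-element",
    "SETATTR", '{"year": 1984}',  # attach JSON attributes
    "M", 32,                      # denser graph: better recall, more memory
)
```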
**VSIM: return elements by vector similarity**
VSIM key [ELE|FP32|VALUES] <vector or element> [WITHSCORES] [COUNT num] [EF search-exploration-factor] [FILTER expression] [FILTER-EF max-filtering-effort] [TRUTH] [NOTHREAD]
The command returns similar vectors. For simplicity, in the following example, instead of providing a vector using FP32 or VALUES (as in `VADD`), we ask for elements having a vector similar to that of an element already in the vector set:
> VSIM word_embeddings ELE apple
1) "apple"
2) "apples"
3) "pear"
4) "fruit"
5) "berry"
6) "pears"
7) "strawberry"
8) "peach"
9) "potato"
10) "grape"
It is possible to specify a `COUNT` and also to get the similarity score (from 1 to 0, where 1 means identical vectors and 0 means opposite vectors) between the query and the returned items.
> VSIM word_embeddings ELE apple WITHSCORES COUNT 3
1) "apple"
2) "0.9998867657923256"
3) "apples"
4) "0.8598527610301971"
5) "pear"
6) "0.8226882219314575"
The `EF` argument is the exploration factor: the higher it is, the slower the command becomes, but the better the index is explored to find nodes that are near to our query. Sensible values are from 50 to 1000.
The `TRUTH` option forces the command to perform a linear scan of all the entries inside the set, without using the graph search inside the HNSW, so it returns the best matching elements (the perfect result set) that can be used in order to easily calculate the recall. Of course the linear scan is `O(N)`, so it is much slower than the `log(N)` (considering a small `COUNT`) provided by the HNSW index.
The `NOTHREAD` option forces the command to execute the search on the data structure in the main thread. Normally `VSIM` spawns a thread instead. This may be useful for benchmarking purposes, or when we work with extremely small vector sets and don't want to pay the cost of spawning a thread. It is possible that in the future this option will be automatically used by Redis when we detect small vector sets. Note that this option blocks the server for all the time needed to complete the command, so it is a source of potential latency issues: if you are in doubt, never use it.
For `FILTER` and `FILTER-EF` options, please check the filtered search section of this documentation.
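For instance, a small sketch of issuing the query above from Python (assuming redis-py) and pairing up the flat `WITHSCORES` reply:
```
import redis

r = redis.Redis(decode_responses=True)
reply = r.execute_command(
    "VSIM", "word_embeddings", "ELE", "apple",
    "WITHSCORES", "COUNT", 3, "EF", 500,
)
# The reply alternates element and score: flatten it into pairs.
pairs = [(reply[i], float(reply[i + 1])) for i in range(0, len(reply), 2)]
print(pairs)  # e.g. [('apple', 0.9998...), ('apples', 0.8598...), ...]
```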
**VDIM: return the dimension of the vectors inside the vector set**
VDIM keyname
Example:
> VDIM word_embeddings
(integer) 300
Note that in the case of vectors that were populated using the `REDUCE`
option, for random projection, the vector set will report the size of
the projected (reduced) dimension. Yet the user should perform all the
queries using full-size vectors.
**VCARD: return the number of elements in a vector set**
VCARD key
Example:
> VCARD word_embeddings
(integer) 3000000
**VREM: remove elements from vector set**
VREM key element
Example:
> VADD vset VALUES 3 1 0 1 bar
(integer) 1
> VREM vset bar
(integer) 1
> VREM vset bar
(integer) 0
VREM does not perform tombstone / logical deletion, but will actually reclaim
the memory from the vector set, so it is safe to add and remove elements
in a vector set in the context of long running applications that continuously
update the same index.
**VEMB: return the approximated vector of an element**
VEMB key element
Example:
> VEMB word_embeddings SQL
1) "0.18208661675453186"
2) "0.08535309880971909"
3) "0.1365649551153183"
4) "-0.16501599550247192"
5) "0.14225517213344574"
... 295 more elements ...
Because vector sets perform insertion-time normalization and optional
quantization, the returned vector may be an approximation. `VEMB` takes
care of de-quantizing and de-normalizing the vector before returning it.
It is possible to ask VEMB to return raw data, that is, the internal representation used by the vector: fp32, int8, or a bitmap for binary quantization. This behavior is triggered by the `RAW` option of VEMB:
VEMB word_embedding apple RAW
In this case the return value of the command is an array of three or more elements:
1. The name of the quantization used, that is one of: "fp32", "bin", "q8".
2. A string blob containing the raw data: 4-byte fp32 floats for fp32, a bitmap for binary quantization, or an int8 byte array for q8 quantization.
3. A float representing the L2 norm of the vector before normalization. You need to multiply the components by this value if you want to de-normalize the vector for any reason.
For q8 quantization, an additional element is also returned: the quantization
range, so the integers from -127 to 127 represent (normalized) components
in the range `-range`, `+range`.
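As a hedged sketch of using this reply (assuming redis-py with binary-safe replies, and example key/element names), one can reconstruct an approximation of the original vector from a q8 `RAW` reply like this:
```
import struct
import redis

r = redis.Redis()  # no decode_responses: the blob must stay binary

reply = r.execute_command("VEMB", "word_embeddings", "apple", "RAW")
quant, blob, l2, qrange = reply               # q8 replies carry four elements
assert quant == b"q8"
ints = struct.unpack(f"{len(blob)}b", blob)   # signed int8 components
# i/127 * range undoes the quantization; multiplying by the L2 norm
# undoes the insertion-time normalization.
vec = [i / 127.0 * float(qrange) * float(l2) for i in ints]
```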
**VLINKS: introspection command that shows neighbors for a node**
VLINKS key element [WITHSCORES]
The command reports the neighbors for each level.
**VINFO: introspection command that shows info about a vector set**
VINFO key
Example:
> VINFO word_embeddings
1) quant-type
2) int8
3) vector-dim
4) (integer) 300
5) size
6) (integer) 3000000
7) max-level
8) (integer) 12
9) vset-uid
10) (integer) 1
11) hnsw-max-node-uid
12) (integer) 3000000
**VSETATTR: associate or remove the JSON attributes of elements**
VSETATTR key element "{... json ...}"
Each element of a vector set can be optionally associated with a JSON string
in order to use the `FILTER` option of `VSIM` to filter elements by scalars
(see the filtered search section for more information). This command can set,
update (if already set) or delete (if you set to an empty string) the
associated JSON attributes of an element.
The command returns 0 if the element or the key does not exist, without
raising an error; otherwise 1 is returned, and the element attributes
are set or updated.
**VGETATTR: retrieve the JSON attributes of elements**
VGETATTR key element
The command returns the JSON attribute associated with an element, or
null if the element has no associated attributes, if the element does not
exist, or if the key does not exist.
**VRANDMEMBER: return random members from a vector set**
VRANDMEMBER key [count]
Return one or more random elements from a vector set.
The semantics of this command are similar to Redis's native SRANDMEMBER command:
- When called without count, returns a single random element from the set, as a single string (no array reply).
- When called with a positive count, returns up to count distinct random elements (no duplicates).
- When called with a negative count, returns count random elements, potentially with duplicates.
- If a positive count is larger than the set size, just the entire set is returned.
If the key doesn't exist, returns a Null reply if count is not given, or an empty array if a count is provided.
Examples:
> VADD vset VALUES 3 1 0 0 elem1
(integer) 1
> VADD vset VALUES 3 0 1 0 elem2
(integer) 1
> VADD vset VALUES 3 0 0 1 elem3
(integer) 1
# Return a single random element
> VRANDMEMBER vset
"elem2"
# Return 2 distinct random elements
> VRANDMEMBER vset 2
1) "elem1"
2) "elem3"
# Return 3 random elements with possible duplicates
> VRANDMEMBER vset -3
1) "elem2"
2) "elem2"
3) "elem1"
# Return more elements than in the set (returns all elements)
> VRANDMEMBER vset 10
1) "elem1"
2) "elem2"
3) "elem3"
# When key doesn't exist
> VRANDMEMBER nonexistent
(nil)
> VRANDMEMBER nonexistent 3
(empty array)
This command is particularly useful for:
1. Selecting random samples from a vector set for testing or training.
2. Performance testing by retrieving random elements for subsequent similarity searches.
When the user asks for unique elements (positive count) the implementation optimizes for two scenarios:
- For small sample sizes (less than 20% of the set size), it uses a dictionary to avoid duplicates, and performs a real random walk inside the graph.
- For large sample sizes (more than 20% of the set size), it starts from a random node and sequentially traverses the internal list, providing faster performance but less random elements.
The command has `O(N)` worst-case time complexity when requesting many unique elements (it uses linear scanning), or `O(M*log(N))` complexity when the user asks for `M` random elements in a vector set of `N` elements, with `M` much smaller than `N`.
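For example, a hedged sketch combining `VRANDMEMBER` with the `TRUTH` option of `VSIM` to estimate recall (assumes redis-py and an example key name):
```
import redis

r = redis.Redis(decode_responses=True)
hits = total = 0
for elem in r.execute_command("VRANDMEMBER", "word_embeddings", 20):
    approx = r.execute_command("VSIM", "word_embeddings", "ELE", elem,
                               "COUNT", 10)
    exact = r.execute_command("VSIM", "word_embeddings", "ELE", elem,
                              "COUNT", 10, "TRUTH")
    hits += len(set(approx) & set(exact))
    total += len(exact)
print(f"recall: {hits / total:.2%}")
```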
# Filtered search
Each element of the vector set can be associated with a set of attributes specified as a JSON blob:
> VADD vset VALUES 3 1 1 1 a SETATTR '{"year": 1950}'
(integer) 1
> VADD vset VALUES 3 -1 -1 -1 b SETATTR '{"year": 1951}'
(integer) 1
Specifying an attribute with the `SETATTR` option of `VADD` is exactly equivalent to adding an element and then setting (or updating, if already set) its JSON attribute string. Symmetrically, the `VGETATTR` command returns the attribute associated with a given element.
> VADD vset VALUES 3 0 1 0 c
(integer) 1
> VSETATTR vset c '{"year": 1952}'
(integer) 1
> VGETATTR vset c
"{\"year\": 1952}"
At this point, I can use the FILTER option of VSIM to ask only for the subset of elements that satisfy my expression:
> VSIM vset VALUES 3 0 0 0 FILTER '.year > 1950'
1) "c"
2) "b"
The items will be returned again in order of similarity (most similar first), but only the items with the year field matching the expression are returned.
The expressions are similar to what you would write inside the `if` statement of JavaScript or other familiar programming languages: you can use `and`, `or`, the obvious math operators like `+`, `-`, `/`, `>=`, `<`, ... and so forth (see the expressions section for more info). The selectors of the JSON object attributes start with a dot followed by the name of the key inside the JSON objects.
Elements with invalid JSON, or lacking the specified field, **are considered as not matching** the expression, but will not generate any error at runtime.
## FILTER expressions capabilities
FILTER expressions allow you to perform complex filtering on vector similarity results using a JavaScript-like syntax. The expression is evaluated against each element's JSON attributes, with only elements that satisfy the expression being included in the results.
### Expression Syntax
Expressions support the following operators and capabilities:
1. **Arithmetic operators**: `+`, `-`, `*`, `/`, `%` (modulo), `**` (exponentiation)
2. **Comparison operators**: `>`, `>=`, `<`, `<=`, `==`, `!=`
3. **Logical operators**: `and`/`&&`, `or`/`||`, `!`/`not`
4. **Containment operator**: `in`
5. **Parentheses** for grouping: `(...)`
### Selector Notation
Attributes are accessed using dot notation:
- `.year` references the "year" attribute
- `.movie.year` would **NOT** reference the "year" field inside a "movie" object: only keys at the first level of the JSON object are accessible.
### JSON and expressions data types
Expressions can work with:
- Numbers (double precision floats)
- Strings (enclosed in single or double quotes)
- Booleans (no native type: they are represented as 1 for true, 0 for false)
- Arrays (for use with the `in` operator: `value in [1, 2, 3]`)
JSON attributes are converted in this way:
- Numbers will be converted to numbers.
- Strings to strings.
- Booleans to 0 or 1 number.
- Arrays to tuples (for "in" operator), but only if composed of just numbers and strings.
Any other type is ignored, and accessing it will make the expression evaluate to false.
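For instance, a boolean attribute can be tested as 0/1 and an array attribute can be used with the `in` operator (a hypothetical key and attributes, assuming the conversion rules above):
```
> VADD vset VALUES 3 1 1 1 doc SETATTR '{"public": true, "tags": ["a", "b"]}'
(integer) 1
> VSIM vset VALUES 3 1 1 1 FILTER '.public == 1 and "a" in .tags'
1) "doc"
```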
### Examples
```
# Find items from the 1980s
VSIM movies VALUES 3 0.5 0.8 0.2 FILTER '.year >= 1980 and .year < 1990'
# Find action movies with high ratings
VSIM movies VALUES 3 0.5 0.8 0.2 FILTER '.genre == "action" and .rating > 8.0'
# Find movies directed by either Spielberg or Nolan
VSIM movies VALUES 3 0.5 0.8 0.2 FILTER '.director in ["Spielberg", "Nolan"]'
# Complex condition with numerical operations
VSIM movies VALUES 3 0.5 0.8 0.2 FILTER '(.year - 2000) ** 2 < 100 and .rating / 2 > 4'
```
### Error Handling
Elements with any of the following conditions are considered not matching:
- Missing the queried JSON attribute
- Having invalid JSON in their attributes
- Having a JSON value that cannot be converted to the expected type
This behavior allows you to safely filter on optional attributes without generating errors.
### FILTER effort
The `FILTER-EF` option controls the maximum effort spent when filtering vector search results.
When performing vector similarity search with filtering, Vector Sets perform the standard similarity search while applying the filter expression to each node encountered. Since many results might be filtered out, Vector Sets may need to examine many more candidates than the requested `COUNT` to ensure sufficient matching results are returned. If the elements matching the filter are very rare, or if there are fewer matching elements than the specified count, this could even trigger a full scan of the HNSW graph.
For this reason, by default, the maximum effort is limited to a reasonable amount of nodes explored.
### Modifying the FILTER effort
1. By default, Vector Sets will explore up to `COUNT * 100` candidates to find matching results.
2. You can control this exploration with the `FILTER-EF` parameter.
3. A higher `FILTER-EF` value increases the chances of finding all relevant matches at the cost of increased processing time.
4. A `FILTER-EF` of zero will explore as many nodes as needed in order to actually return the number of elements specified by `COUNT`.
5. Even when a high `FILTER-EF` value is specified, **the implementation will do a lot less work** if the elements passing the filter are very common, because of the early stop conditions of the HNSW implementation (once the specified amount of elements is reached and the quality check of the remaining candidates triggers an early stop).
```
VSIM key [ELE|FP32|VALUES] <vector or element> COUNT 10 FILTER '.year > 2000' FILTER-EF 500
```
In this example, Vector Sets will examine up to 500 potential nodes. Of course, if the count is reached before exploring 500 nodes, and the quality checks show that it is not possible to make progress on similarity, the search ends sooner.
### Performance Considerations
- If you have highly selective filters (few items match), use a higher `FILTER-EF`, or just design your application to handle a result set that is smaller than the requested count. Note that, in any case, the additional elements may be too distant from the query vector.
- For less selective filters, the default should be sufficient.
- Very selective filters with low `FILTER-EF` values may return fewer items than requested.
- Extremely high values may impact performance without significantly improving results.
The optimal `FILTER-EF` value depends on:
1. The selectivity of your filter.
2. The distribution of your data.
3. The required recall quality.
A good practice is to start with the default and increase if needed when you observe fewer results than expected.
### Testing a large-ish data set
To really see how things work at scale, you can [download](https://antirez.com/word2vec_with_attribs.rdb) the following dataset:
wget https://antirez.com/word2vec_with_attribs.rdb
It contains the 3 million words of Word2Vec, each having as attribute a JSON object with just the length of the word. Because of the length distribution of words in large amounts of text, where longer words become less and less common, this is ideal to check how filtering behaves as the filter matches fewer and fewer elements in the vector set.
For instance:
> VSIM word_embeddings_bin ele "pasta" FILTER ".len == 6"
1) "pastas"
2) "rotini"
3) "gnocci"
4) "panino"
5) "salads"
6) "breads"
7) "salame"
8) "sauces"
9) "cheese"
10) "fritti"
This will easily retrieve the desired amount of items (`COUNT` is 10 by default) since there are many items of length 6. However:
> VSIM word_embeddings_bin ele "pasta" FILTER ".len == 33"
1) "skinless_boneless_chicken_breasts"
2) "boneless_skinless_chicken_breasts"
3) "Boneless_skinless_chicken_breasts"
This time, even if we asked for 10 items, we only get 3, since the default filter effort will be `10*100 = 1000`. We can tune this by specifying the effort explicitly, at the risk of a slower query, of course:
> VSIM word_embeddings_bin ele "pasta" FILTER ".len == 33" FILTER-EF 10000
1) "skinless_boneless_chicken_breasts"
2) "boneless_skinless_chicken_breasts"
3) "Boneless_skinless_chicken_breasts"
4) "mozzarella_feta_provolone_cheddar"
5) "Greatfood.com_R_www.greatfood.com"
6) "Pepperidge_Farm_Goldfish_crackers"
7) "Prosecuted_Mobsters_Rebuilt_Dying"
8) "Crispy_Snacker_Sandwiches_Popcorn"
9) "risultati_delle_partite_disputate"
10) "Peppermint_Mocha_Twist_Gingersnap"
This time we get all ten items, even if the last one is quite far from our query vector. We encourage you to experiment with this test dataset to better understand the dynamics of the implementation and the natural tradeoffs of filtered search.
**Keep in mind** that, by default, Redis Vector Sets will try to avoid a likely useless huge scan of the HNSW graph, and would rather return few or no elements at all, since this is almost always what the user actually wants in the context of retrieving *similar* items to the query.
# Single Instance Scalability and Latency
Vector Sets implement a threading model that allows Redis to handle many concurrent requests: by default `VSIM` is always threaded, and `VADD` is not (but can be partially threaded using the `CAS` option). This section explains how the threading and locking mechanisms work, and what to expect in terms of performance.
## Threading Model
- The `VSIM` command runs in a separate thread by default, allowing Redis to continue serving other commands.
- A maximum of 32 threads can run concurrently (defined by `HNSW_MAX_THREADS`).
- When this limit is reached, additional `VSIM` requests are queued: Redis remains responsive and no latency event is generated.
- The `VADD` command with the `CAS` option also leverages threading for the computation-heavy candidate search phase, but the insertion itself is performed in the main thread. `VADD` always runs in sub-millisecond time, so this is not a source of latency, but sustaining many hundreds of writes per second can still be challenging for a single instance. Please look at the section below about scaling to multiple instances.
- Commands run within Lua scripts, MULTI/EXEC blocks, or from replication are executed in the main thread to ensure consistency.
```
> VSIM vset VALUES 3 1 1 1 FILTER '.year > 2000' # This runs in a thread.
> VADD vset VALUES 3 1 1 1 element CAS # Candidate search runs in a thread.
```
## Locking Mechanism
Vector Sets use a read/write locking mechanism to coordinate access:
- Reads (`VSIM`, `VEMB`, etc.) acquire a read lock, allowing multiple concurrent reads.
- Writes (`VADD`, `VREM`, etc.) acquire a write lock, temporarily blocking all reads.
- When a write lock is requested while reads are in progress, the write operation waits for all reads to complete.
- Once a write lock is granted, all reads are blocked until the write completes.
- Each thread has a dedicated slot for tracking visited nodes during graph traversal, avoiding contention. This improves performance but limits the maximum number of concurrent threads, since each node has a memory cost proportional to the number of slots.
## DEL latency
Deleting a very large vector set (millions of elements) can cause latency spikes, as deletion rebuilds connections between nodes. This may change in the future.
The deletion latency is most noticeable when using `DEL` on a key containing a large vector set or when the key expires.
## Performance Characteristics
- Search operations (`VSIM`) scale almost linearly with the number of CPU cores available, up to the thread limit. You can expect a Vector Set composed of millions of items, with vectors of dimension 300 and the default int8 quantization, to deliver around 50k VSIM operations per second on a single host.
- Insertion operations (`VADD`) are more computationally expensive than searches, and can't be threaded: expect much lower throughput, in the range of a few thousand inserts per second.
- Binary quantization offers significantly faster search performance at the cost of some recall quality, while int8 quantization, the default, has a very small impact on recall quality, while significantly improving performance and space efficiency.
- The `EF` parameter has a major impact on both search quality and performance - higher values mean better recall but slower searches.
- Graph traversal time scales logarithmically with the number of elements, making Vector Sets efficient even with millions of vectors.
## Loading / Saving performances
Vector Sets are able to serialize to disk the graph structure as it is in memory, so loading the data back does not require rebuilding the HNSW graph. This means Redis can load millions of items per minute. For instance, 3 million items with 300-component vectors can be loaded back into memory in around 15 seconds.
# Scaling vector sets to multiple instances
The fundamental way vector sets can be scaled to very large data sets
and to many Redis instances is that a given very large set of vectors
can be partitioned into N different Redis keys, that can also live into
different Redis instances.
For instance, I could add my elements into `key0`, `key1`, `key2`, by hashing
the item in some way, like doing `crc32(item)%3`, effectively splitting
the dataset into three different parts. However, once I want the vectors
of my dataset nearest to a given query vector, I can simply perform the
`VSIM` command against all three keys, merging the results by
score (so the commands must be called using the `WITHSCORES` option) on
the client side: once the union of the results is ordered by the
similarity score, the query is equivalent to having a single key
containing all the items.
There are a few interesting facts to note about this pattern:
1. It is possible to have a logical sorted set that is as big as the sum of all the Redis instances we are using.
2. Deletion operations remain simple: we can hash the item and select the key where it belongs.
3. However, even if I use 10 different Redis instances, I'm not going to reach 10x the **read** operations per second, compared to using a single server: for each logical query, I need to query all the instances. Yet, smaller graphs are faster to navigate, so there is some win even from the point of view of CPU usage.
4. Insertions, so **write** queries, will be scaled linearly: I can add N items against N instances at the same time, splitting the insertion load evenly. This is very important since vector sets, being based on HNSW data structures, are slower to add items than to query similar items, by a very big factor.
5. While it cannot always guarantee the best results, with proper timeout management this system may be considered *highly available*: if a subset of the N instances is reachable, I'll still be able to return items similar to my query vector.
Notably, this pattern can be implemented in a way that avoids paying the sum of the round trip times with all the servers: it is possible to send the queries to all the instances at the same time, so that the latency will equal the slowest reply out of the N server queries.
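A minimal sketch of this fan-out pattern (assuming redis-py; for simplicity all keys live on one connection here, while a real deployment would query different instances, ideally in parallel):
```
import zlib
import redis

r = redis.Redis(decode_responses=True)
KEYS = ["key0", "key1", "key2"]

def shard(item):
    return KEYS[zlib.crc32(item.encode()) % len(KEYS)]

def sharded_vadd(vec, item):
    r.execute_command("VADD", shard(item), "VALUES", len(vec), *vec, item)

def sharded_vsim(vec, count=10):
    merged = []
    for key in KEYS:  # in production: issue these concurrently
        reply = r.execute_command("VSIM", key, "VALUES", len(vec), *vec,
                                  "WITHSCORES", "COUNT", count)
        merged += [(reply[i], float(reply[i + 1]))
                   for i in range(0, len(reply), 2)]
    merged.sort(key=lambda pair: pair[1], reverse=True)  # merge by score
    return merged[:count]
```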
# Optimizing memory usage
Vector Sets, or better, HNSWs, the underlying data structure used by Vector Sets, combined with the features provided by Vector Sets themselves (quantization, random projection, filtering, ...) form an implementation with a non-trivial space of tunable parameters. Despite the complexity of the implementation and of vector similarity problems, here is a list of simple ideas that can guide the user in picking the best settings:
* 8-bit quantization (the default) is almost always a win. It reduces the memory usage of vectors by a factor of 4, yet the performance penalty in terms of recall is minimal. It also reduces insertion and search time by around 2 times or more.
* Binary quantization is much more extreme: it makes vector sets a lot faster, but noticeably increases the recall error, for instance from 95% to 80% if all the other parameters remain the same. Yet, the speedup is really big, and the memory usage of vectors, compared to full precision vectors, is 32 times smaller.
* The vectors are not the only cause of the high per-entry memory usage of Vector Sets: nodes contain, on average, `M*2 + M*0.33` pointers, where `M` is 16 by default (but can be tuned in `VADD`, see the `M` option); see the quick estimate after this list. Also, each node stores the string item and the optional JSON attributes: those should be as small as possible in order to avoid contributing further to the memory usage.
* The `M` parameter should be increased to 32 or more only when near-perfect recall is really needed.
* It is possible to save space (less memory usage) by sacrificing time (more CPU time): use a low `M` (the default of 16, for instance) and a high `EF` (the effort parameter of `VSIM`) in order to scan the graph more deeply.
* When memory usage is a serious concern, and we suspect that the vectors we are storing don't contain enough information (at least for our use case) to justify the number of components they feature, random projection (the `REDUCE` option of `VADD`) can be tested to see whether dimensionality reduction is possible with acceptable precision loss.
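A quick back-of-the-envelope check of the per-node link overhead mentioned in this list (a sketch; the real footprint also includes the node header, the string item, and any JSON attributes):
```
def node_link_bytes(M=16, pointer_size=8):
    # Layer 0 has M*2 links; on average ~0.33 extra layers add M links each.
    return (M * 2 + M * 0.33) * pointer_size

print(node_link_bytes())    # ~298 bytes of link pointers with the default M
print(node_link_bytes(64))  # ~1193 bytes, in line with the M=64 example above
```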
## Random projection tradeoffs
Sometimes learned vectors are not as information-dense as we might guess: there
may be components with similar meanings in the space, and components
whose values don't really represent features that matter in our use case.
At the same time, certain vectors are very big, 1024 components or more. In these cases, it is possible to use the random projection feature of Redis Vector Sets in order to reduce both space (less RAM used) and time (more operations per second). The feature is accessible via the `REDUCE` option of the `VADD` command. However, keep in mind that you need to test how much the reduction impacts the performance of your vectors in terms of recall and quality of the results you get back.
## What is a random projection?
The concept of Random Projection is relatively simple to grasp. For instance, a projection that turns a 100-component vector into a 10-component vector computes each of the 10 target components as a different random linear combination of the 100 original components. Please note that *each of the target components* gets some random amount of all the 100 original components. It is mathematically proven that this process results in a vector space where elements retain roughly their relative distances, even though some information is lost.
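A toy illustration of the idea (assuming numpy; the sizes are arbitrary): distances between projected vectors roughly track the distances between the originals.
```
import numpy as np

rng = np.random.default_rng(42)
P = rng.normal(size=(10, 100)) / np.sqrt(10)  # random 100 -> 10 projection

a, b = rng.normal(size=100), rng.normal(size=100)
print(np.linalg.norm(a - b))          # distance in the original space
print(np.linalg.norm(P @ a - P @ b))  # roughly preserved after projection
```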
## Examples of projections and loss of precision
To show a somewhat extreme case, let's take the 3 million Word2Vec items and compress them from 300 components to 100, 50 and 25 components. Then, we check the recall against the ground truth for each of the vector sets produced in this way (using different `REDUCE` parameters of `VADD`). These are the results, obtained by asking for the top 10 elements.
```
----------------------------------------------------------------------
Key Average Recall % Std Dev
----------------------------------------------------------------------
word_embeddings_int8 95.98 12.14
^ This is the same key used for ground truth, but without TRUTH option
word_embeddings_reduced_100 40.20 20.13
word_embeddings_reduced_50 24.42 16.89
word_embeddings_reduced_25 14.31 9.99
```
Here the dimensionality reduction we are using is quite extreme: going from 300 to 100 components drops two thirds of the dimensions. The recall drops from 96% to 40%, down to 24% and 14% for even more extreme dimensionality reductions.
Reducing the dimension of vectors that are already relatively small, like the 300-component vectors of the example above, provides only relatively small memory savings, especially because by default Vector Sets use `int8` quantization, which uses only one byte per component:
```
> MEMORY USAGE word_embeddings_int8
(integer) 3107002888
> MEMORY USAGE word_embeddings_reduced_100
(integer) 2507122888
```
Of course going, for example, from 2048-component vectors to 1024 would provide a much more significant memory saving, even with the `int8` quantization used by Vector Sets, assuming the recall loss is acceptable. Besides the memory saving, there is also the reduction in CPU time, translating to more operations per second.
Another thing to note is that, with certain embedding models, binary quantization (which offers an 8x reduction of memory usage compared to 8-bit quantization, and a very big speedup in computation) performs much better than reducing the dimension of the vectors by the same amount via random projection:
```
word_embeddings_bin 35.48 19.78
```
Here, in the same test as above, we get 35% recall, which is not too far from the 40% obtained with a random projection from 300 to 100 components. However, while random projection reduces the size by 3 times, binary quantization reduces it by 8 times.
```
> memory usage word_embeddings_bin
(integer) 2327002888
```
In this specific case the key uses JSON attributes and has a graph connection overhead that is much bigger than the 300 bits each vector takes, but, as already said, for big vectors (1024 components, for instance) or for lower values of `M` (see the `M` parameter of `VADD`, which controls the level of connectivity and thus the number of pointers used per node) the memory saving is much greater.
# Vector Sets troubleshooting and understandability
## Debugging poor recall or unexpected results
Vector graphs and similarity queries pose many challenges mainly due to the following three problems:
1. The error due to the approximate nature of Vector Sets is hard to evaluate.
2. The error added by quantization often depends on the exact vector space (the embedding model we are using **and** how far apart the elements we represent in such embeddings are).
3. We live in the illusion that learned embeddings capture the best similarity possible among elements, which is obviously not always true, and highly application dependent.
The only way to debug such problems is to inspect, step by step, what is happening inside our application and the structure of the HNSW graph itself. To do so, we suggest considering the following tools:
1. The `TRUTH` option of the `VSIM` command is able to return the ground truth of the most similar elements, without using the HNSW graph, but doing a linear scan.
2. The `VLINKS` command allows exploring the graph to see if the connections among nodes make sense, and to investigate why a given node may be more isolated than expected. This command can also be used differently, when we want very fast "similar items" without paying the HNSW traversal time. It exploits the fact that we have a direct reference from each element in our vector set to each node in our HNSW graph.
3. The `WITHSCORES` option, in the supported commands, returns a value that is directly related to the *cosine similarity* between the query and item vectors: the similarity interval is simply rescaled from the original -1, 1 range to 0, 1; otherwise, the metric is identical.
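As a tiny sketch of the rescaling described in point 3 above:
```
def score_to_cosine(score):
    return score * 2.0 - 1.0   # VSIM score in [0, 1] -> cosine in [-1, 1]

def cosine_to_score(cosine):
    return (cosine + 1.0) / 2.0
```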
## Clients, latency and bandwidth usage
During Vector Sets testing, we discovered that clients often introduce considerable latency and CPU usage (on the client side, not in Redis) for two main reasons:
1. Often the serialization to `VALUES ... list of floats ...` can be very slow.
2. The vector payload of floats represented as strings is very large, resulting in high bandwidth usage and latency, compared to other Redis commands.
Switching from `VALUES` to `FP32` as a method for transmitting vectors may easily provide 10-20x speedups.
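A hedged sketch of the `FP32` path (assuming redis-py on a little-endian host, since the blob is read as native 4-byte floats):
```
import struct
import redis

r = redis.Redis()
vec = [0.1, 1.2, 0.5]
blob = struct.pack(f"{len(vec)}f", *vec)  # packed 4-byte native floats
r.execute_command("VADD", "mykey", "FP32", blob, "my-element")
r.execute_command("VSIM", "mykey", "FP32", blob, "COUNT", 10)
```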
# Known bugs
* Replication code is pretty much untested, and very vanilla (replicating the commands verbatim).
# Implementation details
Vector sets are based on the `hnsw.c` implementation of the HNSW data structure with extensions for speed and functionality.
The main features are:
* Proper node deletion with relinking.
* 8 bits and binary quantization.
* Threaded queries.
* Filtered search with predicate callback.

3164
modules/vector-sets/cJSON.c Normal file

File diff suppressed because it is too large.

306
modules/vector-sets/cJSON.h Normal file
@@ -0,0 +1,306 @@
/*
Copyright (c) 2009-2017 Dave Gamble and cJSON contributors
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef cJSON__h
#define cJSON__h
#ifdef __cplusplus
extern "C"
{
#endif
#if !defined(__WINDOWS__) && (defined(WIN32) || defined(WIN64) || defined(_MSC_VER) || defined(_WIN32))
#define __WINDOWS__
#endif
#ifdef __WINDOWS__
/* When compiling for windows, we specify a specific calling convention to avoid issues where we are being called from a project with a different default calling convention. For windows you have 3 define options:
CJSON_HIDE_SYMBOLS - Define this in the case where you don't want to ever dllexport symbols
CJSON_EXPORT_SYMBOLS - Define this on library build when you want to dllexport symbols (default)
CJSON_IMPORT_SYMBOLS - Define this if you want to dllimport symbol
For *nix builds that support visibility attribute, you can define similar behavior by
setting default visibility to hidden by adding
-fvisibility=hidden (for gcc)
or
-xldscope=hidden (for sun cc)
to CFLAGS
then using the CJSON_API_VISIBILITY flag to "export" the same symbols the way CJSON_EXPORT_SYMBOLS does
*/
#define CJSON_CDECL __cdecl
#define CJSON_STDCALL __stdcall
/* export symbols by default, this is necessary for copy pasting the C and header file */
#if !defined(CJSON_HIDE_SYMBOLS) && !defined(CJSON_IMPORT_SYMBOLS) && !defined(CJSON_EXPORT_SYMBOLS)
#define CJSON_EXPORT_SYMBOLS
#endif
#if defined(CJSON_HIDE_SYMBOLS)
#define CJSON_PUBLIC(type) type CJSON_STDCALL
#elif defined(CJSON_EXPORT_SYMBOLS)
#define CJSON_PUBLIC(type) __declspec(dllexport) type CJSON_STDCALL
#elif defined(CJSON_IMPORT_SYMBOLS)
#define CJSON_PUBLIC(type) __declspec(dllimport) type CJSON_STDCALL
#endif
#else /* !__WINDOWS__ */
#define CJSON_CDECL
#define CJSON_STDCALL
#if (defined(__GNUC__) || defined(__SUNPRO_CC) || defined (__SUNPRO_C)) && defined(CJSON_API_VISIBILITY)
#define CJSON_PUBLIC(type) __attribute__((visibility("default"))) type
#else
#define CJSON_PUBLIC(type) type
#endif
#endif
/* project version */
#define CJSON_VERSION_MAJOR 1
#define CJSON_VERSION_MINOR 7
#define CJSON_VERSION_PATCH 18
#include <stddef.h>
/* cJSON Types: */
#define cJSON_Invalid (0)
#define cJSON_False (1 << 0)
#define cJSON_True (1 << 1)
#define cJSON_NULL (1 << 2)
#define cJSON_Number (1 << 3)
#define cJSON_String (1 << 4)
#define cJSON_Array (1 << 5)
#define cJSON_Object (1 << 6)
#define cJSON_Raw (1 << 7) /* raw json */
#define cJSON_IsReference 256
#define cJSON_StringIsConst 512
/* The cJSON structure: */
typedef struct cJSON
{
/* next/prev allow you to walk array/object chains. Alternatively, use GetArraySize/GetArrayItem/GetObjectItem */
struct cJSON *next;
struct cJSON *prev;
/* An array or object item will have a child pointer pointing to a chain of the items in the array/object. */
struct cJSON *child;
/* The type of the item, as above. */
int type;
/* The item's string, if type==cJSON_String and type == cJSON_Raw */
char *valuestring;
/* writing to valueint is DEPRECATED, use cJSON_SetNumberValue instead */
int valueint;
/* The item's number, if type==cJSON_Number */
double valuedouble;
/* The item's name string, if this item is the child of, or is in the list of subitems of an object. */
char *string;
} cJSON;
typedef struct cJSON_Hooks
{
/* malloc/free are CDECL on Windows regardless of the default calling convention of the compiler, so ensure the hooks allow passing those functions directly. */
void *(CJSON_CDECL *malloc_fn)(size_t sz);
void (CJSON_CDECL *free_fn)(void *ptr);
} cJSON_Hooks;
typedef int cJSON_bool;
/* Limits how deeply nested arrays/objects can be before cJSON rejects to parse them.
* This is to prevent stack overflows. */
#ifndef CJSON_NESTING_LIMIT
#define CJSON_NESTING_LIMIT 1000
#endif
/* Limits the length of circular references can be before cJSON rejects to parse them.
* This is to prevent stack overflows. */
#ifndef CJSON_CIRCULAR_LIMIT
#define CJSON_CIRCULAR_LIMIT 10000
#endif
/* returns the version of cJSON as a string */
CJSON_PUBLIC(const char*) cJSON_Version(void);
/* Supply malloc, realloc and free functions to cJSON */
CJSON_PUBLIC(void) cJSON_InitHooks(cJSON_Hooks* hooks);
/* Memory Management: the caller is always responsible to free the results from all variants of cJSON_Parse (with cJSON_Delete) and cJSON_Print (with stdlib free, cJSON_Hooks.free_fn, or cJSON_free as appropriate). The exception is cJSON_PrintPreallocated, where the caller has full responsibility of the buffer. */
/* Supply a block of JSON, and this returns a cJSON object you can interrogate. */
CJSON_PUBLIC(cJSON *) cJSON_Parse(const char *value);
CJSON_PUBLIC(cJSON *) cJSON_ParseWithLength(const char *value, size_t buffer_length);
/* ParseWithOpts allows you to require (and check) that the JSON is null terminated, and to retrieve the pointer to the final byte parsed. */
/* If you supply a ptr in return_parse_end and parsing fails, then return_parse_end will contain a pointer to the error so will match cJSON_GetErrorPtr(). */
CJSON_PUBLIC(cJSON *) cJSON_ParseWithOpts(const char *value, const char **return_parse_end, cJSON_bool require_null_terminated);
CJSON_PUBLIC(cJSON *) cJSON_ParseWithLengthOpts(const char *value, size_t buffer_length, const char **return_parse_end, cJSON_bool require_null_terminated);
/* Render a cJSON entity to text for transfer/storage. */
CJSON_PUBLIC(char *) cJSON_Print(const cJSON *item);
/* Render a cJSON entity to text for transfer/storage without any formatting. */
CJSON_PUBLIC(char *) cJSON_PrintUnformatted(const cJSON *item);
/* Render a cJSON entity to text using a buffered strategy. prebuffer is a guess at the final size. guessing well reduces reallocation. fmt=0 gives unformatted, =1 gives formatted */
CJSON_PUBLIC(char *) cJSON_PrintBuffered(const cJSON *item, int prebuffer, cJSON_bool fmt);
/* Render a cJSON entity to text using a buffer already allocated in memory with given length. Returns 1 on success and 0 on failure. */
/* NOTE: cJSON is not always 100% accurate in estimating how much memory it will use, so to be safe allocate 5 bytes more than you actually need */
CJSON_PUBLIC(cJSON_bool) cJSON_PrintPreallocated(cJSON *item, char *buffer, const int length, const cJSON_bool format);
/* Delete a cJSON entity and all subentities. */
CJSON_PUBLIC(void) cJSON_Delete(cJSON *item);
/* Returns the number of items in an array (or object). */
CJSON_PUBLIC(int) cJSON_GetArraySize(const cJSON *array);
/* Retrieve item number "index" from array "array". Returns NULL if unsuccessful. */
CJSON_PUBLIC(cJSON *) cJSON_GetArrayItem(const cJSON *array, int index);
/* Get item "string" from object. Case insensitive. */
CJSON_PUBLIC(cJSON *) cJSON_GetObjectItem(const cJSON * const object, const char * const string);
CJSON_PUBLIC(cJSON *) cJSON_GetObjectItemCaseSensitive(const cJSON * const object, const char * const string);
CJSON_PUBLIC(cJSON_bool) cJSON_HasObjectItem(const cJSON *object, const char *string);
/* For analysing failed parses. This returns a pointer to the parse error. You'll probably need to look a few chars back to make sense of it. Defined when cJSON_Parse() returns 0. 0 when cJSON_Parse() succeeds. */
CJSON_PUBLIC(const char *) cJSON_GetErrorPtr(void);
/* Check item type and return its value */
CJSON_PUBLIC(char *) cJSON_GetStringValue(const cJSON * const item);
CJSON_PUBLIC(double) cJSON_GetNumberValue(const cJSON * const item);
/* These functions check the type of an item */
CJSON_PUBLIC(cJSON_bool) cJSON_IsInvalid(const cJSON * const item);
CJSON_PUBLIC(cJSON_bool) cJSON_IsFalse(const cJSON * const item);
CJSON_PUBLIC(cJSON_bool) cJSON_IsTrue(const cJSON * const item);
CJSON_PUBLIC(cJSON_bool) cJSON_IsBool(const cJSON * const item);
CJSON_PUBLIC(cJSON_bool) cJSON_IsNull(const cJSON * const item);
CJSON_PUBLIC(cJSON_bool) cJSON_IsNumber(const cJSON * const item);
CJSON_PUBLIC(cJSON_bool) cJSON_IsString(const cJSON * const item);
CJSON_PUBLIC(cJSON_bool) cJSON_IsArray(const cJSON * const item);
CJSON_PUBLIC(cJSON_bool) cJSON_IsObject(const cJSON * const item);
CJSON_PUBLIC(cJSON_bool) cJSON_IsRaw(const cJSON * const item);
/* These calls create a cJSON item of the appropriate type. */
CJSON_PUBLIC(cJSON *) cJSON_CreateNull(void);
CJSON_PUBLIC(cJSON *) cJSON_CreateTrue(void);
CJSON_PUBLIC(cJSON *) cJSON_CreateFalse(void);
CJSON_PUBLIC(cJSON *) cJSON_CreateBool(cJSON_bool boolean);
CJSON_PUBLIC(cJSON *) cJSON_CreateNumber(double num);
CJSON_PUBLIC(cJSON *) cJSON_CreateString(const char *string);
/* raw json */
CJSON_PUBLIC(cJSON *) cJSON_CreateRaw(const char *raw);
CJSON_PUBLIC(cJSON *) cJSON_CreateArray(void);
CJSON_PUBLIC(cJSON *) cJSON_CreateObject(void);
/* Create a string where valuestring references a string so
* it will not be freed by cJSON_Delete */
CJSON_PUBLIC(cJSON *) cJSON_CreateStringReference(const char *string);
/* Create an object/array that only references it's elements so
* they will not be freed by cJSON_Delete */
CJSON_PUBLIC(cJSON *) cJSON_CreateObjectReference(const cJSON *child);
CJSON_PUBLIC(cJSON *) cJSON_CreateArrayReference(const cJSON *child);
/* These utilities create an Array of count items.
* The parameter count cannot be greater than the number of elements in the number array, otherwise array access will be out of bounds.*/
CJSON_PUBLIC(cJSON *) cJSON_CreateIntArray(const int *numbers, int count);
CJSON_PUBLIC(cJSON *) cJSON_CreateFloatArray(const float *numbers, int count);
CJSON_PUBLIC(cJSON *) cJSON_CreateDoubleArray(const double *numbers, int count);
CJSON_PUBLIC(cJSON *) cJSON_CreateStringArray(const char *const *strings, int count);
/* Append item to the specified array/object. */
CJSON_PUBLIC(cJSON_bool) cJSON_AddItemToArray(cJSON *array, cJSON *item);
CJSON_PUBLIC(cJSON_bool) cJSON_AddItemToObject(cJSON *object, const char *string, cJSON *item);
/* Use this when string is definitely const (i.e. a literal, or as good as), and will definitely survive the cJSON object.
* WARNING: When this function was used, make sure to always check that (item->type & cJSON_StringIsConst) is zero before
* writing to `item->string` */
CJSON_PUBLIC(cJSON_bool) cJSON_AddItemToObjectCS(cJSON *object, const char *string, cJSON *item);
/* Append reference to item to the specified array/object. Use this when you want to add an existing cJSON to a new cJSON, but don't want to corrupt your existing cJSON. */
CJSON_PUBLIC(cJSON_bool) cJSON_AddItemReferenceToArray(cJSON *array, cJSON *item);
CJSON_PUBLIC(cJSON_bool) cJSON_AddItemReferenceToObject(cJSON *object, const char *string, cJSON *item);
/* Remove/Detach items from Arrays/Objects. */
CJSON_PUBLIC(cJSON *) cJSON_DetachItemViaPointer(cJSON *parent, cJSON * const item);
CJSON_PUBLIC(cJSON *) cJSON_DetachItemFromArray(cJSON *array, int which);
CJSON_PUBLIC(void) cJSON_DeleteItemFromArray(cJSON *array, int which);
CJSON_PUBLIC(cJSON *) cJSON_DetachItemFromObject(cJSON *object, const char *string);
CJSON_PUBLIC(cJSON *) cJSON_DetachItemFromObjectCaseSensitive(cJSON *object, const char *string);
CJSON_PUBLIC(void) cJSON_DeleteItemFromObject(cJSON *object, const char *string);
CJSON_PUBLIC(void) cJSON_DeleteItemFromObjectCaseSensitive(cJSON *object, const char *string);
/* Update array items. */
CJSON_PUBLIC(cJSON_bool) cJSON_InsertItemInArray(cJSON *array, int which, cJSON *newitem); /* Shifts pre-existing items to the right. */
CJSON_PUBLIC(cJSON_bool) cJSON_ReplaceItemViaPointer(cJSON * const parent, cJSON * const item, cJSON * replacement);
CJSON_PUBLIC(cJSON_bool) cJSON_ReplaceItemInArray(cJSON *array, int which, cJSON *newitem);
CJSON_PUBLIC(cJSON_bool) cJSON_ReplaceItemInObject(cJSON *object,const char *string,cJSON *newitem);
CJSON_PUBLIC(cJSON_bool) cJSON_ReplaceItemInObjectCaseSensitive(cJSON *object,const char *string,cJSON *newitem);
/* Duplicate a cJSON item */
CJSON_PUBLIC(cJSON *) cJSON_Duplicate(const cJSON *item, cJSON_bool recurse);
/* Duplicate will create a new, identical cJSON item to the one you pass, in new memory that will
* need to be released. With recurse!=0, it will duplicate any children connected to the item.
* The item->next and ->prev pointers are always zero on return from Duplicate. */
/* Recursively compare two cJSON items for equality. If either a or b is NULL or invalid, they will be considered unequal.
* case_sensitive determines if object keys are treated case sensitive (1) or case insensitive (0) */
CJSON_PUBLIC(cJSON_bool) cJSON_Compare(const cJSON * const a, const cJSON * const b, const cJSON_bool case_sensitive);
/* Minify a strings, remove blank characters(such as ' ', '\t', '\r', '\n') from strings.
* The input pointer json cannot point to a read-only address area, such as a string constant,
* but should point to a readable and writable address area. */
CJSON_PUBLIC(void) cJSON_Minify(char *json);
/* Helper functions for creating and adding items to an object at the same time.
* They return the added item or NULL on failure. */
CJSON_PUBLIC(cJSON*) cJSON_AddNullToObject(cJSON * const object, const char * const name);
CJSON_PUBLIC(cJSON*) cJSON_AddTrueToObject(cJSON * const object, const char * const name);
CJSON_PUBLIC(cJSON*) cJSON_AddFalseToObject(cJSON * const object, const char * const name);
CJSON_PUBLIC(cJSON*) cJSON_AddBoolToObject(cJSON * const object, const char * const name, const cJSON_bool boolean);
CJSON_PUBLIC(cJSON*) cJSON_AddNumberToObject(cJSON * const object, const char * const name, const double number);
CJSON_PUBLIC(cJSON*) cJSON_AddStringToObject(cJSON * const object, const char * const name, const char * const string);
CJSON_PUBLIC(cJSON*) cJSON_AddRawToObject(cJSON * const object, const char * const name, const char * const raw);
CJSON_PUBLIC(cJSON*) cJSON_AddObjectToObject(cJSON * const object, const char * const name);
CJSON_PUBLIC(cJSON*) cJSON_AddArrayToObject(cJSON * const object, const char * const name);
/* When assigning an integer value, it needs to be propagated to valuedouble too. */
#define cJSON_SetIntValue(object, number) ((object) ? (object)->valueint = (object)->valuedouble = (number) : (number))
/* helper for the cJSON_SetNumberValue macro */
CJSON_PUBLIC(double) cJSON_SetNumberHelper(cJSON *object, double number);
#define cJSON_SetNumberValue(object, number) ((object != NULL) ? cJSON_SetNumberHelper(object, (double)number) : (number))
/* Change the valuestring of a cJSON_String object, only takes effect when type of object is cJSON_String */
CJSON_PUBLIC(char*) cJSON_SetValuestring(cJSON *object, const char *valuestring);
/* If the object is not a boolean type this does nothing and returns cJSON_Invalid else it returns the new type*/
#define cJSON_SetBoolValue(object, boolValue) ( \
(object != NULL && ((object)->type & (cJSON_False|cJSON_True))) ? \
(object)->type=((object)->type &(~(cJSON_False|cJSON_True)))|((boolValue)?cJSON_True:cJSON_False) : \
cJSON_Invalid\
)
/* Macro for iterating over an array or object */
#define cJSON_ArrayForEach(element, array) for(element = (array != NULL) ? (array)->child : NULL; element != NULL; element = element->next)
/* malloc/free objects using the malloc/free functions that have been set with cJSON_InitHooks */
CJSON_PUBLIC(void *) cJSON_malloc(size_t size);
CJSON_PUBLIC(void) cJSON_free(void *object);
#ifdef __cplusplus
}
#endif
#endif

@@ -0,0 +1 @@
venv

@@ -0,0 +1,44 @@
This tool is similar to redis-cli (but very basic) and allows
specifying arguments that are expanded into vectors by calling
Ollama to get the embedding.
Whatever is passed as !"foo bar" gets expanded into
VALUES ... embedding ...
You must have Ollama running with the mxbai-embed-large model
already installed for this to work.
Example:
redis> KEYS *
1) food_items
2) glove_embeddings_bin
3) many_movies_mxbai-embed-large_BIN
4) many_movies_mxbai-embed-large_NOQUANT
5) word_embeddings
6) word_embeddings_bin
7) glove_embeddings_fp32
redis> VSIM food_items !"drinks with fruit"
1) (Fruit)Juices,Lemonade,100ml,50 cal,210 kJ
2) (Fruit)Juices,Limeade,100ml,128 cal,538 kJ
3) CannedFruit,Canned Fruit Cocktail,100g,81 cal,340 kJ
4) (Fruit)Juices,Energy-Drink,100ml,87 cal,365 kJ
5) Fruits,Lime,100g,30 cal,126 kJ
6) (Fruit)Juices,Coconut Water,100ml,19 cal,80 kJ
7) Fruits,Lemon,100g,29 cal,122 kJ
8) (Fruit)Juices,Clamato,100ml,60 cal,252 kJ
9) Fruits,Fruit salad,100g,50 cal,210 kJ
10) (Fruit)Juices,Capri-Sun,100ml,41 cal,172 kJ
redis> vsim food_items !"barilla"
1) Pasta&Noodles,Spirelli,100g,367 cal,1541 kJ
2) Pasta&Noodles,Farfalle,100g,358 cal,1504 kJ
3) Pasta&Noodles,Capellini,100g,353 cal,1483 kJ
4) Pasta&Noodles,Spaetzle,100g,368 cal,1546 kJ
5) Pasta&Noodles,Cappelletti,100g,164 cal,689 kJ
6) Pasta&Noodles,Penne,100g,351 cal,1474 kJ
7) Pasta&Noodles,Shells,100g,353 cal,1483 kJ
8) Pasta&Noodles,Linguine,100g,357 cal,1499 kJ
9) Pasta&Noodles,Rotini,100g,353 cal,1483 kJ
10) Pasta&Noodles,Rigatoni,100g,353 cal,1483 kJ

@@ -0,0 +1,138 @@
#!/usr/bin/env python3
import redis
import requests
import re
import shlex
from prompt_toolkit import PromptSession
from prompt_toolkit.history import InMemoryHistory

def get_embedding(text):
    """Get embedding from local Ollama API"""
    url = "http://localhost:11434/api/embeddings"
    payload = {
        "model": "mxbai-embed-large",
        "prompt": text
    }
    try:
        response = requests.post(url, json=payload)
        response.raise_for_status()
        return response.json()['embedding']
    except requests.exceptions.RequestException as e:
        raise Exception(f"Failed to get embedding: {str(e)}")

def process_embedding_patterns(text):
    """Process !"text" and !!"text" patterns in the command"""
    def replace_with_embedding(match):
        text = match.group(1)
        embedding = get_embedding(text)
        return f"VALUES {len(embedding)} {' '.join(map(str, embedding))}"

    def replace_with_embedding_and_text(match):
        text = match.group(1)
        embedding = get_embedding(text)
        # Return both the embedding values and the original text as next argument
        return f'VALUES {len(embedding)} {" ".join(map(str, embedding))} "{text}"'

    # First handle !!"text" pattern (must be done before !"text")
    text = re.sub(r'!!"([^"]*)"', replace_with_embedding_and_text, text)
    # Then handle !"text" pattern
    text = re.sub(r'!"([^"]*)"', replace_with_embedding, text)
    return text

def parse_command(command):
    """Parse command respecting quoted strings"""
    try:
        # Use shlex to properly handle quoted strings
        return shlex.split(command)
    except ValueError as e:
        raise Exception(f"Invalid command syntax: {str(e)}")

def format_response(response):
    """Format the response to match Redis protocol style"""
    if response is None:
        return "(nil)"
    elif isinstance(response, bool):
        return "+OK" if response else "(error) Operation failed"
    elif isinstance(response, (list, set)):
        if not response:
            return "(empty list or set)"
        return "\n".join(f"{i+1}) {item}" for i, item in enumerate(response))
    elif isinstance(response, int):
        return f"(integer) {response}"
    else:
        return str(response)

def main():
    # Default connection to localhost:6379
    r = redis.Redis(host='localhost', port=6379, decode_responses=True)
    try:
        # Test connection
        r.ping()
        print("Connected to Redis. Type your commands (CTRL+D to exit):")
        print("Special syntax:")
        print(" !\"text\" - Replace with embedding")
        print(" !!\"text\" - Replace with embedding and append text as value")
        print(" \"text\" - Quote strings containing spaces")
    except redis.ConnectionError:
        print("Error: Could not connect to Redis server")
        return

    # Setup prompt session with history
    session = PromptSession(history=InMemoryHistory())

    # Main loop
    while True:
        try:
            # Read input with line editing support
            command = session.prompt("redis> ")

            # Skip empty commands
            if not command.strip():
                continue

            # Process any embedding patterns before parsing
            try:
                processed_command = process_embedding_patterns(command)
            except Exception as e:
                print(f"(error) Embedding processing failed: {str(e)}")
                continue

            # Parse the command respecting quoted strings
            try:
                parts = parse_command(processed_command)
            except Exception as e:
                print(f"(error) {str(e)}")
                continue

            if not parts:
                continue

            cmd = parts[0].lower()
            args = parts[1:]

            # Execute command
            try:
                method = getattr(r, cmd, None)
                if method is not None:
                    result = method(*args)
                else:
                    # Use execute_command for unknown commands
                    result = r.execute_command(cmd, *args)
                print(format_response(result))
            except AttributeError:
                print(f"(error) Unknown command '{cmd}'")
        except EOFError:
            print("\nGoodbye!")
            break
        except KeyboardInterrupt:
            continue  # Allow Ctrl+C to clear current line
        except redis.RedisError as e:
            print(f"(error) {str(e)}")
        except Exception as e:
            print(f"(error) {str(e)}")

if __name__ == "__main__":
    main()

View file

@ -0,0 +1,3 @@
wget http://ann-benchmarks.com/glove-100-angular.hdf5
python insert.py
python recall.py (optionally pass --k <count>; default is top-10)

View file

@ -0,0 +1,47 @@
import h5py
import redis
from tqdm import tqdm
# Initialize Redis connection
redis_client = redis.Redis(host='localhost', port=6379, decode_responses=True, encoding='utf-8')
def add_to_redis(index, embedding):
"""Add embedding to Redis using VADD command"""
args = ["VADD", "glove_embeddings", "VALUES", "100"] # 100 is vector dimension
args.extend(map(str, embedding))
args.append(f"{index}") # Using index as identifier since we don't have words
args.append("EF")
args.append("200")
# args.append("NOQUANT")
# args.append("BIN")
redis_client.execute_command(*args)
def main():
with h5py.File('glove-100-angular.hdf5', 'r') as f:
# Get the train dataset
train_vectors = f['train']
total_vectors = train_vectors.shape[0]
print(f"Starting to process {total_vectors} vectors...")
# Process in batches to avoid memory issues
batch_size = 1000
for i in tqdm(range(0, total_vectors, batch_size)):
batch_end = min(i + batch_size, total_vectors)
batch = train_vectors[i:batch_end]
for j, vector in enumerate(batch):
try:
current_index = i + j
add_to_redis(current_index, vector)
except Exception as e:
print(f"Error processing vector {current_index}: {str(e)}")
continue
if (i + batch_size) % 10000 == 0:
print(f"Processed {i + batch_size} vectors")
if __name__ == "__main__":
main()

View file

@ -0,0 +1,78 @@
import h5py
import redis
import numpy as np
from tqdm import tqdm
import argparse
# Initialize Redis connection
redis_client = redis.Redis(host='localhost', port=6379, decode_responses=True, encoding='utf-8')
def get_redis_neighbors(query_vector, k):
"""Get nearest neighbors using Redis VSIM command"""
args = ["VSIM", "glove_embeddings_bin", "VALUES", "100"]
args.extend(map(str, query_vector))
args.extend(["COUNT", str(k)])
args.extend(["EF", 100])
if False:
print(args)
exit(1)
results = redis_client.execute_command(*args)
return [int(res) for res in results]
def calculate_recall(ground_truth, predicted, k):
"""Calculate recall@k"""
relevant = set(ground_truth[:k])
retrieved = set(predicted[:k])
return len(relevant.intersection(retrieved)) / len(relevant)
def main():
parser = argparse.ArgumentParser(description='Evaluate Redis VSIM recall')
parser.add_argument('--k', type=int, default=10, help='Number of neighbors to evaluate (default: 10)')
parser.add_argument('--batch', type=int, default=100, help='Progress update frequency (default: 100)')
args = parser.parse_args()
k = args.k
batch_size = args.batch
with h5py.File('glove-100-angular.hdf5', 'r') as f:
test_vectors = f['test'][:]
ground_truth_neighbors = f['neighbors'][:]
num_queries = len(test_vectors)
recalls = []
print(f"Evaluating recall@{k} for {num_queries} test queries...")
for i in tqdm(range(num_queries)):
try:
# Get Redis results
redis_neighbors = get_redis_neighbors(test_vectors[i], k)
# Get ground truth for this query
true_neighbors = ground_truth_neighbors[i]
# Calculate recall
recall = calculate_recall(true_neighbors, redis_neighbors, k)
recalls.append(recall)
if (i + 1) % batch_size == 0:
current_avg_recall = np.mean(recalls)
print(f"Current average recall@{k} after {i+1} queries: {current_avg_recall:.4f}")
except Exception as e:
print(f"Error processing query {i}: {str(e)}")
continue
final_recall = np.mean(recalls)
print("\nFinal Results:")
print(f"Average recall@{k}: {final_recall:.4f}")
print(f"Total queries evaluated: {len(recalls)}")
# Save detailed results
with open(f'recall_evaluation_results_k{k}.txt', 'w') as f:
f.write(f"Average recall@{k}: {final_recall:.4f}\n")
f.write(f"Total queries evaluated: {len(recalls)}\n")
f.write(f"Individual query recalls: {recalls}\n")
if __name__ == "__main__":
main()

View file

@ -0,0 +1,2 @@
mpst_full_data.csv
partition.json

View file

@ -0,0 +1,30 @@
This example maps long-form movie plots to movie titles.
It will create fp32 and binary vectors (the two quantization extremes).
1. Install Ollama, and pull the embedding model "mxbai-embed-large"
2. Download mpst_full_data.csv from https://www.kaggle.com/datasets/cryptexcode/mpst-movie-plot-synopses-with-tags
3. python insert.py
127.0.0.1:6379> VSIM many_movies_mxbai-embed-large_NOQUANT ELE "The Matrix"
1) "The Matrix"
2) "The Matrix Reloaded"
3) "The Matrix Revolutions"
4) "Commando"
5) "Avatar"
6) "Forbidden Planet"
7) "Terminator Salvation"
8) "Mandroid"
9) "The Omega Code"
10) "Coherence"
127.0.0.1:6379> VSIM many_movies_mxbai-embed-large_BIN ELE "The Matrix"
1) "The Matrix"
2) "The Matrix Reloaded"
3) "The Matrix Revolutions"
4) "The Omega Code"
5) "Forbidden Planet"
6) "Avatar"
7) "John Carter"
8) "System Shock 2"
9) "Coherence"
10) "Tomorrowland"

View file

@ -0,0 +1,48 @@
import csv
import requests
import redis
ModelName="mxbai-embed-large"
# Initialize Redis connection, setting encoding to utf-8
redis_client = redis.Redis(host='localhost', port=6379, decode_responses=True, encoding='utf-8')
def get_embedding(text):
"""Get embedding from local API"""
url = "http://localhost:11434/api/embeddings"
payload = {
"model": ModelName,
"prompt": "Represent this movie plot and genre: "+text
}
response = requests.post(url, json=payload)
response.raise_for_status()
return response.json()['embedding']
def add_to_redis(title, embedding, quant_type):
"""Add embedding to Redis using VADD command"""
args = ["VADD", "many_movies_"+ModelName+"_"+quant_type, "VALUES", str(len(embedding))]
args.extend(map(str, embedding))
args.append(title)
args.append(quant_type)
redis_client.execute_command(*args)
def main():
with open('mpst_full_data.csv', 'r', encoding='utf-8') as file:
reader = csv.DictReader(file)
for movie in reader:
try:
text_to_embed = f"{movie['title']} {movie['plot_synopsis']} {movie['tags']}"
print(f"Getting embedding for: {movie['title']}")
embedding = get_embedding(text_to_embed)
add_to_redis(movie['title'], embedding, "BIN")
add_to_redis(movie['title'], embedding, "NOQUANT")
print(f"Successfully processed: {movie['title']}")
except Exception as e:
print(f"Error processing {movie['title']}: {str(e)}")
continue
if __name__ == "__main__":
main()

995
modules/vector-sets/expr.c Normal file
View file

@ -0,0 +1,995 @@
/* Filtering of objects based on simple expressions.
* This powers the FILTER option of Vector Sets, but it is otherwise
* general code to be used when we want to tell if a given object (with fields)
* passes or fails a given test for scalars, strings, ...
*
* Copyright(C) 2024-Present, Redis Ltd. All Rights Reserved.
* Originally authored by: Salvatore Sanfilippo.
*/
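/* Example: given an object whose JSON attributes are
* {"year": 1984, "name": "The Matrix"}, the expression
* ".year > 1980 and .name == 'The Matrix'" evaluates to true. */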
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <math.h>
#include <string.h>
#include "cJSON.h"
#ifdef TEST_MAIN
#define RedisModule_Alloc malloc
#define RedisModule_Realloc realloc
#define RedisModule_Free free
#define RedisModule_Strdup strdup
#endif
#define EXPR_TOKEN_EOF 0
#define EXPR_TOKEN_NUM 1
#define EXPR_TOKEN_STR 2
#define EXPR_TOKEN_TUPLE 3
#define EXPR_TOKEN_SELECTOR 4
#define EXPR_TOKEN_OP 5
#define EXPR_OP_OPAREN 0 /* ( */
#define EXPR_OP_CPAREN 1 /* ) */
#define EXPR_OP_NOT 2 /* ! */
#define EXPR_OP_POW 3 /* ** */
#define EXPR_OP_MULT 4 /* * */
#define EXPR_OP_DIV 5 /* / */
#define EXPR_OP_MOD 6 /* % */
#define EXPR_OP_SUM 7 /* + */
#define EXPR_OP_DIFF 8 /* - */
#define EXPR_OP_GT 9 /* > */
#define EXPR_OP_GTE 10 /* >= */
#define EXPR_OP_LT 11 /* < */
#define EXPR_OP_LTE 12 /* <= */
#define EXPR_OP_EQ 13 /* == */
#define EXPR_OP_NEQ 14 /* != */
#define EXPR_OP_IN 15 /* in */
#define EXPR_OP_AND 16 /* and */
#define EXPR_OP_OR 17 /* or */
/* This structure represents a token in our expression. It is either
* a literal like 4 or "foo", an operator like "+", "-", "and", or
* a JSON selector, which starts with a dot: ".age", ".properties.somearray[1]" */
typedef struct exprtoken {
int refcount; // Reference counting for memory reclaiming.
int token_type; // Token type of the just parsed token.
int offset; // Chars offset in expression.
union {
double num; // Value for EXPR_TOKEN_NUM.
struct {
char *start; // String pointer for EXPR_TOKEN_STR / SELECTOR.
size_t len; // String len for EXPR_TOKEN_STR / SELECTOR.
char *heapstr; // Non-NULL if we have a private allocation for this
// string. When possible, 'start' just points into the
// expression string we compiled, exprstate->expr.
} str;
int opcode; // Opcode ID for EXPR_TOKEN_OP.
struct {
struct exprtoken **ele;
size_t len;
} tuple; // Tuples are like [1, 2, 3] for "in" operator.
};
} exprtoken;
/* Simple stack of expr tokens. This is used both to represent the stack
* of values and the stack of operands during VM execution. */
typedef struct exprstack {
exprtoken **items;
int numitems;
int allocsize;
} exprstack;
typedef struct exprstate {
char *expr; /* Expression string to compile. Note that
* expression token strings point directly to this
* string. */
char *p; // Current position inside 'expr', while parsing.
// Virtual machine state.
exprstack values_stack;
exprstack ops_stack; // Operator stack used during compilation.
exprstack tokens; // Expression processed into a sequence of tokens.
exprstack program; // Expression compiled into opcodes and values.
} exprstate;
/* Valid operators. */
struct {
char *opname;
int oplen;
int opcode;
int precedence;
int arity;
} ExprOptable[] = {
{"(", 1, EXPR_OP_OPAREN, 7, 0},
{")", 1, EXPR_OP_CPAREN, 7, 0},
{"!", 1, EXPR_OP_NOT, 6, 1},
{"not", 3, EXPR_OP_NOT, 6, 1},
{"**", 2, EXPR_OP_POW, 5, 2},
{"*", 1, EXPR_OP_MULT, 4, 2},
{"/", 1, EXPR_OP_DIV, 4, 2},
{"%", 1, EXPR_OP_MOD, 4, 2},
{"+", 1, EXPR_OP_SUM, 3, 2},
{"-", 1, EXPR_OP_DIFF, 3, 2},
{">", 1, EXPR_OP_GT, 2, 2},
{">=", 2, EXPR_OP_GTE, 2, 2},
{"<", 1, EXPR_OP_LT, 2, 2},
{"<=", 2, EXPR_OP_LTE, 2, 2},
{"==", 2, EXPR_OP_EQ, 2, 2},
{"!=", 2, EXPR_OP_NEQ, 2, 2},
{"in", 2, EXPR_OP_IN, 2, 2},
{"and", 3, EXPR_OP_AND, 1, 2},
{"&&", 2, EXPR_OP_AND, 1, 2},
{"or", 2, EXPR_OP_OR, 0, 2},
{"||", 2, EXPR_OP_OR, 0, 2},
{NULL, 0, 0, 0, 0} // Terminator.
};
#define EXPR_OP_SPECIALCHARS "+-*%/!()<>=|&"
#define EXPR_SELECTOR_SPECIALCHARS "_-"
/* ================================ Expr token ============================== */
/* Return a heap-allocated token of the specified type, setting the
* reference count to 1. */
exprtoken *exprNewToken(int type) {
exprtoken *t = RedisModule_Alloc(sizeof(exprtoken));
memset(t,0,sizeof(*t));
t->token_type = type;
t->refcount = 1;
return t;
}
/* Release a reference to the token. When the reference count drops to
* zero the token is freed, together with any private allocation it
* owns (heap strings, tuple elements). */
void exprTokenRelease(exprtoken *t) {
if (t == NULL) return;
if (t->refcount <= 0) {
printf("exprTokenRelease() against a token with refcount %d!\n"
"Aborting program execution\n",
t->refcount);
exit(1);
}
t->refcount--;
if (t->refcount > 0) return;
// We reached refcount 0: free the object.
if (t->token_type == EXPR_TOKEN_STR) {
if (t->str.heapstr != NULL) RedisModule_Free(t->str.heapstr);
} else if (t->token_type == EXPR_TOKEN_TUPLE) {
for (size_t j = 0; j < t->tuple.len; j++)
exprTokenRelease(t->tuple.ele[j]);
if (t->tuple.ele) RedisModule_Free(t->tuple.ele);
}
RedisModule_Free(t);
}
void exprTokenRetain(exprtoken *t) {
t->refcount++;
}
/* ============================== Stack handling ============================ */
#define EXPR_STACK_INITIAL_SIZE 16
/* Initialize a new expression stack. */
void exprStackInit(exprstack *stack) {
stack->items = RedisModule_Alloc(sizeof(exprtoken*) * EXPR_STACK_INITIAL_SIZE);
stack->numitems = 0;
stack->allocsize = EXPR_STACK_INITIAL_SIZE;
}
/* Push a token pointer onto the stack. Does not increment the refcount
* of the token: it is up to the caller to do so. */
void exprStackPush(exprstack *stack, exprtoken *token) {
/* Check if we need to grow the stack. */
if (stack->numitems == stack->allocsize) {
size_t newsize = stack->allocsize * 2;
exprtoken **newitems =
RedisModule_Realloc(stack->items, sizeof(exprtoken*) * newsize);
stack->items = newitems;
stack->allocsize = newsize;
}
stack->items[stack->numitems] = token;
stack->numitems++;
}
/* Pop a token pointer from the stack. Return NULL if the stack is
* empty. Does NOT decrement the refcount of the token: it's up to the
* caller to do so, as the new owner of the reference. */
exprtoken *exprStackPop(exprstack *stack) {
if (stack->numitems == 0) return NULL;
stack->numitems--;
return stack->items[stack->numitems];
}
/* Just return the last element pushed, without consuming it nor altering
* the reference count. */
exprtoken *exprStackPeek(exprstack *stack) {
if (stack->numitems == 0) return NULL;
return stack->items[stack->numitems-1];
}
/* Free the stack structure state, including the items it contains, that are
* assumed to be heap allocated. The passed pointer itself is not freed. */
void exprStackFree(exprstack *stack) {
for (int j = 0; j < stack->numitems; j++)
exprTokenRelease(stack->items[j]);
RedisModule_Free(stack->items);
}
/* Just reset the stack removing all the items, but leaving it in a state
* that makes it still usable for new elements. */
void exprStackReset(exprstack *stack) {
for (int j = 0; j < stack->numitems; j++)
exprTokenRelease(stack->items[j]);
stack->numitems = 0;
}
/* =========================== Expression compilation ======================= */
void exprConsumeSpaces(exprstate *es) {
while(es->p[0] && isspace(es->p[0])) es->p++;
}
/* Parse an operator, looking for the longest match in the
* operators table. */
exprtoken *exprParseOperator(exprstate *es) {
exprtoken *t = exprNewToken(EXPR_TOKEN_OP);
char *start = es->p;
while(es->p[0] &&
(isalpha(es->p[0]) ||
strchr(EXPR_OP_SPECIALCHARS,es->p[0]) != NULL))
{
es->p++;
}
int matchlen = es->p - start;
int bestlen = 0;
int j;
// Find the longest matching operator.
for (j = 0; ExprOptable[j].opname != NULL; j++) {
if (ExprOptable[j].oplen > matchlen) continue;
if (memcmp(ExprOptable[j].opname, start, ExprOptable[j].oplen) != 0)
{
continue;
}
if (ExprOptable[j].oplen > bestlen) {
t->opcode = ExprOptable[j].opcode;
bestlen = ExprOptable[j].oplen;
}
}
if (bestlen == 0) {
exprTokenRelease(t);
return NULL;
} else {
es->p = start + bestlen;
}
return t;
}
// Valid selector charset.
static int is_selector_char(int c) {
return (isalpha(c) ||
isdigit(c) ||
strchr(EXPR_SELECTOR_SPECIALCHARS,c) != NULL);
}
/* Parse selectors: they start with a dot and can contain alphanumeric
* chars plus a few special chars. */
exprtoken *exprParseSelector(exprstate *es) {
exprtoken *t = exprNewToken(EXPR_TOKEN_SELECTOR);
es->p++; // Skip dot.
char *start = es->p;
while(es->p[0] && is_selector_char(es->p[0])) es->p++;
int matchlen = es->p - start;
t->str.start = start;
t->str.len = matchlen;
return t;
}
exprtoken *exprParseNumber(exprstate *es) {
exprtoken *t = exprNewToken(EXPR_TOKEN_NUM);
char num[64];
int idx = 0;
/* Note: a sign is accepted in the middle of a number only right
* after the exponent char, so that forms like 1e-5 parse correctly. */
while(isdigit(es->p[0]) || es->p[0] == '.' || es->p[0] == 'e' ||
es->p[0] == 'E' || (idx == 0 && es->p[0] == '-') ||
((es->p[0] == '-' || es->p[0] == '+') && idx > 0 &&
(num[idx-1] == 'e' || num[idx-1] == 'E')))
{
if (idx >= (int)sizeof(num)-1) {
exprTokenRelease(t);
return NULL;
}
num[idx++] = es->p[0];
es->p++;
}
num[idx] = 0;
char *endptr;
t->num = strtod(num, &endptr);
if (*endptr != '\0') {
exprTokenRelease(t);
return NULL;
}
return t;
}
exprtoken *exprParseString(exprstate *es) {
char quote = es->p[0]; /* Store the quote type (' or "). */
es->p++; /* Skip opening quote. */
exprtoken *t = exprNewToken(EXPR_TOKEN_STR);
t->str.start = es->p;
while(es->p[0] != '\0') {
if (es->p[0] == '\\' && es->p[1] != '\0') {
es->p += 2; // Skip escaped char.
continue;
}
if (es->p[0] == quote) {
t->str.len = es->p - t->str.start;
es->p++; // Skip closing quote.
return t;
}
es->p++;
}
/* If we reach here, string was not terminated. */
exprTokenRelease(t);
return NULL;
}
/* Parse a tuple of the form [1, "foo", 42]. No nested tuples are
* supported. This type is useful mostly to be used with the "IN"
* operator. */
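/* For example, in the expression:
* .year in [1997, 2001, "unknown"]
* the bracketed list is parsed by this function into a tuple token,
* later used as the right operand of the "in" operator. */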
exprtoken *exprParseTuple(exprstate *es) {
exprtoken *t = exprNewToken(EXPR_TOKEN_TUPLE);
t->tuple.ele = NULL;
t->tuple.len = 0;
es->p++; /* Skip opening '['. */
size_t allocated = 0;
while(1) {
exprConsumeSpaces(es);
/* Check for empty tuple or end. */
if (es->p[0] == ']') {
es->p++;
break;
}
/* Grow tuple array if needed. */
if (t->tuple.len == allocated) {
size_t newsize = allocated == 0 ? 4 : allocated * 2;
exprtoken **newele = RedisModule_Realloc(t->tuple.ele,
sizeof(exprtoken*) * newsize);
t->tuple.ele = newele;
allocated = newsize;
}
/* Parse tuple element. */
exprtoken *ele = NULL;
if (isdigit(es->p[0]) || es->p[0] == '-') {
ele = exprParseNumber(es);
} else if (es->p[0] == '"' || es->p[0] == '\'') {
ele = exprParseString(es);
} else {
exprTokenRelease(t);
return NULL;
}
/* Error parsing number/string? */
if (ele == NULL) {
exprTokenRelease(t);
return NULL;
}
/* Store element if no error was detected. */
t->tuple.ele[t->tuple.len] = ele;
t->tuple.len++;
/* Check for next element. */
exprConsumeSpaces(es);
if (es->p[0] == ']') {
es->p++;
break;
}
if (es->p[0] != ',') {
exprTokenRelease(t);
return NULL;
}
es->p++; /* Skip comma. */
}
return t;
}
/* Deallocate the object returned by exprCompile(). */
void exprFree(exprstate *es) {
if (es == NULL) return;
/* Free the original expression string. */
if (es->expr) RedisModule_Free(es->expr);
/* Free all stacks. */
exprStackFree(&es->values_stack);
exprStackFree(&es->ops_stack);
exprStackFree(&es->tokens);
exprStackFree(&es->program);
/* Free the state object itself. */
RedisModule_Free(es);
}
/* Split the provided expression into a stack of tokens. Returns
* 0 on success, 1 on error. */
int exprTokenize(exprstate *es, int *errpos) {
/* Main parsing loop. */
while(1) {
exprConsumeSpaces(es);
/* Set a flag to decide whether a '-' should be considered part of
* a number, or an operator. */
int minus_is_number = 0; // By default it is an operator.
exprtoken *last = exprStackPeek(&es->tokens);
if (last == NULL) {
/* If we are at the start of an expression, the minus is
* considered part of a number. */
minus_is_number = 1;
} else if (last->token_type == EXPR_TOKEN_OP &&
last->opcode != EXPR_OP_CPAREN)
{
/* Also, if the previous token was an operator, the minus
* is considered part of a number, unless the previous operator
* is a closing paren: in a case like (...) - 5 we want to
* emit an operator instead. */
minus_is_number = 1;
}
/* Parse based on the current character. Remember where the token
* starts, for error reporting purposes. */
char *token_start = es->p;
exprtoken *current = NULL;
if (*es->p == '\0') {
current = exprNewToken(EXPR_TOKEN_EOF);
} else if (isdigit(*es->p) ||
(minus_is_number && *es->p == '-' && isdigit(es->p[1])))
{
current = exprParseNumber(es);
} else if (*es->p == '"' || *es->p == '\'') {
current = exprParseString(es);
} else if (*es->p == '.' && is_selector_char(es->p[1])) {
current = exprParseSelector(es);
} else if (isalpha(*es->p) || strchr(EXPR_OP_SPECIALCHARS, *es->p)) {
current = exprParseOperator(es);
} else if (*es->p == '[') {
current = exprParseTuple(es);
}
if (current == NULL) {
if (errpos) *errpos = es->p - es->expr;
return 1; // Syntax Error.
}
/* Store the token offset inside the expression for error
* reporting, then push the token to the tokens stack. */
current->offset = token_start - es->expr;
exprStackPush(&es->tokens, current);
if (current->token_type == EXPR_TOKEN_EOF) break;
}
return 0;
}
/* Helper function to get operator precedence from the operator table. */
int exprGetOpPrecedence(int opcode) {
for (int i = 0; ExprOptable[i].opname != NULL; i++) {
if (ExprOptable[i].opcode == opcode)
return ExprOptable[i].precedence;
}
return -1;
}
/* Helper function to get operator arity from the operator table. */
int exprGetOpArity(int opcode) {
for (int i = 0; ExprOptable[i].opname != NULL; i++) {
if (ExprOptable[i].opcode == opcode)
return ExprOptable[i].arity;
}
return -1;
}
/* Process an operator during compilation. Returns 0 on success, 1 on error.
* This function will retain a reference of the operator 'op' in case it
* is pushed on the operators stack. */
int exprProcessOperator(exprstate *es, exprtoken *op, int *stack_items, int *errpos) {
if (op->opcode == EXPR_OP_OPAREN) {
// This is just a marker for us: push it on the ops stack.
exprStackPush(&es->ops_stack, op);
exprTokenRetain(op);
return 0;
}
if (op->opcode == EXPR_OP_CPAREN) {
/* Process operators until we find the matching opening parenthesis. */
while (1) {
exprtoken *top_op = exprStackPop(&es->ops_stack);
if (top_op == NULL) {
if (errpos) *errpos = op->offset;
return 1;
}
if (top_op->opcode == EXPR_OP_OPAREN) {
/* Open parenthesis found. Our work is finished. */
exprTokenRelease(top_op);
return 0;
}
int arity = exprGetOpArity(top_op->opcode);
if (*stack_items < arity) {
exprTokenRelease(top_op);
if (errpos) *errpos = top_op->offset;
return 1;
}
/* Move the operator on the program stack. */
exprStackPush(&es->program, top_op);
*stack_items = *stack_items - arity + 1;
}
}
int curr_prec = exprGetOpPrecedence(op->opcode);
/* Process operators with higher or equal precedence. */
while (1) {
exprtoken *top_op = exprStackPeek(&es->ops_stack);
if (top_op == NULL || top_op->opcode == EXPR_OP_OPAREN) break;
int top_prec = exprGetOpPrecedence(top_op->opcode);
if (top_prec < curr_prec) break;
/* Special case for **: only pop if precedence is strictly higher
* so that the operator is right associative, that is:
* 2 ** 3 ** 2 is evaluated as 2 ** (3 ** 2) == 512 instead
* of (2 ** 3) ** 2 == 64. */
if (op->opcode == EXPR_OP_POW && top_prec <= curr_prec) break;
/* Pop and add to program. */
top_op = exprStackPop(&es->ops_stack);
int arity = exprGetOpArity(top_op->opcode);
if (*stack_items < arity) {
exprTokenRelease(top_op);
if (errpos) *errpos = top_op->offset;
return 1;
}
/* Move to the program stack. */
exprStackPush(&es->program, top_op);
*stack_items = *stack_items - arity + 1;
}
/* Push current operator. */
exprStackPush(&es->ops_stack, op);
exprTokenRetain(op);
return 0;
}
/* Compile the expression into a sequence of push-value and exec-operator
* operations that exprRun() can execute. The function returns an exprstate
* object that can be used for execution of the program. On error, NULL
* is returned, and optionally the position of the error inside the
* expression is returned by reference. */
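/* For instance, given the precedence rules in ExprOptable, the
* expression "(5+2)*3" compiles into the postfix (RPN) program:
*
* NUM:5 NUM:2 OP:+ NUM:3 OP:*
*
* that the stack machine in exprRun() evaluates to 21. */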
exprstate *exprCompile(char *expr, int *errpos) {
/* Initialize expression state. */
exprstate *es = RedisModule_Alloc(sizeof(exprstate));
es->expr = RedisModule_Strdup(expr);
es->p = es->expr;
/* Initialize all stacks. */
exprStackInit(&es->values_stack);
exprStackInit(&es->ops_stack);
exprStackInit(&es->tokens);
exprStackInit(&es->program);
/* Tokenization. */
if (exprTokenize(es, errpos)) {
exprFree(es);
return NULL;
}
/* Compile the expression into a sequence of operations. */
int stack_items = 0; // Track # of items that would be on the stack
// during execution. This way we can detect arity
// issues at compile time.
/* Process each token. */
for (int i = 0; i < es->tokens.numitems; i++) {
exprtoken *token = es->tokens.items[i];
if (token->token_type == EXPR_TOKEN_EOF) break;
/* Handle values (numbers, strings, selectors). */
if (token->token_type == EXPR_TOKEN_NUM ||
token->token_type == EXPR_TOKEN_STR ||
token->token_type == EXPR_TOKEN_TUPLE ||
token->token_type == EXPR_TOKEN_SELECTOR)
{
exprStackPush(&es->program, token);
exprTokenRetain(token);
stack_items++;
continue;
}
/* Handle operators. */
if (token->token_type == EXPR_TOKEN_OP) {
if (exprProcessOperator(es, token, &stack_items, errpos)) {
exprFree(es);
return NULL;
}
continue;
}
}
/* Process remaining operators on the stack. */
while (es->ops_stack.numitems > 0) {
exprtoken *op = exprStackPop(&es->ops_stack);
if (op->opcode == EXPR_OP_OPAREN) {
if (errpos) *errpos = op->offset;
exprTokenRelease(op);
exprFree(es);
return NULL;
}
int arity = exprGetOpArity(op->opcode);
if (stack_items < arity) {
if (errpos) *errpos = op->offset;
exprTokenRelease(op);
exprFree(es);
return NULL;
}
exprStackPush(&es->program, op);
stack_items = stack_items - arity + 1;
}
/* Verify that exactly one value would remain on the stack after
* execution. We could also check that such value is a number, but this
* would make the code more complex without much gain. */
if (stack_items != 1) {
if (errpos) {
/* Point to the last token's offset for error reporting. */
exprtoken *last = es->tokens.items[es->tokens.numitems - 1];
*errpos = last->offset;
}
exprFree(es);
return NULL;
}
return es;
}
/* ============================ Expression execution ======================== */
/* Convert a token to its numeric value. For strings we attempt to parse them
* as numbers, returning 0 if conversion fails. */
double exprTokenToNum(exprtoken *t) {
char buf[128];
if (t->token_type == EXPR_TOKEN_NUM) {
return t->num;
} else if (t->token_type == EXPR_TOKEN_STR && t->str.len < sizeof(buf)) {
memcpy(buf, t->str.start, t->str.len);
buf[t->str.len] = '\0';
char *endptr;
double val = strtod(buf, &endptr);
return *endptr == '\0' ? val : 0;
} else {
return 0;
}
}
/* Convert an object to true/false (0 or 1). */
double exprTokenToBool(exprtoken *t) {
if (t->token_type == EXPR_TOKEN_NUM) {
return t->num != 0;
} else if (t->token_type == EXPR_TOKEN_STR && t->str.len == 0) {
return 0; // Empty strings are false, like in JavaScript.
} else {
return 1; // Every non numerical type is true.
}
}
/* Compare two tokens. Returns true if they are equal. */
int exprTokensEqual(exprtoken *a, exprtoken *b) {
// If both are strings, do string comparison.
if (a->token_type == EXPR_TOKEN_STR && b->token_type == EXPR_TOKEN_STR) {
return a->str.len == b->str.len &&
memcmp(a->str.start, b->str.start, a->str.len) == 0;
}
// If both are numbers, do numeric comparison.
if (a->token_type == EXPR_TOKEN_NUM && b->token_type == EXPR_TOKEN_NUM) {
return a->num == b->num;
}
// Mixed types - convert to numbers and compare.
return exprTokenToNum(a) == exprTokenToNum(b);
}
/* Convert a json object to an expression token. There is only
* limited support for JSON arrays: they must be composed of
* just numbers and strings. Returns NULL if the JSON object
* cannot be converted. */
exprtoken *exprJsonToToken(cJSON *js) {
if (cJSON_IsNumber(js)) {
exprtoken *obj = exprNewToken(EXPR_TOKEN_NUM);
obj->num = cJSON_GetNumberValue(js);
return obj;
} else if (cJSON_IsString(js)) {
exprtoken *obj = exprNewToken(EXPR_TOKEN_STR);
char *strval = cJSON_GetStringValue(js);
obj->str.heapstr = RedisModule_Strdup(strval);
obj->str.start = obj->str.heapstr;
obj->str.len = strlen(obj->str.heapstr);
return obj;
} else if (cJSON_IsBool(js)) {
exprtoken *obj = exprNewToken(EXPR_TOKEN_NUM);
obj->num = cJSON_IsTrue(js);
return obj;
} else if (cJSON_IsArray(js)) {
// First, scan the array to ensure it only
// contains strings and numbers. Otherwise the
// expression will evaluate to false.
int array_size = cJSON_GetArraySize(js);
for (int j = 0; j < array_size; j++) {
cJSON *item = cJSON_GetArrayItem(js, j);
if (!cJSON_IsNumber(item) && !cJSON_IsString(item)) return NULL;
}
// Create a tuple token for the array.
exprtoken *obj = exprNewToken(EXPR_TOKEN_TUPLE);
obj->tuple.len = array_size;
obj->tuple.ele = NULL;
if (obj->tuple.len == 0) return obj; // No elements, already ok.
obj->tuple.ele =
RedisModule_Alloc(sizeof(exprtoken*) * obj->tuple.len);
// Convert each array element to a token.
for (size_t j = 0; j < obj->tuple.len; j++) {
cJSON *item = cJSON_GetArrayItem(js, j);
if (cJSON_IsNumber(item)) {
exprtoken *eleToken = exprNewToken(EXPR_TOKEN_NUM);
eleToken->num = cJSON_GetNumberValue(item);
obj->tuple.ele[j] = eleToken;
} else if (cJSON_IsString(item)) {
exprtoken *eleToken = exprNewToken(EXPR_TOKEN_STR);
char *strval = cJSON_GetStringValue(item);
eleToken->str.heapstr = RedisModule_Strdup(strval);
eleToken->str.start = eleToken->str.heapstr;
eleToken->str.len = strlen(eleToken->str.heapstr);
obj->tuple.ele[j] = eleToken;
}
}
return obj;
}
return NULL; // No conversion possible for this type.
}
/* Execute the compiled expression program. Returns 1 if the final stack
* value evaluates to true, 0 otherwise. Also returns 0 if any selector
* cannot be resolved against the provided JSON attributes. */
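/* For example, with json set to "{\"year\": 1984}", running the
* compiled expression ".year > 1980" returns 1. */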
int exprRun(exprstate *es, char *json, size_t json_len) {
exprStackReset(&es->values_stack);
cJSON *parsed_json = NULL;
// Execute each instruction in the program.
for (int i = 0; i < es->program.numitems; i++) {
exprtoken *t = es->program.items[i];
// Handle selectors by calling the callback.
if (t->token_type == EXPR_TOKEN_SELECTOR) {
if (json != NULL) {
cJSON *attrib = NULL;
if (parsed_json == NULL) {
parsed_json = cJSON_ParseWithLength(json,json_len);
// Will be left as NULL if the above fails.
}
if (parsed_json) {
char item_name[128];
if (t->str.len > 0 && t->str.len < sizeof(item_name)) {
memcpy(item_name,t->str.start,t->str.len);
item_name[t->str.len] = 0;
attrib = cJSON_GetObjectItem(parsed_json,item_name);
}
/* Fill the token according to the JSON type stored
* at the attribute. */
if (attrib) {
exprtoken *obj = exprJsonToToken(attrib);
if (obj) {
exprStackPush(&es->values_stack, obj);
continue;
}
}
}
}
// Selector not found or JSON object not convertible to
// expression tokens. Evaluate the expression to false.
if (parsed_json) cJSON_Delete(parsed_json);
return 0;
}
// Push non-operator values directly onto the stack.
if (t->token_type != EXPR_TOKEN_OP) {
exprStackPush(&es->values_stack, t);
exprTokenRetain(t);
continue;
}
// Handle operators.
exprtoken *result = exprNewToken(EXPR_TOKEN_NUM);
// Pop operands - we know we have enough from compile-time checks.
exprtoken *b = exprStackPop(&es->values_stack);
exprtoken *a = NULL;
if (exprGetOpArity(t->opcode) == 2) {
a = exprStackPop(&es->values_stack);
}
switch(t->opcode) {
case EXPR_OP_NOT:
result->num = exprTokenToBool(b) == 0 ? 1 : 0;
break;
case EXPR_OP_POW: {
double base = exprTokenToNum(a);
double exp = exprTokenToNum(b);
result->num = pow(base, exp);
break;
}
case EXPR_OP_MULT:
result->num = exprTokenToNum(a) * exprTokenToNum(b);
break;
case EXPR_OP_DIV:
result->num = exprTokenToNum(a) / exprTokenToNum(b);
break;
case EXPR_OP_MOD: {
double va = exprTokenToNum(a);
double vb = exprTokenToNum(b);
result->num = fmod(va, vb);
break;
}
case EXPR_OP_SUM:
result->num = exprTokenToNum(a) + exprTokenToNum(b);
break;
case EXPR_OP_DIFF:
result->num = exprTokenToNum(a) - exprTokenToNum(b);
break;
case EXPR_OP_GT:
result->num = exprTokenToNum(a) > exprTokenToNum(b) ? 1 : 0;
break;
case EXPR_OP_GTE:
result->num = exprTokenToNum(a) >= exprTokenToNum(b) ? 1 : 0;
break;
case EXPR_OP_LT:
result->num = exprTokenToNum(a) < exprTokenToNum(b) ? 1 : 0;
break;
case EXPR_OP_LTE:
result->num = exprTokenToNum(a) <= exprTokenToNum(b) ? 1 : 0;
break;
case EXPR_OP_EQ:
result->num = exprTokensEqual(a, b) ? 1 : 0;
break;
case EXPR_OP_NEQ:
result->num = !exprTokensEqual(a, b) ? 1 : 0;
break;
case EXPR_OP_IN: {
// For 'in' operator, b must be a tuple.
result->num = 0; // Default to false.
if (b->token_type == EXPR_TOKEN_TUPLE) {
for (size_t j = 0; j < b->tuple.len; j++) {
if (exprTokensEqual(a, b->tuple.ele[j])) {
result->num = 1; // Found a match.
break;
}
}
}
break;
}
case EXPR_OP_AND:
result->num =
exprTokenToBool(a) != 0 && exprTokenToBool(b) != 0 ? 1 : 0;
break;
case EXPR_OP_OR:
result->num =
exprTokenToBool(a) != 0 || exprTokenToBool(b) != 0 ? 1 : 0;
break;
default:
// Do nothing: we don't want runtime errors.
break;
}
// Free operands and push result.
if (a) exprTokenRelease(a);
exprTokenRelease(b);
exprStackPush(&es->values_stack, result);
}
if (parsed_json) cJSON_Delete(parsed_json);
// Get final result from stack.
exprtoken *final = exprStackPop(&es->values_stack);
if (final == NULL) return 0;
// Convert result to boolean.
int retval = exprTokenToBool(final);
exprTokenRelease(final);
return retval;
}
/* ============================ Simple test main ============================ */
#ifdef TEST_MAIN
void exprPrintToken(exprtoken *t) {
switch(t->token_type) {
case EXPR_TOKEN_EOF:
printf("EOF");
break;
case EXPR_TOKEN_NUM:
printf("NUM:%g", t->num);
break;
case EXPR_TOKEN_STR:
printf("STR:\"%.*s\"", (int)t->str.len, t->str.start);
break;
case EXPR_TOKEN_SELECTOR:
printf("SEL:%.*s", (int)t->str.len, t->str.start);
break;
case EXPR_TOKEN_OP:
printf("OP:");
for (int i = 0; ExprOptable[i].opname != NULL; i++) {
if (ExprOptable[i].opcode == t->opcode) {
printf("%s", ExprOptable[i].opname);
break;
}
}
break;
default:
printf("UNKNOWN");
break;
}
}
void exprPrintStack(exprstack *stack, const char *name) {
printf("%s (%d items):", name, stack->numitems);
for (int j = 0; j < stack->numitems; j++) {
printf(" ");
exprPrintToken(stack->items[j]);
}
printf("\n");
}
int main(int argc, char **argv) {
char *testexpr = "(5+2)*3 and .year > 1980 and 'foo' == 'foo'";
char *testjson = "{\"year\": 1984, \"name\": \"The Matrix\"}";
if (argc >= 2) testexpr = argv[1];
if (argc >= 3) testjson = argv[2];
printf("Compiling expression: %s\n", testexpr);
int errpos = 0;
exprstate *es = exprCompile(testexpr,&errpos);
if (es == NULL) {
printf("Compilation failed near \"...%s\"\n", testexpr+errpos);
return 1;
}
exprPrintStack(&es->tokens, "Tokens");
exprPrintStack(&es->program, "Program");
printf("Running against object: %s\n", testjson);
int result = exprRun(es,testjson,strlen(testjson));
printf("Result1: %s\n", result ? "True" : "False");
result = exprRun(es,testjson,strlen(testjson));
printf("Result2: %s\n", result ? "True" : "False");
exprFree(es);
return 0;
}
#endif

2718
modules/vector-sets/hnsw.c Normal file

File diff suppressed because it is too large

183
modules/vector-sets/hnsw.h Normal file
View file

@ -0,0 +1,183 @@
/*
* HNSW (Hierarchical Navigable Small World) Implementation
* Based on the paper by Yu. A. Malkov, D. A. Yashunin
*
* Copyright(C) 2024-Present Redis Ltd. All Rights Reserved.
*/
#ifndef HNSW_H
#define HNSW_H
#include <pthread.h>
#include <stdatomic.h>
#define HNSW_DEFAULT_M 16 /* Used when 0 is given at creation time. */
#define HNSW_MIN_M 4 /* Probably even too low already. */
#define HNSW_MAX_M 4096 /* Safeguard sanity limit. */
#define HNSW_MAX_THREADS 32 /* Maximum number of concurrent threads */
/* Quantization types you can enable at creation time in hnsw_new() */
#define HNSW_QUANT_NONE 0 // No quantization.
#define HNSW_QUANT_Q8 1 // Q8 quantization.
#define HNSW_QUANT_BIN 2 // Binary quantization.
/* Layer structure for HNSW nodes. Each node will have from one to a few
* of this depending on its level. */
typedef struct {
struct hnswNode **links; /* Array of neighbors for this layer */
uint32_t num_links; /* Number of used links */
uint32_t max_links; /* Maximum links for this layer. We may
* reallocate the node in very particular
* conditions in order to allow linking of
* new inserted nodes, so this may change
* dynamically and be > M*2 for a small set of
* nodes. */
float worst_distance; /* Distance to the worst neighbor */
uint32_t worst_idx; /* Index of the worst neighbor */
} hnswNodeLayer;
/* Node structure for HNSW graph */
typedef struct hnswNode {
uint32_t level; /* Node's maximum level */
uint64_t id; /* Unique identifier, may be useful in order to
* have a bitmap of visited nodes to use as
* alternative to epoch / visited_epoch.
* Also used in serialization in order to retain
* links specifying IDs. */
void *vector; /* The vector, quantized or not. */
float quants_range; /* Quantization range for this vector:
* min/max values will be in the range
* -quants_range, +quants_range */
float l2; /* L2 before normalization. */
/* Last time (epoch) this node was visited. We need one per thread.
* This avoids having a different data structure where we track
* visited nodes, but costs memory per node. */
uint64_t visited_epoch[HNSW_MAX_THREADS];
void *value; /* Associated value */
struct hnswNode *prev, *next; /* Prev/Next node in the list starting at
* HNSW->head. */
/* Links (and links info) per each layer. Note that this is part
* of the node allocation to be more cache friendly: reliable 3% speedup
* on Apple silicon, and does not make anything more complex. */
hnswNodeLayer layers[];
} hnswNode;
struct HNSW;
/* It is possible to navigate an HNSW with a cursor that guarantees
* visiting all the elements that remain in the HNSW from the start to the
* end of the process (but not the new ones, so that the process will
* eventually finish). Check hnsw_cursor_init(), hnsw_cursor_next() and
* hnsw_cursor_free(). */
typedef struct hnswCursor {
struct HNSW *index; // Reference to the index of this cursor.
hnswNode *current; // Element to report when hnsw_cursor_next() is called.
struct hnswCursor *next; // Next cursor active.
} hnswCursor;
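/* Typical cursor usage (a sketch; depending on concurrent writers,
* hnsw_cursor_acquire_lock() / hnsw_cursor_release_lock() may be
* needed around access to the returned nodes):
*
* hnswCursor *c = hnsw_cursor_init(index);
* hnswNode *n;
* while ((n = hnsw_cursor_next(c)) != NULL) {
* // ... use n ...
* }
* hnsw_cursor_free(c);
*/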
/* Main HNSW index structure */
typedef struct HNSW {
hnswNode *enter_point; /* Entry point for the graph */
uint32_t M; /* M as in the paper: layer 0 has M*2 max
neighbors (M populated at insertion time)
while all the other layers have M neighbors. */
uint32_t max_level; /* Current maximum level in the graph */
uint32_t vector_dim; /* Dimensionality of stored vectors */
uint64_t node_count; /* Total number of nodes */
_Atomic uint64_t last_id; /* Last node ID used */
uint64_t current_epoch[HNSW_MAX_THREADS]; /* Current epoch for visit tracking */
hnswNode *head; /* Linked list of nodes. Last first */
/* We have two locks here:
* 1. A global_lock that is used to perform write operations blocking all
* the readers.
* 2. One mutex per epoch slot, in order for read operations to acquire
* a lock on a specific slot to use epochs tracking of visited nodes. */
pthread_rwlock_t global_lock; /* Global read-write lock */
pthread_mutex_t slot_locks[HNSW_MAX_THREADS]; /* Per-slot locks */
_Atomic uint32_t next_slot; /* Next thread slot to try */
_Atomic uint64_t version; /* Version for optimistic concurrency, this is
* incremented on deletions and entry point
* updates. */
uint32_t quant_type; /* Quantization used. HNSW_QUANT_... */
hnswCursor *cursors;
} HNSW;
/* Serialized node. This structure is used as return value of
* hnsw_serialize_node(). */
typedef struct hnswSerNode {
void *vector;
uint32_t vector_size;
uint64_t *params;
uint32_t params_count;
} hnswSerNode;
/* Insert preparation context */
typedef struct InsertContext InsertContext;
/* Core HNSW functions */
HNSW *hnsw_new(uint32_t vector_dim, uint32_t quant_type, uint32_t m);
void hnsw_free(HNSW *index,void(*free_value)(void*value));
void hnsw_node_free(hnswNode *node);
void hnsw_print_stats(HNSW *index);
hnswNode *hnsw_insert(HNSW *index, const float *vector, const int8_t *qvector,
float qrange, uint64_t id, void *value, int ef);
int hnsw_search(HNSW *index, const float *query, uint32_t k,
hnswNode **neighbors, float *distances, uint32_t slot,
int query_vector_is_normalized);
int hnsw_search_with_filter
(HNSW *index, const float *query_vector, uint32_t k,
hnswNode **neighbors, float *distances, uint32_t slot,
int query_vector_is_normalized,
int (*filter_callback)(void *value, void *privdata),
void *filter_privdata, uint32_t max_candidates);
void hnsw_get_node_vector(HNSW *index, hnswNode *node, float *vec);
int hnsw_delete_node(HNSW *index, hnswNode *node, void(*free_value)(void*value));
hnswNode *hnsw_random_node(HNSW *index, int slot);
/* Thread safety functions. */
int hnsw_acquire_read_slot(HNSW *index);
void hnsw_release_read_slot(HNSW *index, int slot);
/* Optimistic insertion API. */
InsertContext *hnsw_prepare_insert(HNSW *index, const float *vector, const int8_t *qvector, float qrange, uint64_t id, int ef);
hnswNode *hnsw_try_commit_insert(HNSW *index, InsertContext *ctx, void *value);
void hnsw_free_insert_context(InsertContext *ctx);
/* Serialization. */
hnswSerNode *hnsw_serialize_node(HNSW *index, hnswNode *node);
void hnsw_free_serialized_node(hnswSerNode *sn);
hnswNode *hnsw_insert_serialized(HNSW *index, void *vector, uint64_t *params, uint32_t params_len, void *value);
int hnsw_deserialize_index(HNSW *index);
// Helper function in case the user wants to directly copy
// the vector bytes.
uint32_t hnsw_quants_bytes(HNSW *index);
/* Cursors. */
hnswCursor *hnsw_cursor_init(HNSW *index);
void hnsw_cursor_free(hnswCursor *cursor);
hnswNode *hnsw_cursor_next(hnswCursor *cursor);
int hnsw_cursor_acquire_lock(hnswCursor *cursor);
void hnsw_cursor_release_lock(hnswCursor *cursor);
/* Allocator selection. */
void hnsw_set_allocator(void (*free_ptr)(void*), void *(*malloc_ptr)(size_t),
void *(*realloc_ptr)(void*, size_t));
/* Testing. */
int hnsw_validate_graph(HNSW *index, uint64_t *connected_nodes, int *reciprocal_links);
void hnsw_test_graph_recall(HNSW *index, int test_ef, int verbose);
float hnsw_distance(HNSW *index, hnswNode *a, hnswNode *b);
int hnsw_ground_truth_with_filter
(HNSW *index, const float *query_vector, uint32_t k,
hnswNode **neighbors, float *distances, uint32_t slot,
int query_vector_is_normalized,
int (*filter_callback)(void *value, void *privdata),
void *filter_privdata);
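/* Usage sketch (illustrative only; error checking omitted, 'my_value'
* and 'my_value_free' are placeholder application hooks, and the id
* passed to hnsw_insert() is arbitrary here):
*
* HNSW *idx = hnsw_new(4, HNSW_QUANT_NONE, 0); // 0 selects default M.
* float v[4] = {0.1f, 0.2f, 0.3f, 0.4f};
* hnsw_insert(idx, v, NULL, 0, 1, my_value, 200); // EF = 200.
* int slot = hnsw_acquire_read_slot(idx);
* hnswNode *neighbors[10];
* float distances[10];
* int found = hnsw_search(idx, v, 10, neighbors, distances, slot, 0);
* hnsw_release_read_slot(idx, slot);
* hnsw_free(idx, my_value_free);
*/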
#endif /* HNSW_H */

File diff suppressed because it is too large

225
modules/vector-sets/test.py Executable file
View file

@ -0,0 +1,225 @@
#!/usr/bin/env python3
#
# Vector set tests.
# A Redis instance should be running on the default port.
# Copyright(C) 2024-2025 Salvatore Sanfilippo.
# All Rights Reserved.
import redis
import random
import struct
import math
import time
import sys
import os
import importlib
import inspect
from typing import List, Tuple, Optional
from dataclasses import dataclass
def colored(text: str, color: str) -> str:
colors = {
'red': '\033[91m',
'green': '\033[92m'
}
reset = '\033[0m'
return f"{colors.get(color, '')}{text}{reset}"
@dataclass
class VectorData:
vectors: List[List[float]]
names: List[str]
def find_k_nearest(self, query_vector: List[float], k: int) -> List[Tuple[str, float]]:
"""Find k-nearest neighbors using the same scoring as Redis VSIM WITHSCORES."""
similarities = []
query_norm = math.sqrt(sum(x*x for x in query_vector))
if query_norm == 0:
return []
for i, vec in enumerate(self.vectors):
vec_norm = math.sqrt(sum(x*x for x in vec))
if vec_norm == 0:
continue
dot_product = sum(a*b for a,b in zip(query_vector, vec))
cosine_sim = dot_product / (query_norm * vec_norm)
distance = 1.0 - cosine_sim
redis_similarity = 1.0 - (distance/2.0)
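# Equivalent to (1 + cosine_sim) / 2: maps cosine in [-1, 1] to [0, 1].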
similarities.append((self.names[i], redis_similarity))
similarities.sort(key=lambda x: x[1], reverse=True)
return similarities[:k]
def generate_random_vector(dim: int) -> List[float]:
"""Generate a random normalized vector."""
vec = [random.gauss(0, 1) for _ in range(dim)]
norm = math.sqrt(sum(x*x for x in vec))
return [x/norm for x in vec]
def fill_redis_with_vectors(r: redis.Redis, key: str, count: int, dim: int,
with_reduce: Optional[int] = None) -> VectorData:
"""Fill Redis with random vectors and return a VectorData object for verification."""
vectors = []
names = []
r.delete(key)
for i in range(count):
vec = generate_random_vector(dim)
name = f"{key}:item:{i}"
vectors.append(vec)
names.append(name)
vec_bytes = struct.pack(f'{dim}f', *vec)
args = [key]
if with_reduce:
args.extend(['REDUCE', with_reduce])
args.extend(['FP32', vec_bytes, name])
r.execute_command('VADD', *args)
return VectorData(vectors=vectors, names=names)
class TestCase:
def __init__(self):
self.error_msg = None
self.error_details = None
self.test_key = f"test:{self.__class__.__name__.lower()}"
# Primary Redis instance (default port)
self.redis = redis.Redis()
# Replica Redis instance (port 6380)
self.replica = redis.Redis(port=6380)
# Replication status
self.replication_setup = False
def setup(self):
self.redis.delete(self.test_key)
def teardown(self):
self.redis.delete(self.test_key)
def setup_replication(self) -> bool:
"""
Setup replication between primary and replica Redis instances.
Returns True if replication is successfully established, False otherwise.
"""
# Configure replica to replicate from primary
self.replica.execute_command('REPLICAOF', '127.0.0.1', 6379)
# Wait for replication to be established
max_attempts = 10
for attempt in range(max_attempts):
# Check replication info
repl_info = self.replica.info('replication')
# Check if replication is established
if (repl_info.get('role') == 'slave' and
repl_info.get('master_host') == '127.0.0.1' and
repl_info.get('master_port') == 6379 and
repl_info.get('master_link_status') == 'up'):
self.replication_setup = True
return True
# Wait before next attempt
time.sleep(0.5)
# If we get here, replication wasn't established
self.error_msg = "Failed to establish replication between primary and replica"
return False
def test(self):
raise NotImplementedError("Subclasses must implement test method")
def run(self):
try:
self.setup()
self.test()
return True
except AssertionError as e:
self.error_msg = str(e)
import traceback
self.error_details = traceback.format_exc()
return False
except Exception as e:
self.error_msg = f"Unexpected error: {str(e)}"
import traceback
self.error_details = traceback.format_exc()
return False
finally:
self.teardown()
def getname(self):
"""Each test class should override this to provide its name"""
return self.__class__.__name__
def estimated_runtime(self):
""""Each test class should override this if it takes a significant amount of time to run. Default is 100ms"""
return 0.1
def find_test_classes():
test_classes = []
tests_dir = 'tests'
if not os.path.exists(tests_dir):
return []
for file in os.listdir(tests_dir):
if file.endswith('.py'):
module_name = f"tests.{file[:-3]}"
try:
module = importlib.import_module(module_name)
for name, obj in inspect.getmembers(module):
if inspect.isclass(obj) and obj.__name__ != 'TestCase' and hasattr(obj, 'test'):
test_classes.append(obj())
except Exception as e:
print(f"Error loading {file}: {e}")
return test_classes
def run_tests():
print("================================================\n"+
"Make sure to have Redis running in the localhost\n"+
"with --enable-debug-command yes\n"+
"Both primary (6379) and replica (6380) instances\n"+
"================================================\n")
tests = find_test_classes()
if not tests:
print("No tests found!")
return
# Sort tests by estimated runtime
tests.sort(key=lambda t: t.estimated_runtime())
passed = 0
total = len(tests)
for test in tests:
print(f"{test.getname()}: ", end="")
sys.stdout.flush()
start_time = time.time()
success = test.run()
duration = time.time() - start_time
if success:
print(colored("OK", "green"), f"({duration:.2f}s)")
passed += 1
else:
print(colored("ERR", "red"), f"({duration:.2f}s)")
print(f"Error: {test.error_msg}")
if test.error_details:
print("\nTraceback:")
print(test.error_details)
print("\n" + "="*50)
print(f"\nTest Summary: {passed}/{total} tests passed")
if passed == total:
print(colored("\nALL TESTS PASSED!", "green"))
else:
print(colored(f"\n{total-passed} TESTS FAILED!", "red"))
if __name__ == "__main__":
run_tests()

View file

@ -0,0 +1,21 @@
from test import TestCase, generate_random_vector
import struct
class BasicCommands(TestCase):
def getname(self):
return "VADD, VDIM, VCARD basic usage"
def test(self):
# Test VADD
vec = generate_random_vector(4)
vec_bytes = struct.pack('4f', *vec)
result = self.redis.execute_command('VADD', self.test_key, 'FP32', vec_bytes, f'{self.test_key}:item:1')
assert result == 1, "VADD should return 1 for first item"
# Test VDIM
dim = self.redis.execute_command('VDIM', self.test_key)
assert dim == 4, f"VDIM should return 4, got {dim}"
# Test VCARD
card = self.redis.execute_command('VCARD', self.test_key)
assert card == 1, f"VCARD should return 1, got {card}"

View file

@ -0,0 +1,35 @@
from test import TestCase
class BasicSimilarity(TestCase):
def getname(self):
return "VSIM reported distance makes sense with 4D vectors"
def test(self):
# Add two very similar vectors, one different
vec1 = [1, 0, 0, 0]
vec2 = [0.99, 0.01, 0, 0]
vec3 = [0.1, 1, -1, 0.5]
# Add vectors using VALUES format
self.redis.execute_command('VADD', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1], f'{self.test_key}:item:1')
self.redis.execute_command('VADD', self.test_key, 'VALUES', 4,
*[str(x) for x in vec2], f'{self.test_key}:item:2')
self.redis.execute_command('VADD', self.test_key, 'VALUES', 4,
*[str(x) for x in vec3], f'{self.test_key}:item:3')
# Query similarity with vec1
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1], 'WITHSCORES')
# Convert results to dictionary
results_dict = {}
for i in range(0, len(result), 2):
key = result[i].decode()
score = float(result[i+1])
results_dict[key] = score
# Verify results
assert results_dict[f'{self.test_key}:item:1'] > 0.99, "Self-similarity should be very high"
assert results_dict[f'{self.test_key}:item:2'] > 0.99, "Similar vector should have high similarity"
assert results_dict[f'{self.test_key}:item:3'] < 0.8, "Not very similar vector should have low similarity"

View file

@ -0,0 +1,156 @@
from test import TestCase, generate_random_vector
import threading
import time
import struct
class ThreadingStressTest(TestCase):
def getname(self):
return "Concurrent VADD/DEL/VSIM operations stress test"
def estimated_runtime(self):
return 10 # Test runs for 10 seconds
def test(self):
# Constants - easy to modify if needed
NUM_VADD_THREADS = 10
NUM_VSIM_THREADS = 1
NUM_DEL_THREADS = 1
TEST_DURATION = 10 # seconds
VECTOR_DIM = 100
DEL_INTERVAL = 1 # seconds
# Shared flags and state
stop_event = threading.Event()
error_list = []
error_lock = threading.Lock()
def log_error(thread_name, error):
with error_lock:
error_list.append(f"{thread_name}: {error}")
def vadd_worker(thread_id):
"""Thread function to perform VADD operations"""
thread_name = f"VADD-{thread_id}"
try:
vector_count = 0
while not stop_event.is_set():
try:
# Generate random vector
vec = generate_random_vector(VECTOR_DIM)
vec_bytes = struct.pack(f'{VECTOR_DIM}f', *vec)
# Add vector with CAS option
self.redis.execute_command(
'VADD',
self.test_key,
'FP32',
vec_bytes,
f'{self.test_key}:item:{thread_id}:{vector_count}',
'CAS'
)
vector_count += 1
# Small sleep to reduce CPU pressure
if vector_count % 10 == 0:
time.sleep(0.001)
except Exception as e:
log_error(thread_name, f"Error: {str(e)}")
time.sleep(0.1) # Slight backoff on error
except Exception as e:
log_error(thread_name, f"Thread error: {str(e)}")
def del_worker():
"""Thread function that deletes the key periodically"""
thread_name = "DEL"
try:
del_count = 0
while not stop_event.is_set():
try:
# Sleep first, then delete
time.sleep(DEL_INTERVAL)
if stop_event.is_set():
break
self.redis.delete(self.test_key)
del_count += 1
except Exception as e:
log_error(thread_name, f"Error: {str(e)}")
except Exception as e:
log_error(thread_name, f"Thread error: {str(e)}")
def vsim_worker(thread_id):
"""Thread function to perform VSIM operations"""
thread_name = f"VSIM-{thread_id}"
try:
search_count = 0
while not stop_event.is_set():
try:
# Generate query vector
query_vec = generate_random_vector(VECTOR_DIM)
query_str = [str(x) for x in query_vec]
# Perform similarity search
args = ['VSIM', self.test_key, 'VALUES', VECTOR_DIM]
args.extend(query_str)
args.extend(['COUNT', 10])
self.redis.execute_command(*args)
search_count += 1
# Small sleep to reduce CPU pressure
if search_count % 10 == 0:
time.sleep(0.005)
except Exception as e:
# Don't log empty array errors, as they're expected when key doesn't exist
if "empty array" not in str(e).lower():
log_error(thread_name, f"Error: {str(e)}")
time.sleep(0.1) # Slight backoff on error
except Exception as e:
log_error(thread_name, f"Thread error: {str(e)}")
# Start all threads
threads = []
# VADD threads
for i in range(NUM_VADD_THREADS):
thread = threading.Thread(target=vadd_worker, args=(i,))
thread.start()
threads.append(thread)
# DEL threads
for _ in range(NUM_DEL_THREADS):
thread = threading.Thread(target=del_worker)
thread.start()
threads.append(thread)
# VSIM threads
for i in range(NUM_VSIM_THREADS):
thread = threading.Thread(target=vsim_worker, args=(i,))
thread.start()
threads.append(thread)
# Let the test run for the specified duration
time.sleep(TEST_DURATION)
# Signal all threads to stop
stop_event.set()
# Wait for threads to finish
for thread in threads:
thread.join(timeout=2.0)
# Check if Redis is still responsive
try:
ping_result = self.redis.ping()
assert ping_result, "Redis did not respond to PING after stress test"
except Exception as e:
assert False, f"Redis connection failed after stress test: {str(e)}"
# Report any errors for diagnosis, but don't fail the test unless PING fails
if error_list:
error_count = len(error_list)
print(f"\nEncountered {error_count} errors during stress test.")
print("First 5 errors:")
for error in error_list[:5]:
print(f"- {error}")

View file

@ -0,0 +1,48 @@
from test import TestCase, fill_redis_with_vectors, generate_random_vector
import threading, time
class ConcurrentVSIMAndDEL(TestCase):
def getname(self):
return "Concurrent VSIM and DEL operations"
def estimated_runtime(self):
return 2
def test(self):
# Fill the key with 5000 random vectors
dim = 128
count = 5000
fill_redis_with_vectors(self.redis, self.test_key, count, dim)
# List to store results from threads
thread_results = []
def vsim_thread():
"""Thread function to perform VSIM operations until the key is deleted"""
while True:
query_vec = generate_random_vector(dim)
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', dim,
*[str(x) for x in query_vec], 'COUNT', 10)
if not result:
# Empty array detected, key is deleted
thread_results.append(True)
break
# Start multiple threads to perform VSIM operations
threads = []
for _ in range(4): # Start 4 threads
t = threading.Thread(target=vsim_thread)
t.start()
threads.append(t)
# Delete the key while threads are still running
time.sleep(1)
self.redis.delete(self.test_key)
# Wait for all threads to finish (they will exit once they detect the key is deleted)
for t in threads:
t.join()
# Verify that all threads detected an empty array or error
assert len(thread_results) == len(threads), "Not all threads detected the key deletion"
assert all(thread_results), "Some threads did not detect an empty array or error after DEL"

View file

@ -0,0 +1,39 @@
from test import TestCase, generate_random_vector
import struct
class DebugDigestTest(TestCase):
def getname(self):
return "[regression] DEBUG DIGEST-VALUE with attributes"
def test(self):
# Generate random vectors
vec1 = generate_random_vector(4)
vec2 = generate_random_vector(4)
vec_bytes1 = struct.pack('4f', *vec1)
vec_bytes2 = struct.pack('4f', *vec2)
# Add vectors to the key, one with attribute, one without
self.redis.execute_command('VADD', self.test_key, 'FP32', vec_bytes1, f'{self.test_key}:item:1')
self.redis.execute_command('VADD', self.test_key, 'FP32', vec_bytes2, f'{self.test_key}:item:2', 'SETATTR', '{"color":"red"}')
# Call DEBUG DIGEST-VALUE on the key
try:
digest1 = self.redis.execute_command('DEBUG', 'DIGEST-VALUE', self.test_key)
assert digest1 is not None, "DEBUG DIGEST-VALUE should return a value"
# Change attribute and verify digest changes
self.redis.execute_command('VSETATTR', self.test_key, f'{self.test_key}:item:2', '{"color":"blue"}')
digest2 = self.redis.execute_command('DEBUG', 'DIGEST-VALUE', self.test_key)
assert digest2 is not None, "DEBUG DIGEST-VALUE should return a value after attribute change"
assert digest1 != digest2, "Digest should change when an attribute is modified"
# Remove attribute and verify digest changes again
self.redis.execute_command('VSETATTR', self.test_key, f'{self.test_key}:item:2', '')
digest3 = self.redis.execute_command('DEBUG', 'DIGEST-VALUE', self.test_key)
assert digest3 is not None, "DEBUG DIGEST-VALUE should return a value after attribute removal"
assert digest2 != digest3, "Digest should change when an attribute is removed"
        except AssertionError:
            raise
        except Exception as e:
            raise AssertionError(f"DEBUG DIGEST-VALUE command failed: {str(e)}")

View file

@ -0,0 +1,173 @@
from test import TestCase, fill_redis_with_vectors, generate_random_vector
import random
"""
A note about this test:
It was experimentally tried to modify hnsw.c in order to
avoid calling hnsw_reconnect_nodes(). In this case, the test
fails very often with EF set to 250, while it hardly
fails at all with the same parameters if hnsw_reconnect_nodes()
is called.
Note that for the nature of the test (it is very strict) it can
still fail from time to time, without this signaling any
actual bug.
"""
class VREM(TestCase):
def getname(self):
return "Deletion and graph state after deletion"
def estimated_runtime(self):
return 2.0
def format_neighbors_with_scores(self, links_result, old_links=None, items_to_remove=None):
"""Format neighbors with their similarity scores and status indicators"""
if not links_result:
return "No neighbors"
output = []
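        # VLINKS returns one array of neighbors per layer, from the highest
        # layer down to layer 0, hence the reversed level numbering below.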
for level, neighbors in enumerate(links_result):
level_num = len(links_result) - level - 1
output.append(f"Level {level_num}:")
# Get neighbors and scores
neighbors_with_scores = []
for i in range(0, len(neighbors), 2):
neighbor = neighbors[i].decode() if isinstance(neighbors[i], bytes) else neighbors[i]
score = float(neighbors[i+1]) if i+1 < len(neighbors) else None
status = ""
# For old links, mark deleted ones
if items_to_remove and neighbor in items_to_remove:
status = " [lost]"
# For new links, mark newly added ones
elif old_links is not None:
# Check if this neighbor was in the old links at this level
was_present = False
if old_links and level < len(old_links):
old_neighbors = [n.decode() if isinstance(n, bytes) else n
for n in old_links[level]]
was_present = neighbor in old_neighbors
if not was_present:
status = " [gained]"
if score is not None:
neighbors_with_scores.append(f"{len(neighbors_with_scores)+1}. {neighbor} ({score:.6f}){status}")
else:
neighbors_with_scores.append(f"{len(neighbors_with_scores)+1}. {neighbor}{status}")
output.extend([" " + n for n in neighbors_with_scores])
return "\n".join(output)
def test(self):
# 1. Fill server with random elements
dim = 128
count = 5000
data = fill_redis_with_vectors(self.redis, self.test_key, count, dim)
# 2. Do VSIM to get 200 items
query_vec = generate_random_vector(dim)
results = self.redis.execute_command('VSIM', self.test_key, 'VALUES', dim,
*[str(x) for x in query_vec],
'COUNT', 200, 'WITHSCORES')
# Convert results to list of (item, score) pairs, sorted by score
items = []
for i in range(0, len(results), 2):
item = results[i].decode()
score = float(results[i+1])
items.append((item, score))
items.sort(key=lambda x: x[1], reverse=True) # Sort by similarity
# Store the graph structure for all items before deletion
neighbors_before = {}
for item, _ in items:
links = self.redis.execute_command('VLINKS', self.test_key, item, 'WITHSCORES')
if links: # Some items might not have links
neighbors_before[item] = links
# 3. Remove 100 random items
items_to_remove = set(item for item, _ in random.sample(items, 100))
# Keep track of top 10 non-removed items
top_remaining = []
for item, score in items:
if item not in items_to_remove:
top_remaining.append((item, score))
if len(top_remaining) == 10:
break
# Remove the items
for item in items_to_remove:
result = self.redis.execute_command('VREM', self.test_key, item)
assert result == 1, f"VREM failed to remove {item}"
# 4. Do VSIM again with same vector
new_results = self.redis.execute_command('VSIM', self.test_key, 'VALUES', dim,
*[str(x) for x in query_vec],
'COUNT', 200, 'WITHSCORES',
'EF', 500)
# Convert new results to dict of item -> score
new_scores = {}
for i in range(0, len(new_results), 2):
item = new_results[i].decode()
score = float(new_results[i+1])
new_scores[item] = score
failure = False
failed_item = None
failed_reason = None
# 5. Verify all top 10 non-removed items are still found with similar scores
for item, old_score in top_remaining:
if item not in new_scores:
failure = True
failed_item = item
failed_reason = "missing"
break
new_score = new_scores[item]
if abs(new_score - old_score) >= 0.01:
failure = True
failed_item = item
failed_reason = f"score changed: {old_score:.6f} -> {new_score:.6f}"
break
if failure:
print("\nTest failed!")
print(f"Problem with item: {failed_item} ({failed_reason})")
print("\nOriginal neighbors (with similarity scores):")
if failed_item in neighbors_before:
print(self.format_neighbors_with_scores(
neighbors_before[failed_item],
items_to_remove=items_to_remove))
else:
print("No neighbors found in original graph")
print("\nCurrent neighbors (with similarity scores):")
current_links = self.redis.execute_command('VLINKS', self.test_key,
failed_item, 'WITHSCORES')
if current_links:
print(self.format_neighbors_with_scores(
current_links,
old_links=neighbors_before.get(failed_item)))
else:
print("No neighbors in current graph")
print("\nOriginal results (top 20):")
for item, score in items[:20]:
deleted = "[deleted]" if item in items_to_remove else ""
print(f"{item}: {score:.6f} {deleted}")
print("\nNew results after removal (top 20):")
new_items = []
for i in range(0, len(new_results), 2):
item = new_results[i].decode()
score = float(new_results[i+1])
new_items.append((item, score))
new_items.sort(key=lambda x: x[1], reverse=True)
for item, score in new_items[:20]:
print(f"{item}: {score:.6f}")
raise AssertionError(f"Test failed: Problem with item {failed_item} ({failed_reason}). *** IMPORTANT *** This test may fail from time to time without indicating that there is a bug. However normally it should pass. The fact is that it's a quite extreme test where we destroy 50% of nodes of top results and still expect perfect recall, with vectors that are very hostile because of the distribution used.")

View file

@ -0,0 +1,67 @@
from test import TestCase, generate_random_vector
import struct
import redis.exceptions
class DimensionValidation(TestCase):
def getname(self):
return "[regression] Dimension Validation with Projection"
def estimated_runtime(self):
return 0.5
def test(self):
# Test scenario 1: Create a set with projection
original_dim = 100
reduced_dim = 50
# Create the initial vector and set with projection
vec1 = generate_random_vector(original_dim)
vec1_bytes = struct.pack(f'{original_dim}f', *vec1)
# Add first vector with projection
result = self.redis.execute_command('VADD', self.test_key,
'REDUCE', reduced_dim,
'FP32', vec1_bytes, f'{self.test_key}:item:1')
assert result == 1, "First VADD with REDUCE should return 1"
# Check VINFO returns the correct projection information
info = self.redis.execute_command('VINFO', self.test_key)
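        # VINFO replies with a flat array of alternating field names and
        # values, so build a dict from consecutive pairs.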
info_map = {k.decode('utf-8'): v for k, v in zip(info[::2], info[1::2])}
assert 'vector-dim' in info_map, "VINFO should contain vector-dim"
        assert info_map['vector-dim'] == reduced_dim, f"Expected reduced dimension {reduced_dim}, got {info_map['vector-dim']}"
        assert 'projection-input-dim' in info_map, "VINFO should contain projection-input-dim"
        assert info_map['projection-input-dim'] == original_dim, f"Expected original dimension {original_dim}, got {info_map['projection-input-dim']}"
# Test scenario 2: Try adding a mismatched vector - should fail
wrong_dim = 80
wrong_vec = generate_random_vector(wrong_dim)
wrong_vec_bytes = struct.pack(f'{wrong_dim}f', *wrong_vec)
# This should fail with dimension mismatch error
try:
self.redis.execute_command('VADD', self.test_key,
'REDUCE', reduced_dim,
'FP32', wrong_vec_bytes, f'{self.test_key}:item:2')
assert False, "VADD with wrong dimension should fail"
except redis.exceptions.ResponseError as e:
assert "Input dimension mismatch for projection" in str(e), f"Expected dimension mismatch error, got: {e}"
# Test scenario 3: Add a correctly-sized vector
vec2 = generate_random_vector(original_dim)
vec2_bytes = struct.pack(f'{original_dim}f', *vec2)
# This should succeed
result = self.redis.execute_command('VADD', self.test_key,
'REDUCE', reduced_dim,
'FP32', vec2_bytes, f'{self.test_key}:item:3')
assert result == 1, "VADD with correct dimensions should succeed"
# Check VSIM also validates input dimensions
wrong_query = generate_random_vector(wrong_dim)
try:
self.redis.execute_command('VSIM', self.test_key,
'VALUES', wrong_dim, *[str(x) for x in wrong_query],
'COUNT', 10)
assert False, "VSIM with wrong dimension should fail"
except redis.exceptions.ResponseError as e:
assert "Input dimension mismatch for projection" in str(e), f"Expected dimension mismatch error in VSIM, got: {e}"

View file

@ -0,0 +1,27 @@
from test import TestCase, generate_random_vector
import struct
class VREM_LastItemDeletesKey(TestCase):
def getname(self):
return "VREM last item deletes key"
def test(self):
# Generate a random vector
vec = generate_random_vector(4)
vec_bytes = struct.pack('4f', *vec)
# Add the vector to the key
result = self.redis.execute_command('VADD', self.test_key, 'FP32', vec_bytes, f'{self.test_key}:item:1')
assert result == 1, "VADD should return 1 for first item"
# Verify the key exists
exists = self.redis.exists(self.test_key)
assert exists == 1, "Key should exist after VADD"
# Remove the item
result = self.redis.execute_command('VREM', self.test_key, f'{self.test_key}:item:1')
assert result == 1, "VREM should return 1 for successful removal"
# Verify the key no longer exists
exists = self.redis.exists(self.test_key)
assert exists == 0, "Key should no longer exist after VREM of last item"

View file

@ -0,0 +1,177 @@
from test import TestCase
class VSIMFilterExpressions(TestCase):
def getname(self):
return "VSIM FILTER expressions basic functionality"
def test(self):
# Create a small set of vectors with different attributes
# Basic vectors for testing - all orthogonal for clear results
vec1 = [1, 0, 0, 0]
vec2 = [0, 1, 0, 0]
vec3 = [0, 0, 1, 0]
vec4 = [0, 0, 0, 1]
vec5 = [0.5, 0.5, 0, 0]
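        # vec5 lies between vec1 and vec2; the other four are mutually
        # orthogonal, so under the 1 - (distance/2) scoring any two of them
        # have a similarity of exactly 0.5.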
# Add vectors with various attributes
self.redis.execute_command('VADD', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1], f'{self.test_key}:item:1')
self.redis.execute_command('VSETATTR', self.test_key, f'{self.test_key}:item:1',
'{"age": 25, "name": "Alice", "active": true, "scores": [85, 90, 95], "city": "New York"}')
self.redis.execute_command('VADD', self.test_key, 'VALUES', 4,
*[str(x) for x in vec2], f'{self.test_key}:item:2')
self.redis.execute_command('VSETATTR', self.test_key, f'{self.test_key}:item:2',
'{"age": 30, "name": "Bob", "active": false, "scores": [70, 75, 80], "city": "Boston"}')
self.redis.execute_command('VADD', self.test_key, 'VALUES', 4,
*[str(x) for x in vec3], f'{self.test_key}:item:3')
self.redis.execute_command('VSETATTR', self.test_key, f'{self.test_key}:item:3',
'{"age": 35, "name": "Charlie", "scores": [60, 65, 70], "city": "Seattle"}')
self.redis.execute_command('VADD', self.test_key, 'VALUES', 4,
*[str(x) for x in vec4], f'{self.test_key}:item:4')
# Item 4 has no attribute at all
self.redis.execute_command('VADD', self.test_key, 'VALUES', 4,
*[str(x) for x in vec5], f'{self.test_key}:item:5')
self.redis.execute_command('VSETATTR', self.test_key, f'{self.test_key}:item:5',
'invalid json') # Intentionally malformed JSON
# Test 1: Basic equality with numbers
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '.age == 25')
assert len(result) == 1, "Expected 1 result for age == 25"
assert result[0].decode() == f'{self.test_key}:item:1', "Expected item:1 for age == 25"
# Test 2: Greater than
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '.age > 25')
assert len(result) == 2, "Expected 2 results for age > 25"
# Test 3: Less than or equal
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '.age <= 30')
assert len(result) == 2, "Expected 2 results for age <= 30"
# Test 4: String equality
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '.name == "Alice"')
assert len(result) == 1, "Expected 1 result for name == Alice"
assert result[0].decode() == f'{self.test_key}:item:1', "Expected item:1 for name == Alice"
# Test 5: String inequality
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '.name != "Alice"')
assert len(result) == 2, "Expected 2 results for name != Alice"
# Test 6: Boolean value
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '.active')
assert len(result) == 1, "Expected 1 result for .active being true"
# Test 7: Logical AND
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '.age > 20 and .age < 30')
assert len(result) == 1, "Expected 1 result for 20 < age < 30"
assert result[0].decode() == f'{self.test_key}:item:1', "Expected item:1 for 20 < age < 30"
# Test 8: Logical OR
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '.age < 30 or .age > 35')
assert len(result) == 1, "Expected 1 result for age < 30 or age > 35"
# Test 9: Logical NOT
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '!(.age == 25)')
assert len(result) == 2, "Expected 2 results for NOT(age == 25)"
# Test 10: The "in" operator with array
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '.age in [25, 35]')
assert len(result) == 2, "Expected 2 results for age in [25, 35]"
# Test 11: The "in" operator with strings in array
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '.name in ["Alice", "David"]')
assert len(result) == 1, "Expected 1 result for name in [Alice, David]"
# Test 12: Arithmetic operations - addition
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '.age + 10 > 40')
assert len(result) == 1, "Expected 1 result for age + 10 > 40"
# Test 13: Arithmetic operations - multiplication
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '.age * 2 > 60')
assert len(result) == 1, "Expected 1 result for age * 2 > 60"
# Test 14: Arithmetic operations - division
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '.age / 5 == 5')
assert len(result) == 1, "Expected 1 result for age / 5 == 5"
# Test 15: Arithmetic operations - modulo
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '.age % 2 == 0')
assert len(result) == 1, "Expected 1 result for age % 2 == 0"
# Test 16: Power operator
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '.age ** 2 > 900')
assert len(result) == 1, "Expected 1 result for age^2 > 900"
# Test 17: Missing attribute (should exclude items missing that attribute)
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '.missing_field == "value"')
assert len(result) == 0, "Expected 0 results for missing_field == value"
# Test 18: No attribute set at all
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '.any_field')
assert f'{self.test_key}:item:4' not in [item.decode() for item in result], "Item with no attribute should be excluded"
# Test 19: Malformed JSON
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '.any_field')
assert f'{self.test_key}:item:5' not in [item.decode() for item in result], "Item with malformed JSON should be excluded"
# Test 20: Complex expression combining multiple operators
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '(.age > 20 and .age < 40) and (.city == "Boston" or .city == "New York")')
assert len(result) == 2, "Expected 2 results for the complex expression"
expected_items = [f'{self.test_key}:item:1', f'{self.test_key}:item:2']
assert set([item.decode() for item in result]) == set(expected_items), "Expected item:1 and item:2 for the complex expression"
# Test 21: Parentheses to control operator precedence
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '.age > (20 + 10)')
assert len(result) == 1, "Expected 1 result for age > (20 + 10)"
# Test 22: Array access (arrays evaluate to true)
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '.scores')
assert len(result) == 3, "Expected 3 results for .scores (arrays evaluate to true)"

View file

@ -0,0 +1,668 @@
from test import TestCase, generate_random_vector
import struct
import random
import math
import json
import time
class VSIMFilterAdvanced(TestCase):
def getname(self):
return "VSIM FILTER comprehensive functionality testing"
def estimated_runtime(self):
return 15 # This test might take up to 15 seconds for the large dataset
def setup(self):
super().setup()
self.dim = 32 # Vector dimension
self.count = 5000 # Number of vectors for large tests
self.small_count = 50 # Number of vectors for small/quick tests
# Categories for attributes
self.categories = ["electronics", "furniture", "clothing", "books", "food"]
self.cities = ["New York", "London", "Tokyo", "Paris", "Berlin", "Sydney", "Toronto", "Singapore"]
self.price_ranges = [(10, 50), (50, 200), (200, 1000), (1000, 5000)]
self.years = list(range(2000, 2025))
def create_attributes(self, index):
"""Create realistic attributes for a vector"""
category = random.choice(self.categories)
city = random.choice(self.cities)
min_price, max_price = random.choice(self.price_ranges)
price = round(random.uniform(min_price, max_price), 2)
year = random.choice(self.years)
in_stock = random.random() > 0.3 # 70% chance of being in stock
rating = round(random.uniform(1, 5), 1)
views = int(random.expovariate(1/1000)) # Exponential distribution for page views
tags = random.sample(["popular", "sale", "new", "limited", "exclusive", "clearance"],
k=random.randint(0, 3))
# Add some specific patterns for testing
# Every 10th item has a specific property combination for testing
is_premium = (index % 10 == 0)
# Create attributes dictionary
attrs = {
"id": index,
"category": category,
"location": city,
"price": price,
"year": year,
"in_stock": in_stock,
"rating": rating,
"views": views,
"tags": tags
}
if is_premium:
attrs["is_premium"] = True
attrs["special_features"] = ["premium", "warranty", "support"]
# Add sub-categories for more complex filters
if category == "electronics":
attrs["subcategory"] = random.choice(["phones", "computers", "cameras", "audio"])
elif category == "furniture":
attrs["subcategory"] = random.choice(["chairs", "tables", "sofas", "beds"])
elif category == "clothing":
attrs["subcategory"] = random.choice(["shirts", "pants", "dresses", "shoes"])
# Add some intentionally missing fields for testing
if random.random() > 0.9: # 10% chance of missing price
del attrs["price"]
# Some items have promotion field
if random.random() > 0.7: # 30% chance of having a promotion
attrs["promotion"] = random.choice(["discount", "bundle", "gift"])
# Create invalid JSON for a small percentage of vectors
if random.random() > 0.98: # 2% chance of having invalid JSON
return "{{invalid json}}"
return json.dumps(attrs)
def create_vectors_with_attributes(self, key, count):
"""Create vectors and add attributes to them"""
vectors = []
names = []
attribute_map = {} # To store attributes for verification
# Create vectors
for i in range(count):
vec = generate_random_vector(self.dim)
vectors.append(vec)
name = f"{key}:item:{i}"
names.append(name)
# Add to Redis
vec_bytes = struct.pack(f'{self.dim}f', *vec)
self.redis.execute_command('VADD', key, 'FP32', vec_bytes, name)
# Create and add attributes
attrs = self.create_attributes(i)
self.redis.execute_command('VSETATTR', key, name, attrs)
# Store attributes for later verification
try:
attribute_map[name] = json.loads(attrs) if '{' in attrs else None
except json.JSONDecodeError:
attribute_map[name] = None
return vectors, names, attribute_map
def filter_linear_search(self, vectors, names, query_vector, filter_expr, attribute_map, k=10):
"""Perform a linear search with filtering for verification"""
similarities = []
query_norm = math.sqrt(sum(x*x for x in query_vector))
if query_norm == 0:
return []
for i, vec in enumerate(vectors):
name = names[i]
attributes = attribute_map.get(name)
# Skip if doesn't match filter
if not self.matches_filter(attributes, filter_expr):
continue
vec_norm = math.sqrt(sum(x*x for x in vec))
if vec_norm == 0:
continue
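            # Map cosine similarity to the score Redis reports:
            # score = 1 - (cosine_distance / 2), so the distance range
            # [0, 2] becomes a [0, 1] similarity, 1 meaning same direction.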
dot_product = sum(a*b for a,b in zip(query_vector, vec))
cosine_sim = dot_product / (query_norm * vec_norm)
distance = 1.0 - cosine_sim
redis_similarity = 1.0 - (distance/2.0)
similarities.append((name, redis_similarity))
similarities.sort(key=lambda x: x[1], reverse=True)
return similarities[:k]
def matches_filter(self, attributes, filter_expr):
"""Filter matching for verification - uses Python eval to handle complex expressions"""
if attributes is None:
return False # No attributes or invalid JSON
# Replace JSON path selectors with Python dictionary access
py_expr = filter_expr
# Handle `.field` notation (replace with attributes['field'])
i = 0
while i < len(py_expr):
if py_expr[i] == '.' and (i == 0 or not py_expr[i-1].isalnum()):
# Find the end of the selector (stops at operators or whitespace)
j = i + 1
while j < len(py_expr) and (py_expr[j].isalnum() or py_expr[j] == '_'):
j += 1
if j > i + 1: # Found a valid selector
field = py_expr[i+1:j]
# Use a safe access pattern that returns a default value based on context
py_expr = py_expr[:i] + f"attributes.get('{field}')" + py_expr[j:]
i = i + len(f"attributes.get('{field}')")
else:
i += 1
else:
i += 1
        # Convert the "!" operator to Python's "not", taking care not to
        # mangle the "!=" operator.
        py_expr = py_expr.replace('!=', '__NEQ__')
        py_expr = py_expr.replace('!', ' not ')
        py_expr = py_expr.replace('__NEQ__', '!=')
        try:
            # Evaluate the translated expression; a failure caused by missing
            # fields or type mismatches makes the whole expression false.
            result = eval(py_expr, {"attributes": attributes})
            return bool(result)
        except (TypeError, AttributeError):
            # This typically happens when trying to compare None with
            # numbers or other types, or when an attribute doesn't exist
            return False
        except Exception as e:
            print(f"Error evaluating filter expression '{filter_expr}' as '{py_expr}': {e}")
            return False
    def safe_decode(self, item):
return item.decode() if isinstance(item, bytes) else item
def calculate_recall(self, redis_results, linear_results, k=10):
"""Calculate recall (percentage of correct results retrieved)"""
redis_set = set(self.safe_decode(item) for item in redis_results)
linear_set = set(item[0] for item in linear_results[:k])
if not linear_set:
return 1.0 # If no linear results, consider it perfect recall
intersection = redis_set.intersection(linear_set)
return len(intersection) / len(linear_set)
def test_recall_with_filter(self, filter_expr, ef=500, filter_ef=None):
"""Test recall for a given filter expression"""
# Create query vector
query_vec = generate_random_vector(self.dim)
# First, get ground truth using linear scan
linear_results = self.filter_linear_search(
self.vectors, self.names, query_vec, filter_expr, self.attribute_map, k=50)
# Calculate true selectivity from ground truth
true_selectivity = len(linear_results) / len(self.names) if self.names else 0
# Perform Redis search with filter
cmd_args = ['VSIM', self.test_key, 'VALUES', self.dim]
cmd_args.extend([str(x) for x in query_vec])
cmd_args.extend(['COUNT', 50, 'WITHSCORES', 'EF', ef, 'FILTER', filter_expr])
if filter_ef:
cmd_args.extend(['FILTER-EF', filter_ef])
start_time = time.time()
redis_results = self.redis.execute_command(*cmd_args)
query_time = time.time() - start_time
# Convert Redis results to dict
redis_items = {}
for i in range(0, len(redis_results), 2):
key = redis_results[i].decode() if isinstance(redis_results[i], bytes) else redis_results[i]
score = float(redis_results[i+1])
redis_items[key] = score
# Calculate metrics
recall = self.calculate_recall(redis_items.keys(), linear_results)
selectivity = len(redis_items) / len(self.names) if redis_items else 0
# Compare against the true selectivity from linear scan
assert abs(selectivity - true_selectivity) < 0.1, \
f"Redis selectivity {selectivity:.3f} differs significantly from ground truth {true_selectivity:.3f}"
# We expect high recall for standard parameters
if ef >= 500 and (filter_ef is None or filter_ef >= 1000):
try:
assert recall >= 0.7, \
f"Low recall {recall:.2f} for filter '{filter_expr}'"
except AssertionError as e:
# Get items found in each set
redis_items_set = set(redis_items.keys())
linear_items_set = set(item[0] for item in linear_results)
# Find items in each set
only_in_redis = redis_items_set - linear_items_set
only_in_linear = linear_items_set - redis_items_set
in_both = redis_items_set & linear_items_set
# Build comprehensive debug message
debug = f"\nGround Truth: {len(linear_results)} matching items (total vectors: {len(self.vectors)})"
debug += f"\nRedis Found: {len(redis_items)} items with FILTER-EF: {filter_ef or 'default'}"
debug += f"\nItems in both sets: {len(in_both)} (recall: {recall:.4f})"
debug += f"\nItems only in Redis: {len(only_in_redis)}"
debug += f"\nItems only in Ground Truth: {len(only_in_linear)}"
# Show some example items from each set with their scores
if only_in_redis:
debug += "\n\nTOP 5 ITEMS ONLY IN REDIS:"
sorted_redis = sorted([(k, v) for k, v in redis_items.items()], key=lambda x: x[1], reverse=True)
for i, (item, score) in enumerate(sorted_redis[:5]):
if item in only_in_redis:
debug += f"\n {i+1}. {item} (Score: {score:.4f})"
# Show attribute that should match filter
attr = self.attribute_map.get(item)
if attr:
debug += f" - Attrs: {attr.get('category', 'N/A')}, Price: {attr.get('price', 'N/A')}"
if only_in_linear:
debug += "\n\nTOP 5 ITEMS ONLY IN GROUND TRUTH:"
for i, (item, score) in enumerate(linear_results[:5]):
if item in only_in_linear:
debug += f"\n {i+1}. {item} (Score: {score:.4f})"
# Show attribute that should match filter
attr = self.attribute_map.get(item)
if attr:
debug += f" - Attrs: {attr.get('category', 'N/A')}, Price: {attr.get('price', 'N/A')}"
# Help identify parsing issues
debug += "\n\nPARSING CHECK:"
debug += f"\nRedis command: VSIM {self.test_key} VALUES {self.dim} [...] FILTER '{filter_expr}'"
# Check for WITHSCORES handling issues
if len(redis_results) > 0 and len(redis_results) % 2 == 0:
debug += f"\nRedis returned {len(redis_results)} items (looks like item,score pairs)"
debug += f"\nFirst few results: {redis_results[:4]}"
# Check the filter implementation
debug += "\n\nFILTER IMPLEMENTATION CHECK:"
debug += f"\nFilter expression: '{filter_expr}'"
debug += "\nSample attribute matches from attribute_map:"
count_matching = 0
for i, (name, attrs) in enumerate(self.attribute_map.items()):
if attrs and self.matches_filter(attrs, filter_expr):
count_matching += 1
if i < 3: # Show first 3 matches
debug += f"\n - {name}: {attrs}"
debug += f"\nTotal items matching filter in attribute_map: {count_matching}"
# Check if results array handling could be wrong
debug += "\n\nRESULT ARRAYS CHECK:"
if len(linear_results) >= 1:
debug += f"\nlinear_results[0]: {linear_results[0]}"
if isinstance(linear_results[0], tuple) and len(linear_results[0]) == 2:
debug += " (correct tuple format: (name, score))"
else:
debug += " (UNEXPECTED FORMAT!)"
# Debug sort order
debug += "\n\nSORTING CHECK:"
if len(linear_results) >= 2:
debug += f"\nGround truth first item score: {linear_results[0][1]}"
debug += f"\nGround truth second item score: {linear_results[1][1]}"
debug += f"\nCorrectly sorted by similarity? {linear_results[0][1] >= linear_results[1][1]}"
# Re-raise with detailed information
raise AssertionError(str(e) + debug)
return recall, selectivity, query_time, len(redis_items)
def test(self):
print(f"\nRunning comprehensive VSIM FILTER tests...")
# Create a larger dataset for testing
print(f"Creating dataset with {self.count} vectors and attributes...")
self.vectors, self.names, self.attribute_map = self.create_vectors_with_attributes(
self.test_key, self.count)
# ==== 1. Recall and Precision Testing ====
print("Testing recall for various filters...")
# Test basic filters with different selectivity
results = {}
results["category"] = self.test_recall_with_filter('.category == "electronics"')
results["price_high"] = self.test_recall_with_filter('.price > 1000')
results["in_stock"] = self.test_recall_with_filter('.in_stock')
results["rating"] = self.test_recall_with_filter('.rating >= 4')
results["complex1"] = self.test_recall_with_filter('.category == "electronics" and .price < 500')
print("Filter | Recall | Selectivity | Time (ms) | Results")
print("----------------------------------------------------")
        for name, (recall, selectivity, query_time, count) in results.items():
            print(f"{name:7} | {recall:.3f} | {selectivity:.3f} | {query_time*1000:.1f} | {count}")
# ==== 2. Filter Selectivity Performance ====
print("\nTesting filter selectivity performance...")
# High selectivity (very few matches)
high_sel_recall, _, high_sel_time, _ = self.test_recall_with_filter('.is_premium')
# Medium selectivity
med_sel_recall, _, med_sel_time, _ = self.test_recall_with_filter('.price > 100 and .price < 1000')
# Low selectivity (many matches)
low_sel_recall, _, low_sel_time, _ = self.test_recall_with_filter('.year > 2000')
print(f"High selectivity recall: {high_sel_recall:.3f}, time: {high_sel_time*1000:.1f}ms")
print(f"Med selectivity recall: {med_sel_recall:.3f}, time: {med_sel_time*1000:.1f}ms")
print(f"Low selectivity recall: {low_sel_recall:.3f}, time: {low_sel_time*1000:.1f}ms")
# ==== 3. FILTER-EF Parameter Testing ====
print("\nTesting FILTER-EF parameter...")
# Test with different FILTER-EF values
filter_expr = '.category == "electronics" and .price > 200'
ef_values = [100, 500, 2000, 5000]
print("FILTER-EF | Recall | Time (ms)")
print("-----------------------------")
for filter_ef in ef_values:
recall, _, query_time, _ = self.test_recall_with_filter(
filter_expr, ef=500, filter_ef=filter_ef)
print(f"{filter_ef:9} | {recall:.3f} | {query_time*1000:.1f}")
# Assert that higher FILTER-EF generally gives better recall
low_ef_recall, _, _, _ = self.test_recall_with_filter(filter_expr, filter_ef=100)
high_ef_recall, _, _, _ = self.test_recall_with_filter(filter_expr, filter_ef=5000)
# This might not always be true due to randomness, but generally holds
# We use a softer assertion to avoid flaky tests
assert high_ef_recall >= low_ef_recall * 0.8, \
f"Higher FILTER-EF should generally give better recall: {high_ef_recall:.3f} vs {low_ef_recall:.3f}"
# ==== 4. Complex Filter Expressions ====
print("\nTesting complex filter expressions...")
# Test a variety of complex expressions
complex_filters = [
'.price > 100 and (.category == "electronics" or .category == "furniture")',
'(.rating > 4 and .in_stock) or (.price < 50 and .views > 1000)',
'.category in ["electronics", "clothing"] and .price > 200 and .rating >= 3',
'(.category == "electronics" and .subcategory == "phones") or (.category == "furniture" and .price > 1000)',
'.year > 2010 and !(.price < 100) and .in_stock'
]
print("Expression | Results | Time (ms)")
print("-----------------------------")
for i, expr in enumerate(complex_filters):
try:
_, _, query_time, result_count = self.test_recall_with_filter(expr)
print(f"Complex {i+1} | {result_count:7} | {query_time*1000:.1f}")
except Exception as e:
print(f"Complex {i+1} | Error: {str(e)}")
# ==== 5. Attribute Type Testing ====
print("\nTesting different attribute types...")
type_filters = [
('.price > 500', "Numeric"),
('.category == "books"', "String equality"),
('.in_stock', "Boolean"),
('.tags in ["sale", "new"]', "Array membership"),
('.rating * 2 > 8', "Arithmetic")
]
for expr, type_name in type_filters:
try:
_, _, query_time, result_count = self.test_recall_with_filter(expr)
print(f"{type_name:16} | {expr:30} | {result_count:5} results | {query_time*1000:.1f}ms")
except Exception as e:
print(f"{type_name:16} | {expr:30} | Error: {str(e)}")
# ==== 6. Filter + Count Interaction ====
print("\nTesting COUNT parameter with filters...")
filter_expr = '.category == "electronics"'
counts = [5, 20, 100]
for count in counts:
query_vec = generate_random_vector(self.dim)
cmd_args = ['VSIM', self.test_key, 'VALUES', self.dim]
cmd_args.extend([str(x) for x in query_vec])
cmd_args.extend(['COUNT', count, 'WITHSCORES', 'FILTER', filter_expr])
results = self.redis.execute_command(*cmd_args)
result_count = len(results) // 2 # Divide by 2 because WITHSCORES returns pairs
# We expect result count to be at most the requested count
assert result_count <= count, f"Got {result_count} results with COUNT {count}"
print(f"COUNT {count:3} | Got {result_count:3} results")
# ==== 7. Edge Cases ====
print("\nTesting edge cases...")
# Test with no matching items
no_match_expr = '.category == "nonexistent_category"'
results = self.redis.execute_command('VSIM', self.test_key, 'VALUES', self.dim,
*[str(x) for x in generate_random_vector(self.dim)],
'FILTER', no_match_expr)
assert len(results) == 0, f"Expected 0 results for non-matching filter, got {len(results)}"
print(f"No matching items: {len(results)} results (expected 0)")
        # Test with invalid filter syntax
        syntax_error = False
        try:
            self.redis.execute_command('VSIM', self.test_key, 'VALUES', self.dim,
                                       *[str(x) for x in generate_random_vector(self.dim)],
                                       'FILTER', '.category === "books"') # Triple equals is invalid
        except Exception:
            syntax_error = True
        assert syntax_error, "Expected error for invalid filter syntax"
        print("Invalid filter syntax correctly raised an error")
# Test with extremely long complex expression
long_expr = ' and '.join([f'.rating > {i/10}' for i in range(10)])
try:
results = self.redis.execute_command('VSIM', self.test_key, 'VALUES', self.dim,
*[str(x) for x in generate_random_vector(self.dim)],
'FILTER', long_expr)
print(f"Long expression: {len(results)} results")
except Exception as e:
print(f"Long expression error: {str(e)}")
print("\nComprehensive VSIM FILTER tests completed successfully")
class VSIMFilterSelectivityTest(TestCase):
def getname(self):
return "VSIM FILTER selectivity performance benchmark"
def estimated_runtime(self):
return 8 # This test might take up to 8 seconds
def setup(self):
super().setup()
self.dim = 32
self.count = 10000
self.test_key = f"{self.test_key}:selectivity" # Use a different key
def create_vector_with_age_attribute(self, name, age):
"""Create a vector with a specific age attribute"""
vec = generate_random_vector(self.dim)
vec_bytes = struct.pack(f'{self.dim}f', *vec)
self.redis.execute_command('VADD', self.test_key, 'FP32', vec_bytes, name)
self.redis.execute_command('VSETATTR', self.test_key, name, json.dumps({"age": age}))
def test(self):
print("\nRunning VSIM FILTER selectivity benchmark...")
# Create a dataset where we control the exact selectivity
print(f"Creating controlled dataset with {self.count} vectors...")
# Create vectors with age attributes from 1 to 100
for i in range(self.count):
age = (i % 100) + 1 # Ages from 1 to 100
name = f"{self.test_key}:item:{i}"
self.create_vector_with_age_attribute(name, age)
# Create a query vector
query_vec = generate_random_vector(self.dim)
# Test filters with different selectivities
selectivities = [0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.99]
print("\nSelectivity | Filter | Results | Time (ms)")
print("--------------------------------------------------")
for target_selectivity in selectivities:
# Calculate age threshold for desired selectivity
# For example, age <= 10 gives 10% selectivity
age_threshold = int(target_selectivity * 100)
filter_expr = f'.age <= {age_threshold}'
# Run query and measure time
start_time = time.time()
cmd_args = ['VSIM', self.test_key, 'VALUES', self.dim]
cmd_args.extend([str(x) for x in query_vec])
cmd_args.extend(['COUNT', 100, 'FILTER', filter_expr])
results = self.redis.execute_command(*cmd_args)
query_time = time.time() - start_time
            print(f"{target_selectivity:.2f} | {filter_expr:15} | {len(results):7} | {query_time*1000:.1f}")
# Add assertion to ensure reasonable performance for different selectivities
# For very selective queries (1%), we might need more exploration
if target_selectivity <= 0.05:
# For very selective queries, ensure we can find some results
assert len(results) > 0, f"No results found for {filter_expr}"
else:
# For less selective queries, performance should be reasonable
assert query_time < 1.0, f"Query too slow: {query_time:.3f}s for {filter_expr}"
print("\nSelectivity benchmark completed successfully")
class VSIMFilterComparisonTest(TestCase):
def getname(self):
return "VSIM FILTER EF parameter comparison"
def estimated_runtime(self):
return 8 # This test might take up to 8 seconds
def setup(self):
super().setup()
self.dim = 32
self.count = 5000
self.test_key = f"{self.test_key}:efparams" # Use a different key
def create_dataset(self):
"""Create a dataset with specific attribute patterns for testing FILTER-EF"""
vectors = []
names = []
# Create vectors with category and quality score attributes
for i in range(self.count):
vec = generate_random_vector(self.dim)
name = f"{self.test_key}:item:{i}"
# Add vector to Redis
vec_bytes = struct.pack(f'{self.dim}f', *vec)
self.redis.execute_command('VADD', self.test_key, 'FP32', vec_bytes, name)
            # Create attributes - we want a very selective filter.
            # Only ~1% of items have category=premium AND quality>90
            # (10% premium times 10% quality above 90).
            category = "premium" if random.random() < 0.1 else random.choice(["standard", "economy", "basic"])
quality = random.randint(1, 100)
attrs = {
"id": i,
"category": category,
"quality": quality
}
self.redis.execute_command('VSETATTR', self.test_key, name, json.dumps(attrs))
vectors.append(vec)
names.append(name)
return vectors, names
def test(self):
print("\nRunning VSIM FILTER-EF parameter comparison...")
# Create dataset
vectors, names = self.create_dataset()
        # Create a selective filter that matches ~1% of items
filter_expr = '.category == "premium" and .quality > 90'
# Create query vector
query_vec = generate_random_vector(self.dim)
# Test different FILTER-EF values
ef_values = [50, 100, 500, 1000, 5000]
results = []
print("\nFILTER-EF | Results | Time (ms) | Notes")
print("---------------------------------------")
        # Establish the baseline with the highest FILTER-EF first, so that
        # lower EF values can be compared against the most exhaustive search.
        baseline_args = ['VSIM', self.test_key, 'VALUES', self.dim]
        baseline_args.extend([str(x) for x in query_vec])
        baseline_args.extend(['COUNT', 100, 'FILTER', filter_expr, 'FILTER-EF', ef_values[-1]])
        baseline_count = len(self.redis.execute_command(*baseline_args))
        for ef in ef_values:
            # Run query and measure time
            start_time = time.time()
            cmd_args = ['VSIM', self.test_key, 'VALUES', self.dim]
            cmd_args.extend([str(x) for x in query_vec])
            cmd_args.extend(['COUNT', 100, 'FILTER', filter_expr, 'FILTER-EF', ef])
            query_results = self.redis.execute_command(*cmd_args)
            query_time = time.time() - start_time
            recall_rate = len(query_results) / baseline_count if baseline_count > 0 else 1.0
            notes = ""
            if ef == ef_values[-1]:
                notes = "Baseline"
            elif recall_rate < 0.5:
                notes = "Low recall!"
print(f"{ef:9} | {len(query_results):7} | {query_time*1000:.1f} | {notes}")
results.append((ef, len(query_results), query_time))
# If we have enough results at highest EF, check that recall improves with higher EF
if results[-1][1] >= 5: # At least 5 results for highest EF
# Extract result counts
result_counts = [r[1] for r in results]
# The last result (highest EF) should typically find more results than the first (lowest EF)
# but we use a soft assertion to avoid flaky tests
assert result_counts[-1] >= result_counts[0], \
f"Higher FILTER-EF should find at least as many results: {result_counts[-1]} vs {result_counts[0]}"
print("\nFILTER-EF parameter comparison completed successfully")

View file

@ -0,0 +1,56 @@
from test import TestCase, fill_redis_with_vectors, generate_random_vector
import random
class LargeScale(TestCase):
def getname(self):
return "Large Scale Comparison"
def estimated_runtime(self):
return 10
def test(self):
dim = 300
count = 20000
k = 50
# Fill Redis and get reference data for comparison
random.seed(42) # Make test deterministic
data = fill_redis_with_vectors(self.redis, self.test_key, count, dim)
# Generate query vector
query_vec = generate_random_vector(dim)
# Get results from Redis with good exploration factor
redis_raw = self.redis.execute_command('VSIM', self.test_key, 'VALUES', dim,
*[str(x) for x in query_vec],
'COUNT', k, 'WITHSCORES', 'EF', 500)
# Convert Redis results to dict
redis_results = {}
for i in range(0, len(redis_raw), 2):
key = redis_raw[i].decode()
score = float(redis_raw[i+1])
redis_results[key] = score
# Get results from linear scan
linear_results = data.find_k_nearest(query_vec, k)
linear_items = {name: score for name, score in linear_results}
# Compare overlap
redis_set = set(redis_results.keys())
linear_set = set(linear_items.keys())
overlap = len(redis_set & linear_set)
# If test fails, print comparison for debugging
if overlap < k * 0.7:
data.print_comparison({'items': redis_results, 'query_vector': query_vec}, k)
assert overlap >= k * 0.7, \
f"Expected at least 70% overlap in top {k} results, got {overlap/k*100:.1f}%"
# Verify scores for common items
for item in redis_set & linear_set:
redis_score = redis_results[item]
linear_score = linear_items[item]
assert abs(redis_score - linear_score) < 0.01, \
f"Score mismatch for {item}: Redis={redis_score:.3f} Linear={linear_score:.3f}"

View file

@ -0,0 +1,36 @@
from test import TestCase, generate_random_vector
import struct
class MemoryUsageTest(TestCase):
def getname(self):
return "[regression] MEMORY USAGE with attributes"
def test(self):
# Generate random vectors
vec1 = generate_random_vector(4)
vec2 = generate_random_vector(4)
vec_bytes1 = struct.pack('4f', *vec1)
vec_bytes2 = struct.pack('4f', *vec2)
# Add vectors to the key, one with attribute, one without
self.redis.execute_command('VADD', self.test_key, 'FP32', vec_bytes1, f'{self.test_key}:item:1')
self.redis.execute_command('VADD', self.test_key, 'FP32', vec_bytes2, f'{self.test_key}:item:2', 'SETATTR', '{"color":"red"}')
# Get memory usage for the key
try:
memory_usage = self.redis.execute_command('MEMORY', 'USAGE', self.test_key)
# If we got here without exception, the command worked
assert memory_usage > 0, "MEMORY USAGE should return a positive value"
# Add more attributes to increase complexity
self.redis.execute_command('VSETATTR', self.test_key, f'{self.test_key}:item:1', '{"color":"blue","size":10}')
# Check memory usage again
new_memory_usage = self.redis.execute_command('MEMORY', 'USAGE', self.test_key)
assert new_memory_usage > 0, "MEMORY USAGE should still return a positive value after setting attributes"
# Memory usage should be higher after adding attributes
            assert new_memory_usage > memory_usage, "Memory usage should increase after adding attributes"
        except AssertionError:
            raise
        except Exception as e:
            raise AssertionError(f"MEMORY USAGE command failed: {str(e)}")

View file

@ -0,0 +1,85 @@
from test import TestCase, generate_random_vector
import struct
import math
import random
class VectorUpdateAndClusters(TestCase):
def getname(self):
return "VADD vector update with cluster relocation"
def estimated_runtime(self):
return 2.0 # Should take around 2 seconds
def generate_cluster_vector(self, base_vec, noise=0.1):
"""Generate a vector that's similar to base_vec with some noise."""
vec = [x + random.gauss(0, noise) for x in base_vec]
# Normalize
norm = math.sqrt(sum(x*x for x in vec))
return [x/norm for x in vec]
def test(self):
dim = 128
vectors_per_cluster = 5000
# Create two very different base vectors for our clusters
cluster1_base = generate_random_vector(dim)
cluster2_base = [-x for x in cluster1_base] # Opposite direction
# Add vectors from first cluster
for i in range(vectors_per_cluster):
vec = self.generate_cluster_vector(cluster1_base)
vec_bytes = struct.pack(f'{dim}f', *vec)
self.redis.execute_command('VADD', self.test_key, 'FP32', vec_bytes,
f'{self.test_key}:cluster1:{i}')
# Add vectors from second cluster
for i in range(vectors_per_cluster):
vec = self.generate_cluster_vector(cluster2_base)
vec_bytes = struct.pack(f'{dim}f', *vec)
self.redis.execute_command('VADD', self.test_key, 'FP32', vec_bytes,
f'{self.test_key}:cluster2:{i}')
        # Pick a test item from cluster1 (this is an element name inside
        # the set, not a separate Redis key)
        test_item = f'{self.test_key}:cluster1:0'
# Verify it's in cluster1 using VSIM
initial_vec = self.generate_cluster_vector(cluster1_base)
results = self.redis.execute_command('VSIM', self.test_key, 'VALUES', dim,
*[str(x) for x in initial_vec],
'COUNT', 100, 'WITHSCORES')
# Count how many cluster1 items are in top results
cluster1_count = sum(1 for i in range(0, len(results), 2)
if b'cluster1' in results[i])
assert cluster1_count > 80, "Initial clustering check failed"
# Now update the test vector to be in cluster2
new_vec = self.generate_cluster_vector(cluster2_base, noise=0.05)
vec_bytes = struct.pack(f'{dim}f', *new_vec)
        self.redis.execute_command('VADD', self.test_key, 'FP32', vec_bytes, test_item)
# Verify the embedding was actually updated using VEMB
        emb_result = self.redis.execute_command('VEMB', self.test_key, test_item)
updated_vec = [float(x) for x in emb_result]
# Verify updated vector matches what we inserted
dot_product = sum(a*b for a,b in zip(updated_vec, new_vec))
similarity = dot_product / (math.sqrt(sum(x*x for x in updated_vec)) *
math.sqrt(sum(x*x for x in new_vec)))
assert similarity > 0.9, "Vector was not properly updated"
# Verify it's now in cluster2 using VSIM
results = self.redis.execute_command('VSIM', self.test_key, 'VALUES', dim,
*[str(x) for x in cluster2_base],
'COUNT', 100, 'WITHSCORES')
# Verify our updated vector is among top results
found = False
for i in range(0, len(results), 2):
            if results[i].decode() == test_item:
found = True
similarity = float(results[i+1])
assert similarity > 0.80, f"Updated vector has low similarity: {similarity}"
break
assert found, "Updated vector not found in cluster2 proximity"

View file

@ -0,0 +1,83 @@
from test import TestCase, fill_redis_with_vectors, generate_random_vector
import random
class HNSWPersistence(TestCase):
def getname(self):
return "HNSW Persistence"
def estimated_runtime(self):
return 30
def _verify_results(self, key, dim, query_vec, reduced_dim=None):
"""Run a query and return results dict"""
k = 10
        # Queries always pass vectors in the original input dimension: for
        # keys created with REDUCE, the projection is applied server-side.
        args = ['VSIM', key, 'VALUES', dim]
        args.extend([str(x) for x in query_vec])
        args.extend(['COUNT', k, 'WITHSCORES'])
results = self.redis.execute_command(*args)
results_dict = {}
        for i in range(0, len(results), 2):
            item = results[i].decode()
            score = float(results[i+1])
            results_dict[item] = score
return results_dict
def test(self):
# Setup dimensions
dim = 128
reduced_dim = 32
count = 5000
random.seed(42)
# Create two datasets - one normal and one with dimension reduction
normal_data = fill_redis_with_vectors(self.redis, f"{self.test_key}:normal", count, dim)
projected_data = fill_redis_with_vectors(self.redis, f"{self.test_key}:projected",
count, dim, reduced_dim)
# Generate query vectors we'll use before and after reload
query_vec_normal = generate_random_vector(dim)
query_vec_projected = generate_random_vector(dim)
# Get initial results for both sets
initial_normal = self._verify_results(f"{self.test_key}:normal",
dim, query_vec_normal)
initial_projected = self._verify_results(f"{self.test_key}:projected",
dim, query_vec_projected, reduced_dim)
# Force Redis to save and reload the dataset
self.redis.execute_command('DEBUG', 'RELOAD')
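        # (DEBUG RELOAD saves the dataset to RDB and loads it back in place,
        # exercising the module's serialization and deserialization
        # callbacks.)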
# Verify results after reload
reloaded_normal = self._verify_results(f"{self.test_key}:normal",
dim, query_vec_normal)
reloaded_projected = self._verify_results(f"{self.test_key}:projected",
dim, query_vec_projected, reduced_dim)
# Verify normal vectors results
assert len(initial_normal) == len(reloaded_normal), \
"Normal vectors: Result count mismatch before/after reload"
for key in initial_normal:
assert key in reloaded_normal, f"Normal vectors: Missing item after reload: {key}"
assert abs(initial_normal[key] - reloaded_normal[key]) < 0.0001, \
f"Normal vectors: Score mismatch for {key}: " + \
f"before={initial_normal[key]:.6f}, after={reloaded_normal[key]:.6f}"
# Verify projected vectors results
assert len(initial_projected) == len(reloaded_projected), \
"Projected vectors: Result count mismatch before/after reload"
for key in initial_projected:
assert key in reloaded_projected, \
f"Projected vectors: Missing item after reload: {key}"
assert abs(initial_projected[key] - reloaded_projected[key]) < 0.0001, \
f"Projected vectors: Score mismatch for {key}: " + \
f"before={initial_projected[key]:.6f}, after={reloaded_projected[key]:.6f}"

View file

@ -0,0 +1,71 @@
from test import TestCase, fill_redis_with_vectors, generate_random_vector
class Reduce(TestCase):
def getname(self):
return "Dimension Reduction"
def estimated_runtime(self):
return 0.2
def test(self):
original_dim = 100
reduced_dim = 80
count = 1000
k = 50 # Number of nearest neighbors to check
# Fill Redis with vectors using REDUCE and get reference data
data = fill_redis_with_vectors(self.redis, self.test_key, count, original_dim, reduced_dim)
# Verify dimension is reduced
dim = self.redis.execute_command('VDIM', self.test_key)
assert dim == reduced_dim, f"Expected dimension {reduced_dim}, got {dim}"
# Generate query vector and get nearest neighbors using Redis
query_vec = generate_random_vector(original_dim)
redis_raw = self.redis.execute_command('VSIM', self.test_key, 'VALUES',
original_dim, *[str(x) for x in query_vec],
'COUNT', k, 'WITHSCORES')
# Convert Redis results to dict
redis_results = {}
for i in range(0, len(redis_raw), 2):
key = redis_raw[i].decode()
score = float(redis_raw[i+1])
redis_results[key] = score
# Get results from linear scan with original vectors
linear_results = data.find_k_nearest(query_vec, k)
linear_items = {name: score for name, score in linear_results}
# Compare overlap between reduced and non-reduced results
redis_set = set(redis_results.keys())
linear_set = set(linear_items.keys())
overlap = len(redis_set & linear_set)
overlap_ratio = overlap / k
        # With random projection, we expect some loss of accuracy but should
        # maintain at least some similarity structure.
        # Note that the Gaussian distribution is the worst case for this
        # test, so in real-world practice things will be better.
        min_expected_overlap = 0.1 # At least 10% overlap in top-k
        # If the check is about to fail, print a comparison for debugging
        # before raising the assertion.
        if overlap_ratio < min_expected_overlap:
            print("\nLow overlap in results. Details:")
            print("\nTop results from linear scan (original vectors):")
            for name, score in linear_results:
                print(f"{name}: {score:.3f}")
            print("\nTop results from Redis (reduced vectors):")
            for item, score in sorted(redis_results.items(), key=lambda x: x[1], reverse=True):
                print(f"{item}: {score:.3f}")
        assert overlap_ratio >= min_expected_overlap, \
            f"Dimension reduction lost too much structure. Only {overlap_ratio*100:.1f}% overlap in top {k}"
        # For items that appear in both results, scores should be reasonably correlated
        common_items = redis_set & linear_set
        for item in common_items:
            redis_score = redis_results[item]
            linear_score = linear_items[item]
            # Allow for some deviation due to dimensionality reduction
            assert abs(redis_score - linear_score) < 0.2, \
                f"Score mismatch too high for {item}: Redis={redis_score:.3f} Linear={linear_score:.3f}"

View file

@ -0,0 +1,92 @@
from test import TestCase, generate_random_vector
import struct
import random
import time
class ComprehensiveReplicationTest(TestCase):
def getname(self):
return "Comprehensive Replication Test with mixed operations"
def estimated_runtime(self):
# This test will take longer than the default 100ms
return 20.0 # 20 seconds estimate
def test(self):
# Setup replication between primary and replica
assert self.setup_replication(), "Failed to setup replication"
# Test parameters
num_vectors = 5000
vector_dim = 8
delete_probability = 0.1
cas_probability = 0.3
# Keep track of added items for potential deletion
added_items = []
# Add vectors and occasionally delete
for i in range(num_vectors):
# Generate a random vector
vec = generate_random_vector(vector_dim)
vec_bytes = struct.pack(f'{vector_dim}f', *vec)
item_name = f"{self.test_key}:item:{i}"
# Decide whether to use CAS or not
use_cas = random.random() < cas_probability
            if use_cas and added_items:
try:
# Add with CAS
result = self.redis.execute_command('VADD', self.test_key, 'FP32', vec_bytes,
item_name, 'CAS')
# Only add to our list if actually added (CAS might fail)
if result == 1:
added_items.append(item_name)
except Exception as e:
print(f" CAS VADD failed: {e}")
else:
try:
# Add without CAS
result = self.redis.execute_command('VADD', self.test_key, 'FP32', vec_bytes, item_name)
# Only add to our list if actually added
if result == 1:
added_items.append(item_name)
except Exception as e:
print(f" VADD failed: {e}")
# Randomly delete items (with 10% probability)
if random.random() < delete_probability and added_items:
try:
# Select a random item to delete
item_to_delete = random.choice(added_items)
# Delete the item using VREM (not VDEL)
self.redis.execute_command('VREM', self.test_key, item_to_delete)
# Remove from our list
added_items.remove(item_to_delete)
except Exception as e:
print(f" VREM failed: {e}")
# Allow time for replication to complete
time.sleep(2.0)
# Verify final VCARD matches
primary_card = self.redis.execute_command('VCARD', self.test_key)
replica_card = self.replica.execute_command('VCARD', self.test_key)
assert primary_card == replica_card, f"Final VCARD mismatch: primary={primary_card}, replica={replica_card}"
# Verify VDIM matches
primary_dim = self.redis.execute_command('VDIM', self.test_key)
replica_dim = self.replica.execute_command('VDIM', self.test_key)
assert primary_dim == replica_dim, f"VDIM mismatch: primary={primary_dim}, replica={replica_dim}"
# Verify digests match using DEBUG DIGEST
primary_digest = self.redis.execute_command('DEBUG', 'DIGEST-VALUE', self.test_key)
replica_digest = self.replica.execute_command('DEBUG', 'DIGEST-VALUE', self.test_key)
assert primary_digest == replica_digest, f"Digest mismatch: primary={primary_digest}, replica={replica_digest}"
# Print summary
print(f"\n Added and maintained {len(added_items)} vectors with dimension {vector_dim}")
print(f" Final vector count: {primary_card}")
print(f" Final digest: {primary_digest[0].decode()}")

View file

@ -0,0 +1,98 @@
from test import TestCase, generate_random_vector
import threading
import struct
import math
import time
import random
from typing import List, Dict
class ConcurrentCASTest(TestCase):
def getname(self):
return "Concurrent VADD with CAS"
def estimated_runtime(self):
return 1.5
def worker(self, vectors: List[List[float]], start_idx: int, end_idx: int,
dim: int, results: Dict[str, bool]):
"""Worker thread that adds a subset of vectors using VADD CAS"""
for i in range(start_idx, end_idx):
vec = vectors[i]
name = f"{self.test_key}:item:{i}"
vec_bytes = struct.pack(f'{dim}f', *vec)
# Try to add the vector with CAS
try:
result = self.redis.execute_command('VADD', self.test_key, 'FP32',
vec_bytes, name, 'CAS')
results[name] = (result == 1) # Store if it was actually added
except Exception as e:
results[name] = False
print(f"Error adding {name}: {e}")
def verify_vector_similarity(self, vec1: List[float], vec2: List[float]) -> float:
"""Calculate cosine similarity between two vectors"""
dot_product = sum(a*b for a,b in zip(vec1, vec2))
norm1 = math.sqrt(sum(x*x for x in vec1))
norm2 = math.sqrt(sum(x*x for x in vec2))
return dot_product / (norm1 * norm2) if norm1 > 0 and norm2 > 0 else 0
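    # Worked example: vec1=[1,0], vec2=[1,1] gives dot=1 with norms 1 and
    # sqrt(2), so the similarity is 1/sqrt(2) ~= 0.7071; identical
    # directions score 1.0, orthogonal ones 0.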
def test(self):
# Test parameters
dim = 128
total_vectors = 5000
num_threads = 8
vectors_per_thread = total_vectors // num_threads
# Generate all vectors upfront
random.seed(42) # For reproducibility
vectors = [generate_random_vector(dim) for _ in range(total_vectors)]
# Prepare threads and results dictionary
threads = []
results = {} # Will store success/failure for each vector
# Launch threads
for i in range(num_threads):
start_idx = i * vectors_per_thread
end_idx = start_idx + vectors_per_thread if i < num_threads-1 else total_vectors
thread = threading.Thread(target=self.worker,
args=(vectors, start_idx, end_idx, dim, results))
threads.append(thread)
thread.start()
# Wait for all threads to complete
for thread in threads:
thread.join()
# Verify cardinality
card = self.redis.execute_command('VCARD', self.test_key)
assert card == total_vectors, \
f"Expected {total_vectors} elements, but found {card}"
# Verify each vector
num_verified = 0
for i in range(total_vectors):
name = f"{self.test_key}:item:{i}"
# Verify the item was successfully added
assert results[name], f"Vector {name} was not successfully added"
# Get the stored vector
stored_vec_raw = self.redis.execute_command('VEMB', self.test_key, name)
stored_vec = [float(x) for x in stored_vec_raw]
# Verify vector dimensions
assert len(stored_vec) == dim, \
f"Stored vector dimension mismatch for {name}: {len(stored_vec)} != {dim}"
# Calculate similarity with original vector
similarity = self.verify_vector_similarity(vectors[i], stored_vec)
assert similarity > 0.99, \
f"Low similarity ({similarity}) for {name}"
num_verified += 1
# Final verification
assert num_verified == total_vectors, \
f"Only verified {num_verified} out of {total_vectors} vectors"

View file

@ -0,0 +1,41 @@
from test import TestCase
import struct
import math
class VEMB(TestCase):
def getname(self):
return "VEMB Command"
def test(self):
dim = 4
# Add same vector in both formats
vec = [1, 0, 0, 0]
norm = math.sqrt(sum(x*x for x in vec))
vec = [x/norm for x in vec] # Normalize the vector
# Add using FP32
vec_bytes = struct.pack(f'{dim}f', *vec)
self.redis.execute_command('VADD', self.test_key, 'FP32', vec_bytes, f'{self.test_key}:item:1')
# Add using VALUES
self.redis.execute_command('VADD', self.test_key, 'VALUES', dim,
*[str(x) for x in vec], f'{self.test_key}:item:2')
# Get both back with VEMB
result1 = self.redis.execute_command('VEMB', self.test_key, f'{self.test_key}:item:1')
result2 = self.redis.execute_command('VEMB', self.test_key, f'{self.test_key}:item:2')
retrieved_vec1 = [float(x) for x in result1]
retrieved_vec2 = [float(x) for x in result2]
# Compare both vectors with original (allow for small quantization errors)
for i in range(dim):
assert abs(vec[i] - retrieved_vec1[i]) < 0.01, \
f"FP32 vector component {i} mismatch: expected {vec[i]}, got {retrieved_vec1[i]}"
assert abs(vec[i] - retrieved_vec2[i]) < 0.01, \
f"VALUES vector component {i} mismatch: expected {vec[i]}, got {retrieved_vec2[i]}"
# Test non-existent item
result = self.redis.execute_command('VEMB', self.test_key, 'nonexistent')
assert result is None, "Non-existent item should return nil"
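        # Round-trip sketch for the FP32 blob format exercised above: VADD
        # consumes dim raw 32-bit floats (4*dim bytes, in the native byte
        # order produced by struct.pack in these tests):
        #
        #   blob = struct.pack('4f', 0.1, 0.2, 0.3, 0.4)   # 16 bytes
        #   back = list(struct.unpack('4f', blob))         # approx originals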

View file

@ -0,0 +1,55 @@
from test import TestCase, generate_random_vector, fill_redis_with_vectors
import struct
class VRANDMEMBERTest(TestCase):
def getname(self):
return "VRANDMEMBER basic functionality"
def test(self):
# Test with empty key
result = self.redis.execute_command('VRANDMEMBER', self.test_key)
assert result is None, "VRANDMEMBER on non-existent key should return NULL"
result = self.redis.execute_command('VRANDMEMBER', self.test_key, 5)
assert isinstance(result, list) and len(result) == 0, "VRANDMEMBER with count on non-existent key should return empty array"
# Fill with vectors
dim = 4
count = 100
data = fill_redis_with_vectors(self.redis, self.test_key, count, dim)
# Test single random member
result = self.redis.execute_command('VRANDMEMBER', self.test_key)
assert result is not None, "VRANDMEMBER should return a random member"
assert result.decode() in data.names, "Random member should be in the set"
# Test multiple unique members (positive count)
positive_count = 10
result = self.redis.execute_command('VRANDMEMBER', self.test_key, positive_count)
assert isinstance(result, list), "VRANDMEMBER with positive count should return an array"
assert len(result) == positive_count, f"Should return {positive_count} members"
# Check for uniqueness
decoded_results = [r.decode() for r in result]
assert len(decoded_results) == len(set(decoded_results)), "Results should be unique with positive count"
for item in decoded_results:
assert item in data.names, "All returned items should be in the set"
# Test more members than in the set
result = self.redis.execute_command('VRANDMEMBER', self.test_key, count + 10)
assert len(result) == count, "Should return only the available members when asking for more than exist"
# Test with duplicates (negative count)
negative_count = -20
result = self.redis.execute_command('VRANDMEMBER', self.test_key, negative_count)
assert isinstance(result, list), "VRANDMEMBER with negative count should return an array"
assert len(result) == abs(negative_count), f"Should return {abs(negative_count)} members"
# Check that all returned elements are valid
decoded_results = [r.decode() for r in result]
for item in decoded_results:
assert item in data.names, "All returned items should be in the set"
# Test with count = 0 (edge case)
result = self.redis.execute_command('VRANDMEMBER', self.test_key, 0)
assert isinstance(result, list) and len(result) == 0, "VRANDMEMBER with count=0 should return empty array"
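        # Count semantics exercised above (mirroring SRANDMEMBER):
        #   no count    -> a single random element, or nil on a missing key
        #   positive N  -> up to N distinct elements, capped at VCARD
        #   negative N  -> exactly |N| elements, duplicates allowed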

1974
modules/vector-sets/vset.c Normal file

File diff suppressed because it is too large

510
modules/vector-sets/w2v.c Normal file
View file

@ -0,0 +1,510 @@
/*
 * word2vec example and stress-test program for the HNSW
 * (Hierarchical Navigable Small World) implementation, which is
 * based on the paper by Yu. A. Malkov, D. A. Yashunin.
 *
 * Copyright(C) 2024-Present, Redis Ltd. All Rights Reserved.
 * Originally authored by: Salvatore Sanfilippo
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <time.h>
#include <stdint.h>
#include <pthread.h>
#include <stdatomic.h>
#include <math.h>
#include "hnsw.h"
/* Get current time in milliseconds */
uint64_t ms_time(void) {
struct timeval tv;
gettimeofday(&tv, NULL);
return (uint64_t)tv.tv_sec * 1000 + (tv.tv_usec / 1000);
}
/* Implementation of the recall test with random vectors. */
void test_recall(HNSW *index, int ef) {
const int num_test_vectors = 10000;
const int k = 100; // Number of nearest neighbors to find.
if (ef < k) ef = k;
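    /* Recall is computed per query as the size of the intersection between
     * the HNSW results and the exact (linear scan) results, divided by k:
     * e.g. 97 of the 100 true neighbors found means 0.97, which lands in
     * the 96%-98% histogram bin below. */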
// Add recall distribution counters (2% bins from 0-100%).
int recall_bins[50] = {0};
// Create array to store vectors for mixing.
int num_source_vectors = 1000; // Enough, since we mix them.
float **source_vectors = malloc(sizeof(float*) * num_source_vectors);
if (!source_vectors) {
printf("Failed to allocate memory for source vectors\n");
return;
}
// Allocate memory for each source vector.
for (int i = 0; i < num_source_vectors; i++) {
source_vectors[i] = malloc(sizeof(float) * 300);
if (!source_vectors[i]) {
printf("Failed to allocate memory for source vector %d\n", i);
// Clean up already allocated vectors.
for (int j = 0; j < i; j++) free(source_vectors[j]);
free(source_vectors);
return;
}
}
/* Populate source vectors from the index, we just scan the
* first N items. */
int source_count = 0;
hnswNode *current = index->head;
while (current && source_count < num_source_vectors) {
hnsw_get_node_vector(index, current, source_vectors[source_count]);
source_count++;
current = current->next;
}
if (source_count < num_source_vectors) {
printf("Warning: Only found %d nodes for source vectors\n",
source_count);
num_source_vectors = source_count;
}
// Allocate memory for test vector.
float *test_vector = malloc(sizeof(float) * 300);
if (!test_vector) {
printf("Failed to allocate memory for test vector\n");
for (int i = 0; i < num_source_vectors; i++) {
free(source_vectors[i]);
}
free(source_vectors);
return;
}
// Allocate memory for results.
hnswNode **hnsw_results = malloc(sizeof(hnswNode*) * ef);
hnswNode **linear_results = malloc(sizeof(hnswNode*) * ef);
float *hnsw_distances = malloc(sizeof(float) * ef);
float *linear_distances = malloc(sizeof(float) * ef);
if (!hnsw_results || !linear_results || !hnsw_distances || !linear_distances) {
printf("Failed to allocate memory for results\n");
if (hnsw_results) free(hnsw_results);
if (linear_results) free(linear_results);
if (hnsw_distances) free(hnsw_distances);
if (linear_distances) free(linear_distances);
for (int i = 0; i < num_source_vectors; i++) free(source_vectors[i]);
free(source_vectors);
free(test_vector);
return;
}
// Initialize random seed.
srand(time(NULL));
// Perform recall test.
printf("\nPerforming recall test with EF=%d on %d random vectors...\n",
ef, num_test_vectors);
double total_recall = 0.0;
for (int t = 0; t < num_test_vectors; t++) {
// Create a random vector by mixing 3 existing vectors.
float weights[3] = {0.0};
int src_indices[3] = {0};
// Generate random weights.
float weight_sum = 0.0;
for (int i = 0; i < 3; i++) {
weights[i] = (float)rand() / RAND_MAX;
weight_sum += weights[i];
src_indices[i] = rand() % num_source_vectors;
}
// Normalize weights.
for (int i = 0; i < 3; i++) weights[i] /= weight_sum;
// Mix vectors.
memset(test_vector, 0, sizeof(float) * 300);
for (int i = 0; i < 3; i++) {
for (int j = 0; j < 300; j++) {
test_vector[j] +=
weights[i] * source_vectors[src_indices[i]][j];
}
}
// Perform HNSW search with the specified EF parameter.
int slot = hnsw_acquire_read_slot(index);
int hnsw_found = hnsw_search(index, test_vector, ef, hnsw_results, hnsw_distances, slot, 0);
// Perform linear search (ground truth).
int linear_found = hnsw_ground_truth_with_filter(index, test_vector, ef, linear_results, linear_distances, slot, 0, NULL, NULL);
hnsw_release_read_slot(index, slot);
// Calculate recall for this query (intersection size / k).
if (hnsw_found > k) hnsw_found = k;
if (linear_found > k) linear_found = k;
int intersection_count = 0;
for (int i = 0; i < linear_found; i++) {
for (int j = 0; j < hnsw_found; j++) {
if (linear_results[i] == hnsw_results[j]) {
intersection_count++;
break;
}
}
}
double recall = (double)intersection_count / linear_found;
total_recall += recall;
// Add to distribution bins (2% steps)
int bin_index = (int)(recall * 50);
if (bin_index >= 50) bin_index = 49; // Handle 100% recall case
recall_bins[bin_index]++;
// Show progress.
if ((t+1) % 1000 == 0 || t == num_test_vectors-1) {
printf("Processed %d/%d queries, current avg recall: %.2f%%\n",
t+1, num_test_vectors, (total_recall / (t+1)) * 100);
}
}
// Calculate and print final average recall.
double avg_recall = (total_recall / num_test_vectors) * 100;
printf("\nRecall Test Results:\n");
printf("Average recall@%d (EF=%d): %.2f%%\n", k, ef, avg_recall);
// Print recall distribution histogram.
printf("\nRecall Distribution (2%% bins):\n");
printf("================================\n");
// Find the maximum bin count for scaling.
int max_count = 0;
for (int i = 0; i < 50; i++) {
if (recall_bins[i] > max_count) max_count = recall_bins[i];
}
// Scale factor for histogram (max 50 chars wide)
const int max_bars = 50;
double scale = (max_count > max_bars) ? (double)max_bars / max_count : 1.0;
// Print the histogram.
for (int i = 0; i < 50; i++) {
int bar_len = (int)(recall_bins[i] * scale);
printf("%3d%%-%-3d%% | %-6d |", i*2, (i+1)*2, recall_bins[i]);
for (int j = 0; j < bar_len; j++) printf("#");
printf("\n");
}
// Cleanup.
free(hnsw_results);
free(linear_results);
free(hnsw_distances);
free(linear_distances);
free(test_vector);
for (int i = 0; i < num_source_vectors; i++) free(source_vectors[i]);
free(source_vectors);
}
/* Example usage in main() */
int w2v_single_thread(int m_param, int quantization, uint64_t numele, int massdel, int self_recall, int recall_ef) {
/* Create index */
HNSW *index = hnsw_new(300, quantization, m_param);
float v[300];
uint16_t wlen;
FILE *fp = fopen("word2vec.bin","rb");
if (fp == NULL) {
perror("word2vec.bin file missing");
exit(1);
}
unsigned char header[8];
    if (fread(header,8,1,fp) != 1) { // Skip the file header.
        fprintf(stderr, "word2vec.bin: short read on header\n");
        exit(1);
    }
uint64_t id = 0;
uint64_t start_time = ms_time();
char *word = NULL;
hnswNode *search_node = NULL;
while(id < numele) {
if (fread(&wlen,2,1,fp) == 0) break;
word = malloc(wlen+1);
fread(word,wlen,1,fp);
word[wlen] = 0;
fread(v,300*sizeof(float),1,fp);
// Plain API that acquires a write lock for the whole time.
hnswNode *added = hnsw_insert(index, v, NULL, 0, id++, word, 200);
if (!strcmp(word,"banana")) search_node = added;
if (!(id % 10000)) printf("%llu added\n", (unsigned long long)id);
}
uint64_t elapsed = ms_time() - start_time;
fclose(fp);
printf("%llu words added (%llu words/sec), last word: %s\n",
(unsigned long long)index->node_count,
(unsigned long long)id*1000/elapsed, word);
/* Search query */
if (search_node == NULL) search_node = index->head;
hnsw_get_node_vector(index,search_node,v);
hnswNode *neighbors[10];
float distances[10];
int found, j;
start_time = ms_time();
for (j = 0; j < 20000; j++)
found = hnsw_search(index, v, 10, neighbors, distances, 0, 0);
elapsed = ms_time() - start_time;
printf("%d searches performed (%llu searches/sec), nodes found: %d\n",
j, (unsigned long long)j*1000/elapsed, found);
if (found > 0) {
printf("Found %d neighbors:\n", found);
for (int i = 0; i < found; i++) {
printf("Node ID: %llu, distance: %f, word: %s\n",
(unsigned long long)neighbors[i]->id,
distances[i], (char*)neighbors[i]->value);
}
}
// Self-recall test (ability to find the node by its own vector).
if (self_recall) {
hnsw_print_stats(index);
hnsw_test_graph_recall(index,200,0);
}
// Recall test with random vectors.
if (recall_ef > 0) {
test_recall(index, recall_ef);
}
uint64_t connected_nodes;
int reciprocal_links;
hnsw_validate_graph(index, &connected_nodes, &reciprocal_links);
if (massdel) {
int remove_perc = 95;
printf("\nRemoving %d%% of nodes...\n", remove_perc);
uint64_t initial_nodes = index->node_count;
hnswNode *current = index->head;
while (current && index->node_count > initial_nodes*(100-remove_perc)/100) {
hnswNode *next = current->next;
hnsw_delete_node(index,current,free);
current = next;
            // To avoid removing only contiguous nodes, skip a node
            // from time to time.
if (current && !(random() % remove_perc)) current = current->next;
}
printf("%llu nodes left\n", (unsigned long long)index->node_count);
// Test again.
hnsw_validate_graph(index, &connected_nodes, &reciprocal_links);
hnsw_test_graph_recall(index,200,0);
}
hnsw_free(index,free);
return 0;
}
struct threadContext {
pthread_mutex_t FileAccessMutex;
uint64_t numele;
_Atomic uint64_t SearchesDone;
_Atomic uint64_t id;
FILE *fp;
HNSW *index;
float *search_vector;
};
// Note that in practical terms inserting with many concurrent threads
// may be *slower* and not faster, because there is a lot of
// contention. So this is more a robustness test than anything else.
//
// The optimistic commit API goal is actually to exploit the ability to
// add faster when there are many concurrent reads.
void *threaded_insert(void *ctxptr) {
struct threadContext *ctx = ctxptr;
char *word;
float v[300];
uint16_t wlen;
while(1) {
        // Hold the file mutex across all three reads: the word length,
        // the word itself and the embedding must be consumed atomically
        // with respect to the other threads sharing the same FILE pointer.
        pthread_mutex_lock(&ctx->FileAccessMutex);
        if (fread(&wlen,2,1,ctx->fp) == 0) {
            pthread_mutex_unlock(&ctx->FileAccessMutex);
            break;
        }
        word = malloc(wlen+1);
        fread(word,wlen,1,ctx->fp);
        word[wlen] = 0;
        fread(v,300*sizeof(float),1,ctx->fp);
        pthread_mutex_unlock(&ctx->FileAccessMutex);
// Check-and-set API that performs the costly scan for similar
// nodes concurrently with other read threads, and finally
// applies the check if the graph wasn't modified.
InsertContext *ic;
uint64_t next_id = ctx->id++;
ic = hnsw_prepare_insert(ctx->index, v, NULL, 0, next_id, 200);
if (hnsw_try_commit_insert(ctx->index, ic, word) == NULL) {
// This time try locking since the start.
hnsw_insert(ctx->index, v, NULL, 0, next_id, word, 200);
}
if (next_id >= ctx->numele) break;
if (!((next_id+1) % 10000))
printf("%llu added\n", (unsigned long long)next_id+1);
}
return NULL;
}
void *threaded_search(void *ctxptr) {
struct threadContext *ctx = ctxptr;
/* Search query */
hnswNode *neighbors[10];
float distances[10];
int found = 0;
uint64_t last_id = 0;
while(ctx->id < 1000000) {
int slot = hnsw_acquire_read_slot(ctx->index);
found = hnsw_search(ctx->index, ctx->search_vector, 10, neighbors, distances, slot, 0);
hnsw_release_read_slot(ctx->index,slot);
last_id = ++ctx->id;
}
if (found > 0 && last_id == 1000000) {
printf("Found %d neighbors:\n", found);
for (int i = 0; i < found; i++) {
printf("Node ID: %llu, distance: %f, word: %s\n",
(unsigned long long)neighbors[i]->id,
distances[i], (char*)neighbors[i]->value);
}
}
return NULL;
}
int w2v_multi_thread(int m_param, int numthreads, int quantization, uint64_t numele) {
/* Create index */
struct threadContext ctx;
ctx.index = hnsw_new(300, quantization, m_param);
ctx.fp = fopen("word2vec.bin","rb");
if (ctx.fp == NULL) {
perror("word2vec.bin file missing");
exit(1);
}
unsigned char header[8];
    if (fread(header,8,1,ctx.fp) != 1) { // Skip the file header.
        fprintf(stderr, "word2vec.bin: short read on header\n");
        exit(1);
    }
pthread_mutex_init(&ctx.FileAccessMutex,NULL);
uint64_t start_time = ms_time();
ctx.id = 0;
ctx.numele = numele;
pthread_t threads[numthreads];
for (int j = 0; j < numthreads; j++)
pthread_create(&threads[j], NULL, threaded_insert, &ctx);
// Wait for all the threads to terminate adding items.
for (int j = 0; j < numthreads; j++)
pthread_join(threads[j],NULL);
uint64_t elapsed = ms_time() - start_time;
fclose(ctx.fp);
// Obtain the last word.
hnswNode *node = ctx.index->head;
char *word = node->value;
// We will search this last inserted word in the next test.
// Let's save its embedding.
ctx.search_vector = malloc(sizeof(float)*300);
hnsw_get_node_vector(ctx.index,node,ctx.search_vector);
printf("%llu words added (%llu words/sec), last word: %s\n",
(unsigned long long)ctx.index->node_count,
(unsigned long long)ctx.id*1000/elapsed, word);
/* Search query */
start_time = ms_time();
ctx.id = 0; // We will use this atomic field to stop at N queries done.
for (int j = 0; j < numthreads; j++)
pthread_create(&threads[j], NULL, threaded_search, &ctx);
// Wait for all the threads to terminate searching.
for (int j = 0; j < numthreads; j++)
pthread_join(threads[j],NULL);
elapsed = ms_time() - start_time;
printf("%llu searches performed (%llu searches/sec)\n",
(unsigned long long)ctx.id,
(unsigned long long)ctx.id*1000/elapsed);
hnsw_print_stats(ctx.index);
uint64_t connected_nodes;
int reciprocal_links;
hnsw_validate_graph(ctx.index, &connected_nodes, &reciprocal_links);
printf("%llu connected nodes. Links all reciprocal: %d\n",
(unsigned long long)connected_nodes, reciprocal_links);
hnsw_free(ctx.index,free);
return 0;
}
int main(int argc, char **argv) {
int quantization = HNSW_QUANT_NONE;
int numthreads = 0;
uint64_t numele = 20000;
int m_param = 0; // Default value (0 means use HNSW_DEFAULT_M)
/* This you can enable in single thread mode for testing: */
int massdel = 0; // If true, does the mass deletion test.
int self_recall = 0; // If true, does the self-recall test.
int recall_ef = 0; // If not 0, does the recall test with this EF value.
for (int j = 1; j < argc; j++) {
int moreargs = argc-j-1;
if (!strcasecmp(argv[j],"--quant")) {
quantization = HNSW_QUANT_Q8;
} else if (!strcasecmp(argv[j],"--bin")) {
quantization = HNSW_QUANT_BIN;
} else if (!strcasecmp(argv[j],"--mass-del")) {
massdel = 1;
} else if (!strcasecmp(argv[j],"--self-recall")) {
self_recall = 1;
} else if (moreargs >= 1 && !strcasecmp(argv[j],"--recall")) {
recall_ef = atoi(argv[j+1]);
j++;
} else if (moreargs >= 1 && !strcasecmp(argv[j],"--threads")) {
numthreads = atoi(argv[j+1]);
j++;
} else if (moreargs >= 1 && !strcasecmp(argv[j],"--numele")) {
numele = strtoll(argv[j+1],NULL,0);
j++;
if (numele < 1) numele = 1;
} else if (moreargs >= 1 && !strcasecmp(argv[j],"--m")) {
m_param = atoi(argv[j+1]);
j++;
} else if (!strcasecmp(argv[j],"--help")) {
printf("%s [--quant] [--bin] [--thread <count>] [--numele <count>] [--m <count>] [--mass-del] [--self-recall] [--recall <ef>]\n", argv[0]);
exit(0);
} else {
printf("Unrecognized option or wrong number of arguments: %s\n", argv[j]);
exit(1);
}
}
if (quantization == HNSW_QUANT_NONE) {
printf("You can enable quantization with --quant\n");
}
if (numthreads > 0) {
w2v_multi_thread(m_param, numthreads, quantization, numele);
} else {
printf("Single thread execution. Use --threads 4 for concurrent API\n");
w2v_single_thread(m_param, quantization, numele, massdel, self_recall, recall_ef);
}
}
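
/* Usage examples (the example target builds as `w2v`, see the Makefile):
 *
 *   ./w2v --numele 100000 --self-recall   # single thread + self-recall test
 *   ./w2v --recall 200                    # recall test vs linear scan, EF=200
 *   ./w2v --threads 8 --quant             # concurrent API, Q8 quantization
 */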