Skip to content

Commit eff8b4f

Browse files
committed
feat: Add index-level STOPWORDS configuration support
Add support for configuring stopwords at index creation time via the IndexInfo.stopwords field.

- Add stopwords field to IndexInfo class (None/[]/custom list)
- Update SearchIndex.create() and AsyncSearchIndex.create() to pass stopwords
- Update convert_index_info_to_schema() to parse stopwords from FT.INFO
- Update cluster_create_index() functions to accept stopwords parameter
- Add warning when using query-time stopwords with index-level STOPWORDS 0
- Add comprehensive documentation in 11_advanced_queries.ipynb
- Create stopwords_interaction_guide.md explaining best practices
1 parent 94e2ff2 commit eff8b4f

File tree

7 files changed

+1055
-68
lines changed

7 files changed

+1055
-68
lines changed

docs/user_guide/11_advanced_queries.ipynb

Lines changed: 485 additions & 61 deletions
Large diffs are not rendered by default.

redisvl/index/index.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@
7979
BaseVectorQuery,
8080
CountQuery,
8181
FilterQuery,
82+
TextQuery,
8283
)
8384
from redisvl.query.filter import FilterExpression
8485
from redisvl.redis.connection import (
@@ -248,6 +249,30 @@ def _validate_query(self, query: BaseQuery) -> None:
248249
"Vector field using 'flat' algorithm does not support EF_RUNTIME query parameter."
249250
)
250251

252+
# Warn if using query-time stopwords with index-level STOPWORDS 0
253+
if isinstance(query, TextQuery):
254+
index_stopwords = self.schema.index.stopwords
255+
query_stopwords = query.stopwords
256+
257+
# Check if index has STOPWORDS 0 (empty list) and query has stopwords configured
258+
# Note: query.stopwords is a set, and when stopwords=None is passed to TextQuery,
259+
# it becomes an empty set. So we check if the set is non-empty.
260+
if (
261+
index_stopwords is not None
262+
and len(index_stopwords) == 0
263+
and len(query_stopwords) > 0
264+
):
265+
warnings.warn(
266+
"Query-time stopwords are configured but the index has STOPWORDS 0 (stopwords = []). "
267+
"This is counterproductive: all words including common words like 'of', 'the', 'a' are indexed, "
268+
"but your query-time stopwords will filter them from the search query. "
269+
"This makes your search less precise than it could be. "
270+
"Consider setting stopwords=None in TextQuery to search for all indexed words. "
271+
"See docs/stopwords_interaction_guide.md for more information.",
272+
UserWarning,
273+
stacklevel=3,
274+
)
275+
251276
@property
252277
def name(self) -> str:
253278
"""The name of the Redis search index."""
@@ -601,17 +626,22 @@ def create(self, overwrite: bool = False, drop: bool = False) -> None:
601626
definition = IndexDefinition(
602627
prefix=[self.schema.index.prefix], index_type=self._storage.type
603628
)
629+
# Extract stopwords from schema
630+
stopwords = self.schema.index.stopwords
631+
604632
if isinstance(self._redis_client, RedisCluster):
605633
cluster_create_index(
606634
index_name=self.name,
607635
client=self._redis_client,
608636
fields=redis_fields,
609637
definition=definition,
638+
stopwords=stopwords,
610639
)
611640
else:
612641
self._redis_client.ft(self.name).create_index(
613642
fields=redis_fields,
614643
definition=definition,
644+
stopwords=stopwords,
615645
)
616646
except redis.exceptions.RedisError as e:
617647
raise RedisSearchError(
@@ -1384,17 +1414,22 @@ async def create(self, overwrite: bool = False, drop: bool = False) -> None:
13841414
definition = IndexDefinition(
13851415
prefix=[self.schema.index.prefix], index_type=self._storage.type
13861416
)
1417+
# Extract stopwords from schema
1418+
stopwords = self.schema.index.stopwords
1419+
13871420
if isinstance(client, AsyncRedisCluster):
13881421
await async_cluster_create_index(
13891422
index_name=self.schema.index.name,
13901423
client=client,
13911424
fields=redis_fields,
13921425
definition=definition,
1426+
stopwords=stopwords,
13931427
)
13941428
else:
13951429
await client.ft(self.schema.index.name).create_index(
13961430
fields=redis_fields,
13971431
definition=definition,
1432+
stopwords=stopwords,
13981433
)
13991434
except redis.exceptions.RedisError as e:
14001435
raise RedisSearchError(

redisvl/redis/connection.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,17 @@ def convert_index_info_to_schema(index_info: Dict[str, Any]) -> Dict[str, Any]:
204204
prefixes = prefixes[0]
205205
storage_type = index_info["index_definition"][1].lower()
206206

207+
# Parse stopwords if present in FT.INFO output
208+
# stopwords_list is only present when explicitly set (STOPWORDS 0 or custom list)
209+
# If not present, we use None to indicate default Redis behavior
210+
stopwords = None
211+
if "stopwords_list" in index_info:
212+
# Convert bytes to strings if needed
213+
stopwords_list = index_info["stopwords_list"]
214+
stopwords = [
215+
sw.decode("utf-8") if isinstance(sw, bytes) else sw for sw in stopwords_list
216+
]
217+
207218
index_fields = index_info["attributes"]
208219

209220
def parse_vector_attrs(attrs):
@@ -411,8 +422,12 @@ def parse_attrs(attrs, field_type=None):
411422
# append field
412423
schema_fields.append(field)
413424

425+
index_dict = {"name": index_name, "prefix": prefixes, "storage_type": storage_type}
426+
if stopwords is not None:
427+
index_dict["stopwords"] = stopwords
428+
414429
return {
415-
"index": {"name": index_name, "prefix": prefixes, "storage_type": storage_type},
430+
"index": index_dict,
416431
"fields": schema_fields,
417432
}
418433

redisvl/schema/schema.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
1-
import re
21
from collections.abc import Mapping, Sequence
32
from enum import Enum
43
from pathlib import Path
5-
from typing import Any, Dict, List, Literal, Union
4+
from typing import Any, Dict, List, Literal, Optional, Union
65

76
import yaml
87
from pydantic import BaseModel, Field, model_validator
@@ -31,7 +30,7 @@ class StorageType(Enum):
3130

3231
class IndexInfo(BaseModel):
3332
"""Index info includes the essential details regarding index settings,
34-
such as its name, prefix, key separator, and storage type in Redis.
33+
such as its name, prefix, key separator, storage type, and stopwords in Redis.
3534
3635
In yaml format, the index info section looks like:
3736
@@ -42,6 +41,7 @@ class IndexInfo(BaseModel):
4241
prefix: user
4342
key_separator: ':'
4443
storage_type: json
44+
stopwords: [] # Disable stopwords (STOPWORDS 0)
4545
4646
In dict format, the index info section looks like:
4747
@@ -51,7 +51,8 @@ class IndexInfo(BaseModel):
5151
"name": "user-index",
5252
"prefix": "user",
5353
"key_separator": ":",
54-
"storage_type": "json"
54+
"storage_type": "json",
55+
"stopwords": ["the", "a", "an"] # Custom stopwords
5556
}}
5657
5758
"""
@@ -64,6 +65,9 @@ class IndexInfo(BaseModel):
6465
"""The separator character used in designing Redis keys."""
6566
storage_type: StorageType = StorageType.HASH
6667
"""The storage type used in Redis (e.g., 'hash' or 'json')."""
68+
stopwords: Optional[List[str]] = None
69+
"""Index-level stopwords configuration. None (default) uses Redis default stopwords,
70+
empty list [] disables stopwords (STOPWORDS 0), or provide a custom list of stopwords."""
6771

6872

6973
class IndexSchema(BaseModel):

0 commit comments

Comments
 (0)