diff --git a/libs/community/langchain_community/vectorstores/falkordb_vector.py b/libs/community/langchain_community/vectorstores/falkordb_vector.py index c53975a9..d3f70be2 100644 --- a/libs/community/langchain_community/vectorstores/falkordb_vector.py +++ b/libs/community/langchain_community/vectorstores/falkordb_vector.py @@ -28,7 +28,7 @@ def generate_random_string(length: int) -> str: return random_string -DEFAULT_DISTANCE_STRATEGY = DistanceStrategy.COSINE +DEFAULT_DISTANCE_STRATEGY = DistanceStrategy.EUCLIDEAN_DISTANCE DISTANCE_MAPPING = { DistanceStrategy.EUCLIDEAN_DISTANCE: "euclidean", DistanceStrategy.COSINE: "cosine", @@ -384,7 +384,7 @@ def __init__( DistanceStrategy.COSINE, ]: raise ValueError( - "`distance_strategy` must be either 'EULIDEAN_DISTANCE` or `COSINE`" + "`distance_strategy` must be either `EUCLIDEAN_DISTANCE` or `COSINE`" ) # Graph object takes precedent over env or input params @@ -492,6 +492,9 @@ def retrieve_existing_node_index( Check if the vector index exists in the FalkorDB database and returns its embedding dimension, entity_type, entity_label, entity_property + + This version also validates the similarity_function against the configured + distance_strategy, so we don't silently reuse an index with the wrong distance metric. This method; 1. 
queries the FalkorDB database for existing indexes @@ -539,9 +542,20 @@ def retrieve_existing_node_index( entity_type = str(dict["entity_type"]) entity_label = str(dict["entity_label"]) entity_property = str(dict["entity_property"]) + similarity_function = dict.get("index_similarityFunction") break if embedding_dimension and entity_type and entity_label and entity_property: self._index_type = IndexType(entity_type) + desired_sim = DISTANCE_MAPPING[self._distance_strategy] + if similarity_function and similarity_function != desired_sim: + raise ValueError( + f"Existing index on {entity_label}.{entity_property} " + f"uses similarity_function='{similarity_function}', " + f"but requested distance_strategy is '{self._distance_strategy}' " + f"({desired_sim}). " + "Drop/recreate the index or change the distance_strategy." + ) + return embedding_dimension, entity_type, entity_label, entity_property else: return None, None, None, None @@ -729,7 +743,7 @@ def create_new_index_on_relationship( relation_type, embedding_node_property, dim=embedding_dimension, - similarity_function=DISTANCE_MAPPING[DEFAULT_DISTANCE_STRATEGY], + similarity_function=DISTANCE_MAPPING[self._distance_strategy], ) except Exception as e: if "already indexed" in str(e): @@ -949,6 +963,7 @@ def __from( metadatas: Optional[List[dict]] = None, ids: Optional[List[str]] = None, search_type: SearchType = SearchType.VECTOR, + distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, **kwargs: Any, ) -> FalkorDBVector: if ids is None: @@ -960,8 +975,10 @@ def __from( store = cls( embedding=embedding, search_type=search_type, + distance_strategy=distance_strategy, **kwargs, - ) + ) + # Check if the vector index already exists embedding_dimension, index_type, entity_label, entity_property = ( @@ -1139,6 +1156,7 @@ def from_existing_graph( *, search_type: SearchType = DEFAULT_SEARCH_TYPE, retrieval_query: str = "", + distance_strategy = DEFAULT_DISTANCE_STRATEGY, **kwargs: Any, ) -> FalkorDBVector: """ @@ 
-1198,6 +1216,7 @@ def from_existing_graph( retrieval_query=retrieval_query, node_label=node_label, embedding_node_property=embedding_node_property, + distance_strategy=distance_strategy, **kwargs, ) @@ -1444,15 +1463,19 @@ def similarity_search_with_score_by_vector( f"n.{self.embedding_node_property} IS NOT NULL AND " ) - base_cosine_query = ( + if self._distance_strategy == DistanceStrategy.COSINE: + base_distance_query = ( " WITH n as node, " - f" vec.cosineDistance(n.{self.embedding_node_property}" - ", vecf32($embedding)) as score " - ) - + f" vec.cosineDistance(n.{self.embedding_node_property}, vecf32($embedding)) as score " + ) + else: + base_distance_query = ( + " WITH n as node, " + f" vec.euclideanDistance(n.{self.embedding_node_property}, vecf32($embedding)) as score " + ) filter_snippets, filter_params = construct_metadata_filter(filter) - index_query = base_index_query + filter_snippets + base_cosine_query + index_query = base_index_query + filter_snippets + base_distance_query else: index_query = _get_search_index_query(self.search_type, self._index_type) filter_params = {} diff --git a/libs/community/tests/unit_tests/vectorstores/test_falkordb_distance_strategy.py b/libs/community/tests/unit_tests/vectorstores/test_falkordb_distance_strategy.py new file mode 100644 index 00000000..29340c96 --- /dev/null +++ b/libs/community/tests/unit_tests/vectorstores/test_falkordb_distance_strategy.py @@ -0,0 +1,148 @@ +""" +Unit tests for FalkorDBVector distance strategy handling. + +These tests verify that the FalkorDB LangChain wrapper respects the +configured distance strategy when creating relationship indexes, +propagates a custom distance strategy when instantiating from +documents, and builds the correct distance function into the metadata +filter search query. + +The tests use unittest.mock to avoid requiring a live FalkorDB +instance. They focus on the behaviour of the wrapper itself. 
+"""
+
+from typing import Any, List
+from unittest.mock import MagicMock, patch
+
+import pytest
+from langchain_core.documents import Document
+from langchain_core.embeddings import Embeddings
+
+from langchain_community.vectorstores.falkordb_vector import (
+    FalkorDBVector,
+    IndexType,
+)
+from langchain_community.vectorstores.utils import DistanceStrategy
+
+
+class DummyEmbeddings(Embeddings):
+    """Deterministic embeddings stub for tests.
+
+    Subclasses ``Embeddings`` so it can be passed wherever an embeddings
+    object is expected, while returning trivial fixed-size vectors so the
+    tests need no external embedding model.
+    """
+
+    def __init__(self, size: int = 2) -> None:
+        self.size = size
+
+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        # One distinct vector per document, each of dimension ``self.size``.
+        return [[float(i)] * self.size for i in range(1, len(texts) + 1)]
+
+    def embed_query(self, text: str) -> List[float]:
+        # Any query maps to the same all-ones vector of dimension ``self.size``.
+        return [1.0] * self.size
+
+
+def _make_fake_graph() -> MagicMock:
+    """Return a mock graph wrapper with mock ``_graph`` and ``_driver``."""
+    fake_graph = MagicMock()
+    fake_graph._graph = MagicMock()
+    fake_graph._driver = MagicMock()
+    return fake_graph
+
+
+def _issued_search_query(store: FalkorDBVector) -> str:
+    """Run a metadata-filtered search and return the query it issued.
+
+    ``store._query`` is patched for the duration of the call, so no
+    database is contacted.
+
+    Args:
+        store: The vector store under test.
+
+    Returns:
+        The text of the last query passed to ``store._query``.
+    """
+    issued: List[str] = []
+
+    def record_query(query: str, params: Any = None) -> List[Any]:
+        issued.append(query)
+        return []
+
+    with patch.object(store, "_query", side_effect=record_query):
+        store.similarity_search_with_score_by_vector(
+            embedding=[0.1, 0.2], k=1, filter={"lang": "en"}
+        )
+    return issued[-1]
+
+
+@pytest.mark.parametrize(
+    ("strategy", "expected_metric"),
+    [
+        (DistanceStrategy.COSINE, "cosine"),
+        (DistanceStrategy.EUCLIDEAN_DISTANCE, "euclidean"),
+    ],
+)
+def test_create_new_index_on_relationship_respects_strategy(
+    strategy: DistanceStrategy, expected_metric: str
+) -> None:
+    """Ensure that create_new_index_on_relationship uses the configured metric."""
+    fake_graph = _make_fake_graph()
+
+    store = FalkorDBVector(
+        embedding=DummyEmbeddings(),
+        graph=fake_graph,
+        relation_type="REL",
+        embedding_node_property="embedding",
+        embedding_dimension=2,
+        distance_strategy=strategy,
+    )
+
+    store.create_new_index_on_relationship()
+    # The index must be created once, with the metric mapped from the strategy.
+    create_index = fake_graph._graph.create_edge_vector_index
+    assert create_index.call_count == 1
+    _, kwargs = create_index.call_args
+    assert kwargs["similarity_function"] == expected_metric
+
+
+def test_from_documents_propagates_distance_strategy() -> None:
+    """Ensure that from_documents forwards distance_strategy to the store."""
+    docs = [Document(page_content="alpha"), Document(page_content="beta")]
+    store = FalkorDBVector.from_documents(
+        documents=docs,
+        embedding=DummyEmbeddings(),
+        graph=_make_fake_graph(),
+        embedding_dimension=2,
+        node_label="Test",
+        distance_strategy=DistanceStrategy.COSINE,
+    )
+
+    assert store._distance_strategy == DistanceStrategy.COSINE
+
+
+@pytest.mark.parametrize(
+    ("strategy", "expected_function"),
+    [
+        (DistanceStrategy.COSINE, "vec.cosineDistance"),
+        (DistanceStrategy.EUCLIDEAN_DISTANCE, "vec.euclideanDistance"),
+    ],
+)
+def test_similarity_search_with_score_by_vector_uses_correct_distance(
+    strategy: DistanceStrategy, expected_function: str
+) -> None:
+    """Ensure metadata-filtered vector search uses the correct distance function."""
+    store = FalkorDBVector(
+        embedding=DummyEmbeddings(),
+        graph=_make_fake_graph(),
+        node_label="Chunk",
+        embedding_node_property="embedding",
+        embedding_dimension=2,
+        distance_strategy=strategy,
+    )
+    # Manually set index type for query construction
+    store._index_type = IndexType.NODE
+
+    assert expected_function in _issued_search_query(store)