@@ -115,7 +115,29 @@ def __init__(self,
115
115
path = self .chroma_db_path ,
116
116
settings = Settings (allow_reset = False )
117
117
)
118
- self .collection = self .client .get_collection (self .collection_name )
118
+
119
+ # Initialize CLIP model for consistent embeddings with the database
120
+ logger .info ("Loading CLIP model for embedding compatibility..." )
121
+ self .clip_model = SentenceTransformer ('clip-ViT-B-32' )
122
+ logger .info ("CLIP model loaded successfully" )
123
+
124
+ # Create CLIP embedding function to match the one used during data creation
125
+ from chromadb .utils import embedding_functions
126
+
127
+ class CLIPEmbeddingFunction (embedding_functions .EmbeddingFunction ):
128
+ def __init__ (self , clip_model ):
129
+ self .clip_model = clip_model
130
+
131
+ def __call__ (self , input ):
132
+ """Generate embeddings using CLIP model"""
133
+ embeddings = []
134
+ for text in input :
135
+ embedding = self .clip_model .encode (text )
136
+ embeddings .append (embedding .tolist ())
137
+ return embeddings
138
+
139
+ self .clip_ef = CLIPEmbeddingFunction (self .clip_model )
140
+ self .collection = self .client .get_collection (self .collection_name , embedding_function = self .clip_ef )
119
141
120
142
# Initialize Ollama client
121
143
self .ollama_client = ollama .Client (host = self .ollama_url )
@@ -139,12 +161,18 @@ def _cosine_similarity(self, query_embedding: np.ndarray, doc_embeddings: np.nda
139
161
return np .dot (doc_norms , query_norm )
140
162
141
163
def search (self , query : str , top_k : int = 10 ) -> List [Dict [str , Any ]]:
142
- results = self .collection .query (query_texts = [query ], n_results = top_k )
164
+ # Search only text documents (filter out image documents for now)
165
+ results = self .collection .query (
166
+ query_texts = [query ],
167
+ n_results = top_k ,
168
+ where = {"content_type" : "text" } # Only search text documents
169
+ )
143
170
docs = [
144
171
{
145
172
"content" : doc ,
146
173
"title" : meta ["title" ],
147
- "distance" : dist
174
+ "distance" : dist ,
175
+ "content_type" : meta .get ("content_type" , "text" )
148
176
}
149
177
for doc , meta , dist in zip (results ["documents" ][0 ], results ["metadatas" ][0 ], results ["distances" ][0 ])
150
178
]
0 commit comments