fix(recall): reject empty queries with 400 and fix SQL parameter gap (#632)

nicoloboschi · web-flow · commit 5cdc714a387a · 2026-03-21T20:24:36.000+01:00
* fix(recall): reject empty queries with 400 and fix SQL parameter gap causing IndeterminateDatatypeError

When query text contains only punctuation/symbols (no word characters after
normalization), the BM25 arms are skipped, but the old code still placed `limit`
at $3 in the params list. If tags or tag_groups were also set, their params
($4+) were referenced in the SQL while $3 was never referenced, leaving a gap
that caused PostgreSQL to raise IndeterminateDatatypeError.

Fix the parameter layout so `limit` is only appended to params when tokens are
present (i.e. when the BM25 arms actually use LIMIT $3), and shift tags_param_idx
from 4 to 3 in the no-tokens path.

Also add a field_validator on RecallRequest.query that rejects empty-after-
normalization queries at the API layer with a 400 before they reach the DB.

* refactor: extract tokenize_query helper and reuse in RecallRequest validator
diff --git a/hindsight-api-slim/hindsight_api/api/http.py b/hindsight-api-slim/hindsight_api/api/http.py
@@ -169,6 +169,15 @@ class RecallRequest(BaseModel):
         "Each group is a leaf {tags, match} or compound {and: [...]}, {or: [...]}, {not: ...}.",
     )
 
+    @field_validator("query")
+    @classmethod
+    def validate_query_not_empty(cls, v: str) -> str:
+        from ..engine.search.retrieval import tokenize_query  # same tokenizer the BM25 retrieval path uses
+        # Reject queries that normalize to zero tokens (e.g. punctuation only); valid queries pass through unchanged.
+        if not tokenize_query(v):
+            raise ValueError("query must contain at least one word character after normalization")
+        return v
+
     @model_validator(mode="after")
     def validate_tags_exclusive(self) -> "RecallRequest":
         if self.tags is not None and self.tag_groups is not None:
diff --git a/hindsight-api-slim/hindsight_api/engine/search/retrieval.py b/hindsight-api-slim/hindsight_api/engine/search/retrieval.py
@@ -10,6 +10,7 @@
 
 import asyncio
 import logging
+import re
 from dataclasses import dataclass, field
 from datetime import UTC, datetime
 from typing import Optional
@@ -26,6 +27,15 @@
 logger = logging.getLogger(__name__)
 
 
+def tokenize_query(query_text: str) -> list[str]:
+    """Normalize query text and split into BM25 tokens.
+
+    Lowercases, replaces every non-word, non-whitespace character with a space
+    (so "foo-bar" yields two tokens), and splits on whitespace; [] if no tokens.
+    """
+    return re.sub(r"[^\w\s]", " ", query_text.lower()).split()
+
+
 @dataclass
 class ParallelRetrievalResult:
     """Result from parallel retrieval across all methods."""
@@ -129,12 +139,9 @@ async def retrieve_semantic_bm25_combined(
     Returns:
         Dict mapping fact_type -> (semantic_results, bm25_results)
     """
-    import re
-
     result_dict: dict[str, tuple[list[RetrievalResult], list[RetrievalResult]]] = {ft: ([], []) for ft in fact_types}
 
-    sanitized_text = re.sub(r"[^\w\s]", " ", query_text.lower())
-    tokens = [token for token in sanitized_text.split() if token]
+    tokens = tokenize_query(query_text)
 
     # Over-fetch for HNSW approximation; semantic results trimmed to limit in Python.
     hnsw_fetch = max(limit * 5, 100)
@@ -148,11 +155,15 @@ async def retrieve_semantic_bm25_combined(
     # --- Parameter layout ---
     # $1 = query_emb_str  (semantic arms)
     # $2 = bank_id
-    # $3 = limit          (BM25 LIMIT; semantic uses inlined hnsw_fetch literal)
-    # $4 = bm25_text      (only when tokens present)
-    # $N = tags           (N=4 when no tokens, N=5 when tokens present)
-    # $M+ = tag_groups params (one per leaf, starting after tags param)
-    tags_param_idx = 5 if tokens else 4
+    # When tokens present:
+    #   $3 = limit          (BM25 LIMIT; semantic uses inlined hnsw_fetch literal)
+    #   $4 = bm25_text
+    #   $5 = tags           (if present)
+    #   $6+ = tag_groups params (one per leaf)
+    # When no tokens (limit is omitted from params, so $3 is reused and no unreferenced, untyped gap remains):
+    #   $3 = tags           (if present)
+    #   $4+ = tag_groups params (one per leaf)
+    tags_param_idx = 5 if tokens else 3
     tags_clause = build_tags_where_clause_simple(tags, tags_param_idx, match=tags_match)
 
     # tag_groups params start immediately after the tags param slot
@@ -222,9 +233,10 @@ async def retrieve_semantic_bm25_combined(
 
     query = "\nUNION ALL\n".join(arms)
 
-    params: list = [query_emb_str, bank_id, limit]
+    params: list = [query_emb_str, bank_id]
     if tokens:
-        params.append(bm25_text_param)
+        params.append(limit)  # $3: BM25 LIMIT (only referenced when tokens are present)
+        params.append(bm25_text_param)  # $4
     if tags:
         params.append(tags)
     params.extend(groups_params)