Skip to content

Commit 46a5d9e

Browse files
committed
fix: resolve 25 test regressions from streaming retain pipeline (#722)
The 3-phase retain pipeline (914ba79) introduced several regressions:

1. **Per-content tags lost** — the streaming pipeline used `contents[0].tags` for ALL chunks, breaking tag-based visibility. Fixed by tracking a chunk-to-content mapping so each chunk uses its source content's tags.
2. **Multi-document batches broken** — batches with per-content `document_id` values were merged into a single document. Fixed by grouping by `document_id` and processing each group independently.
3. **Migration ID collision** — `d6e7f8a9b0c1` was used by both `drop_documents_metadata` and `case_insensitive_entities_trgm_index`. Renamed the trgm migration to `e8f9a0b1c2d3`, fixed the revision chain, and added the missing schema prefix on `DROP INDEX`.
4. **Graph entity inheritance** — `get_graph_data` queried entities for observation IDs only, but observations inherit entities from their source memories. Fixed by querying `all_relevant_ids`.
5. **Docstring false positives** — docstrings in `link_utils.py` triggered the SQL schema safety test's unqualified-table-reference check.
6. **Config test count** — `retain_chunk_batch_size` was added to `_CONFIGURABLE_FIELDS` without updating the test assertion.
1 parent 1a1fb35 commit 46a5d9e

File tree

6 files changed

+78
-24
lines changed

6 files changed

+78
-24
lines changed

hindsight-api-slim/hindsight_api/alembic/versions/a4b5c6d7e8f9_fix_per_bank_vector_index_type.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""Fix per-bank vector indexes to match configured extension
22
33
Revision ID: a4b5c6d7e8f9
4-
Revises: c2d3e4f5g6h7, c5d6e7f8a9b0
4+
Revises: e8f9a0b1c2d3
55
Create Date: 2026-04-01
66
77
Migration d5e6f7a8b9c0 hardcoded HNSW when creating per-bank partial vector
@@ -21,7 +21,7 @@
2121
from sqlalchemy import text
2222

2323
revision: str = "a4b5c6d7e8f9"
24-
down_revision: str | Sequence[str] | None = "d6e7f8a9b0c1"
24+
down_revision: str | Sequence[str] | None = "e8f9a0b1c2d3"
2525
branch_labels: str | Sequence[str] | None = None
2626
depends_on: str | Sequence[str] | None = None
2727

hindsight-api-slim/hindsight_api/alembic/versions/d6e7f8a9b0c1_case_insensitive_entities_trgm_index.py renamed to hindsight-api-slim/hindsight_api/alembic/versions/e8f9a0b1c2d3_case_insensitive_entities_trgm_index.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,17 +4,17 @@
44
"Alice" and "alice" to have different trigram sets. This recreates it on
55
LOWER(canonical_name) so the % operator matches case-insensitively.
66
7-
Revision ID: d6e7f8a9b0c1
8-
Revises: c5d6e7f8a9b0
7+
Revision ID: e8f9a0b1c2d3
8+
Revises: d6e7f8a9b0c1
99
Create Date: 2026-03-31
1010
"""
1111

1212
from collections.abc import Sequence
1313

1414
from alembic import context, op
1515

16-
revision: str = "d6e7f8a9b0c1"
17-
down_revision: str | Sequence[str] | None = "c5d6e7f8a9b0"
16+
revision: str = "e8f9a0b1c2d3"
17+
down_revision: str | Sequence[str] | None = "d6e7f8a9b0c1"
1818
branch_labels: str | Sequence[str] | None = None
1919
depends_on: str | Sequence[str] | None = None
2020

@@ -27,7 +27,7 @@ def _get_schema_prefix() -> str:
2727
def upgrade() -> None:
2828
schema = _get_schema_prefix()
2929
# Drop the old case-sensitive trigram index
30-
op.execute(f"DROP INDEX IF EXISTS entities_canonical_name_trgm_idx")
30+
op.execute(f"DROP INDEX IF EXISTS {schema}entities_canonical_name_trgm_idx")
3131
# Create case-insensitive trigram index on LOWER(canonical_name)
3232
op.execute(
3333
f"CREATE INDEX IF NOT EXISTS entities_canonical_name_lower_trgm_idx "
@@ -36,8 +36,8 @@ def upgrade() -> None:
3636

3737

3838
def downgrade() -> None:
39-
op.execute(f"DROP INDEX IF EXISTS entities_canonical_name_lower_trgm_idx")
4039
schema = _get_schema_prefix()
40+
op.execute(f"DROP INDEX IF EXISTS {schema}entities_canonical_name_lower_trgm_idx")
4141
# Restore original case-sensitive index
4242
op.execute(
4343
f"CREATE INDEX IF NOT EXISTS entities_canonical_name_trgm_idx "

hindsight-api-slim/hindsight_api/engine/memory_engine.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4338,15 +4338,16 @@ async def get_graph_data(
43384338
link for link in links if link["from_unit_id"] in unit_id_set and link["to_unit_id"] in unit_id_set
43394339
]
43404340

4341-
# Get entity information — only for visible units
4342-
if unit_ids:
4341+
# Get entity information — for visible units AND their source memories
4342+
# (observations inherit entities from source memories)
4343+
if all_relevant_ids:
43434344
unit_entities = await conn.fetch(f"""
43444345
SELECT ue.unit_id, e.canonical_name
43454346
FROM {fq_table("unit_entities")} ue
43464347
JOIN {fq_table("entities")} e ON ue.entity_id = e.id
43474348
WHERE ue.unit_id = ANY($1::uuid[])
43484349
ORDER BY ue.unit_id
4349-
""", unit_ids)
4350+
""", all_relevant_ids)
43504351
else:
43514352
unit_entities = []
43524353

hindsight-api-slim/hindsight_api/engine/retain/link_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ async def _bulk_insert_links(
5858
chunk_size: int = 5000,
5959
skip_exists_check: bool = False,
6060
) -> None:
61-
"""Insert links into memory_links using sorted bulk INSERT FROM unnest().
61+
"""Bulk-insert links using sorted INSERT FROM unnest().
6262
6363
Sorting by (from_unit_id, to_unit_id) ensures all concurrent transactions
6464
acquire index locks in the same order, eliminating circular-wait deadlocks.
@@ -944,7 +944,7 @@ async def create_semantic_links_batch(
944944

945945
async def insert_entity_links_batch(conn, links: list[EntityLink], bank_id: str, chunk_size: int = 5000):
946946
"""
947-
Insert entity links into memory_links via sorted bulk INSERT FROM unnest().
947+
Bulk-insert entity links via sorted INSERT FROM unnest().
948948
949949
Args:
950950
conn: Database connection

hindsight-api-slim/hindsight_api/engine/retain/orchestrator.py

Lines changed: 62 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -434,6 +434,53 @@ async def retain_batch(
434434
# Convert dicts to RetainContent objects
435435
contents = _build_contents(contents_dicts, document_tags)
436436

437+
# When contents have multiple distinct per-content document_ids and no
438+
# batch-level document_id, group by doc_id and process each group
439+
# independently so each document is tracked separately.
440+
if not document_id:
441+
per_content_doc_ids = [item.get("document_id") for item in contents_dicts]
442+
unique_doc_ids = {d for d in per_content_doc_ids if d}
443+
if len(unique_doc_ids) > 1:
444+
# Group contents by document_id, preserving original order
445+
groups: dict[str, tuple[list[RetainContentDict], list[RetainContent]]] = {}
446+
original_indices: dict[str, list[int]] = {}
447+
for idx, (cd, c) in enumerate(zip(contents_dicts, contents)):
448+
doc_key = cd.get("document_id") or str(uuid.uuid4())
449+
if doc_key not in groups:
450+
groups[doc_key] = ([], [])
451+
original_indices[doc_key] = []
452+
groups[doc_key][0].append(cd)
453+
groups[doc_key][1].append(c)
454+
original_indices[doc_key].append(idx)
455+
456+
# Process each group and merge results back in original order
457+
result_unit_ids: list[list[str]] = [[] for _ in contents_dicts]
458+
total_usage = TokenUsage()
459+
for doc_key, (group_dicts, group_contents) in groups.items():
460+
group_ids, group_usage = await retain_batch(
461+
pool=pool,
462+
embeddings_model=embeddings_model,
463+
llm_config=llm_config,
464+
entity_resolver=entity_resolver,
465+
format_date_fn=format_date_fn,
466+
bank_id=bank_id,
467+
contents_dicts=group_dicts,
468+
config=config,
469+
document_id=doc_key,
470+
is_first_batch=is_first_batch,
471+
fact_type_override=fact_type_override,
472+
document_tags=document_tags,
473+
operation_id=operation_id,
474+
schema=schema,
475+
outbox_callback=outbox_callback,
476+
db_semaphore=db_semaphore,
477+
)
478+
for group_idx, orig_idx in enumerate(original_indices[doc_key]):
479+
if group_idx < len(group_ids):
480+
result_unit_ids[orig_idx] = group_ids[group_idx]
481+
total_usage = total_usage + group_usage
482+
return result_unit_ids, total_usage
483+
437484
# Resolve effective document_id early so both delta and streaming paths
438485
# can find existing chunks from a prior attempt. On retry, the generated
439486
# document_id is recovered from operation result_metadata.
@@ -508,10 +555,12 @@ async def retain_batch(
508555
# retain code paths.
509556
chunk_batch_size = getattr(config, "retain_chunk_batch_size", 100)
510557
chunk_size = getattr(config, "retain_chunk_size", 3000)
511-
all_pre_chunks = []
512-
for content in contents:
558+
all_pre_chunks: list[str] = []
559+
chunk_to_content: list[int] = [] # maps chunk index -> index into contents
560+
for content_idx, content in enumerate(contents):
513561
content_chunks = fact_extraction.chunk_text(content.content, chunk_size)
514562
all_pre_chunks.extend(content_chunks)
563+
chunk_to_content.extend([content_idx] * len(content_chunks))
515564

516565
total_pre_chunks = len(all_pre_chunks)
517566
num_batches = (total_pre_chunks + chunk_batch_size - 1) // chunk_batch_size if total_pre_chunks > 0 else 1
@@ -538,6 +587,7 @@ async def retain_batch(
538587
log_buffer=log_buffer,
539588
start_time=start_time,
540589
all_pre_chunks=all_pre_chunks,
590+
chunk_to_content=chunk_to_content,
541591
chunk_batch_size=chunk_batch_size,
542592
operation_id=operation_id,
543593
schema=schema,
@@ -676,6 +726,7 @@ async def _streaming_retain_batch(
676726
log_buffer: list[str],
677727
start_time: float,
678728
all_pre_chunks: list[str],
729+
chunk_to_content: list[int],
679730
chunk_batch_size: int,
680731
operation_id: str | None = None,
681732
schema: str | None = None,
@@ -704,8 +755,8 @@ async def _streaming_retain_batch(
704755
# operation result_metadata on retry).
705756
effective_doc_id = document_id
706757

707-
# Use the first content item as the template for metadata (context, event_date, etc.)
708-
template_content = contents[0] if contents else RetainContent(content="")
758+
# Default template for metadata (context, event_date, etc.) when content list is empty.
759+
_default_content = RetainContent(content="")
709760

710761
# Load existing chunk hashes BEFORE document tracking to detect recovery.
711762
# If chunks exist AND the document content hash matches, this is a retry of
@@ -774,14 +825,15 @@ async def _streaming_retain_batch(
774825
# it pushes the enriched result into the queue for the DB consumer.
775826
async def _llm_producer() -> None:
776827
async def _extract_one(global_idx: int, chunk_text: str) -> None:
828+
source = contents[chunk_to_content[global_idx]] if contents else _default_content
777829
content = RetainContent(
778830
content=chunk_text,
779-
context=template_content.context,
780-
event_date=template_content.event_date,
781-
metadata=template_content.metadata,
782-
entities=template_content.entities,
783-
tags=template_content.tags,
784-
observation_scopes=template_content.observation_scopes,
831+
context=source.context,
832+
event_date=source.event_date,
833+
metadata=source.metadata,
834+
entities=source.entities,
835+
tags=source.tags,
836+
observation_scopes=source.observation_scopes,
785837
)
786838
extracted, processed, chunk_meta, usage = await _extract_and_embed(
787839
[content],

hindsight-api-slim/tests/test_hierarchical_config.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,9 +95,10 @@ async def test_hierarchical_fields_categorization():
9595
assert "reflect_source_facts_max_tokens" in configurable
9696
assert "llm_gemini_safety_settings" in configurable
9797
assert "mcp_enabled_tools" in configurable
98+
assert "retain_chunk_batch_size" in configurable
9899

99100
# Verify count is correct
100-
assert len(configurable) == 21
101+
assert len(configurable) == 22
101102

102103
# Verify credential fields (NEVER exposed)
103104
assert "llm_api_key" in credentials

0 commit comments

Comments (0)