
Commit eb753ac

Merge branch 'master' into fix-wb-ids-download

2 parents 9b36235 + 21999e9

19 files changed

Lines changed: 925 additions & 81 deletions


import-automation/workflow/ingestion-helper/README.md

Lines changed: 8 additions & 0 deletions
@@ -61,8 +61,16 @@ Updates the version of an import, records version history, and updates the status
 Initializes the Spanner database by creating all necessary tables and uploading proto descriptors.
 
 * This action requires no payload parameters. It automatically reads `schema.sql` and `storage.pb` from the container directory to provision the database schema and proto descriptors.
+* `enableEmbeddings` (Optional): Boolean to enable creation of embedding tables and models.
 * **Note on Protos**: The `storage.pb` file is generated during the Docker build process. The `Dockerfile` fetches `storage.proto` from the `datacommonsorg/import` GitHub repository and compiles it into `storage.pb`.
 
+#### `embedding_ingestion`
+Triggers the generation of embeddings for updated nodes in Spanner. It fetches nodes of specific types (e.g., `StatisticalVariable`, `Topic`) that have been updated, generates embeddings using a remote ML model in Spanner, and stores the results in the `NodeEmbeddings` table.
+
+* `enableEmbeddings` (Optional): Boolean to override the default setting for enabling embeddings. If the resolved value (the request value, or the service default when omitted) is false, the request is rejected and no embeddings are generated.
+* **Flags**:
+  - `--node_types`: A comma-separated list of node types to process (default: `StatisticalVariable,Topic`). This is a command-line flag for the service, not a request parameter.
+
 ## Local Development and Testing
 
 To run the helper service locally and test its functionality:
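As context for the **Note on Protos** above: the compile step amounts to producing a serialized `FileDescriptorSet`. A sketch of an equivalent compile step using `grpcio-tools` (an assumption for illustration; the actual `Dockerfile` may invoke a bare `protoc` binary instead, though the flags shown are standard protoc options):

```python
# Hypothetical equivalent of the Dockerfile's proto-compilation step,
# using grpcio-tools instead of a bare protoc binary.
from grpc_tools import protoc

# Compile storage.proto (fetched from datacommonsorg/import) into a
# serialized FileDescriptorSet that can be uploaded as storage.pb.
exit_code = protoc.main([
    "grpc_tools.protoc",
    "-I.",
    "--include_imports",
    "--descriptor_set_out=storage.pb",
    "storage.proto",
])
assert exit_code == 0, "protoc compilation failed"
```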
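For illustration, a request that exercises the new `embedding_ingestion` action might look like the following sketch, assuming the service is running locally via functions-framework on port 8080 (the URL is a placeholder; the `actionType` and `enableEmbeddings` fields follow the handler added in `main.py` below):

```python
import requests

# Placeholder endpoint; substitute the deployed Cloud Function / Cloud Run URL.
INGESTION_HELPER_URL = "http://localhost:8080"

resp = requests.post(
    INGESTION_HELPER_URL,
    json={
        "actionType": "embedding_ingestion",
        # Optional override; if the resolved value is false the service
        # rejects the request with HTTP 400.
        "enableEmbeddings": True,
    },
)
# On success the handler responds with e.g. "OK [Affected rows: 42]".
print(resp.status_code, resp.text)
```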
import-automation/workflow/ingestion-helper/embedding_utils.py

Lines changed: 169 additions & 0 deletions

@@ -0,0 +1,169 @@
# Copyright 2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Helper utilities for embedding workflows."""

import itertools
import logging
import time
from datetime import datetime
from google.cloud.spanner_v1.param_types import TIMESTAMP, STRING, Array, Struct, StructField

_BATCH_SIZE = 1000


def get_latest_lock_timestamp(database):
    """Gets the latest AcquiredTimestamp from the IngestionLock table.

    Args:
        database: google.cloud.spanner.Database object.

    Returns:
        The latest AcquiredTimestamp as a datetime object, or None if no entries exist.
    """
    time_lock_sql = "SELECT MAX(AcquiredTimestamp) FROM IngestionLock"
    try:
        with database.snapshot() as snapshot:
            results = snapshot.execute_sql(time_lock_sql)
            for row in results:
                return row[0]
    except Exception as e:
        logging.error(f"Error fetching latest lock timestamp: {e}")
        raise
    return None


def get_updated_nodes(database, timestamp, node_types):
    """Gets subject_ids and names from the Node table where update_timestamp > timestamp.

    Yields results to avoid loading everything into memory.

    Args:
        database: google.cloud.spanner.Database object.
        timestamp: datetime object to filter by.
        node_types: A list of strings representing the node types to filter by.

    Yields:
        Dictionaries containing subject_id, name, and types.
    """
    timestamp_condition = "update_timestamp > @timestamp" if timestamp else "TRUE"

    updated_node_sql = f"""
        SELECT subject_id, name, types FROM Node
        WHERE name IS NOT NULL
        AND {timestamp_condition}
        AND EXISTS (
            SELECT 1 FROM UNNEST(types) AS t WHERE t IN UNNEST(@node_types)
        )
    """

    params = {"node_types": node_types}
    param_types = {"node_types": Array(STRING)}

    if timestamp:
        logging.info(f"Filtering valid nodes updated after {timestamp}")
        params["timestamp"] = timestamp
        param_types["timestamp"] = TIMESTAMP
    else:
        logging.info("No timestamp provided, reading all valid nodes.")

    try:
        with database.snapshot() as snapshot:
            results = snapshot.execute_sql(updated_node_sql, params=params, param_types=param_types, timeout=300)
            fields = None
            for row in results:
                if fields is None:
                    fields = [field.name for field in results.fields]
                yield dict(zip(fields, row))
    except Exception as e:
        logging.error(f"Error fetching updated nodes: {e}")
        raise


def filter_and_convert_nodes(nodes_generator):
    """Filters out nodes without a name and converts dictionaries to tuples.

    Reads from a generator and yields results.

    Args:
        nodes_generator: A generator yielding dictionaries containing subject_id, name, and types.

    Yields:
        Tuples of (subject_id, embedding_content, types).
    """
    for node in nodes_generator:
        if node.get("name"):
            yield (node.get("subject_id"), node.get("name"), node.get("types"))


def generate_embeddings_partitioned(database, nodes_generator):
    """Generates embeddings in batches using standard transactions.

    Processes nodes in chunks of _BATCH_SIZE to avoid transaction size limits.
    Accepts a generator to avoid loading all nodes into memory.

    Args:
        database: google.cloud.spanner.Database object.
        nodes_generator: A generator yielding (subject_id, embedding_content, types) tuples.

    Returns:
        The number of affected rows.
    """
    total_rows_affected = 0

    logging.info(f"Generating embeddings in batches of {_BATCH_SIZE}.")

    embeddings_sql = """
        INSERT OR UPDATE INTO NodeEmbeddings (subject_id, embedding_content, embeddings, types)
        SELECT subject_id, content, embeddings.values, types
        FROM ML.PREDICT(
            MODEL NodeEmbeddingModel,
            (SELECT subject_id, embedding_content AS content, types, "RETRIEVAL_QUERY" AS task_type FROM UNNEST(@nodes))
        )
    """

    struct_type = Struct([
        StructField("subject_id", STRING),
        StructField("embedding_content", STRING),
        StructField("types", Array(STRING))
    ])

    def chunked(iterable, n):
        it = iter(iterable)
        while True:
            chunk = list(itertools.islice(it, n))
            if not chunk:
                break
            yield chunk

    for batch in chunked(nodes_generator, _BATCH_SIZE):
        params = {"nodes": batch}
        param_types = {"nodes": Array(struct_type)}

        def _execute_dml(transaction):
            return transaction.execute_update(embeddings_sql, params=params, param_types=param_types, timeout=300)

        try:
            row_count = database.run_in_transaction(_execute_dml)
            total_rows_affected += row_count
            logging.info(f"Processed batch of {len(batch)} nodes. Affected total {total_rows_affected} rows.")
            time.sleep(0.5)
        except Exception as e:
            logging.error(f"Error executing batch transaction: {e}")
            raise

    logging.info(f"Completed batch processing. Total affected rows: {total_rows_affected}")
    return total_rows_affected
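Taken together, these helpers form a small pipeline: read the last ingestion-lock timestamp, stream the nodes updated since then, drop unnamed nodes, and upsert embeddings batch by batch. A minimal driver sketch with placeholder instance and database IDs (this is the same sequence that `main.py` wires into the Cloud Function handler below):

```python
from google.cloud import spanner

from embedding_utils import (
    get_latest_lock_timestamp,
    get_updated_nodes,
    filter_and_convert_nodes,
    generate_embeddings_partitioned,
)

# Placeholder IDs for illustration; use the real ingestion instance/database.
client = spanner.Client()
database = client.instance("my-instance").database("my-database")

# Only re-embed nodes touched since the last ingestion lock was acquired.
since = get_latest_lock_timestamp(database)
nodes = get_updated_nodes(database, since, ["StatisticalVariable", "Topic"])
rows = generate_embeddings_partitioned(database, filter_and_convert_nodes(nodes))
print(f"Upserted embeddings for {rows} rows")
```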
import-automation/workflow/ingestion-helper/embedding_utils_test.py

Lines changed: 166 additions & 0 deletions

@@ -0,0 +1,166 @@
# Copyright 2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest
from unittest.mock import MagicMock, patch
from datetime import datetime

from embedding_utils import (
    get_latest_lock_timestamp,
    get_updated_nodes,
    filter_and_convert_nodes,
    generate_embeddings_partitioned
)


class MockField:

    def __init__(self, name):
        self.name = name


class MockResults:
    """Mimics a Spanner result set: iterable rows plus a .fields attribute."""

    def __init__(self, rows, field_names):
        self.rows = rows
        self.fields = [MockField(name) for name in field_names]

    def __iter__(self):
        return iter(self.rows)


class TestEmbeddingUtils(unittest.TestCase):

    def test_get_latest_lock_timestamp(self):
        mock_database = MagicMock()
        mock_snapshot = MagicMock()
        mock_database.snapshot.return_value.__enter__.return_value = mock_snapshot
        expected_timestamp = datetime(2026, 4, 20, 12, 0, 0)
        mock_snapshot.execute_sql.return_value = [(expected_timestamp,)]

        timestamp = get_latest_lock_timestamp(mock_database)
        self.assertEqual(timestamp, expected_timestamp)

    def test_get_updated_nodes(self):
        mock_database = MagicMock()
        mock_snapshot = MagicMock()
        mock_database.snapshot.return_value.__enter__.return_value = mock_snapshot

        mock_snapshot.execute_sql.return_value = MockResults(
            rows=[("dc/1", "Node 1", ["Topic"])],
            field_names=["subject_id", "name", "types"]
        )

        nodes = list(get_updated_nodes(mock_database, None, ["Topic"]))

        # Verify Spanner call
        mock_snapshot.execute_sql.assert_called_once()
        args, kwargs = mock_snapshot.execute_sql.call_args
        query = args[0]
        self.assertIn("SELECT subject_id, name, types FROM Node", query)
        self.assertIn("TRUE", query)
        self.assertEqual(kwargs["params"], {"node_types": ["Topic"]})

        self.assertEqual(len(nodes), 1)
        self.assertEqual(nodes[0]["subject_id"], "dc/1")
        self.assertEqual(nodes[0]["name"], "Node 1")
        self.assertEqual(nodes[0]["types"], ["Topic"])

    def test_get_updated_nodes_with_timestamp(self):
        mock_database = MagicMock()
        mock_snapshot = MagicMock()
        mock_database.snapshot.return_value.__enter__.return_value = mock_snapshot

        mock_snapshot.execute_sql.return_value = MockResults(
            rows=[("dc/2", "Node 2", ["Topic"])],
            field_names=["subject_id", "name", "types"]
        )

        test_timestamp = datetime(2026, 4, 25, 0, 0, 0)
        nodes = list(get_updated_nodes(mock_database, test_timestamp, ["Topic"]))

        # Verify Spanner call
        mock_snapshot.execute_sql.assert_called_once()
        args, kwargs = mock_snapshot.execute_sql.call_args
        query = args[0]
        self.assertIn("SELECT subject_id, name, types FROM Node", query)
        self.assertIn("update_timestamp > @timestamp", query)
        self.assertEqual(kwargs["params"], {"node_types": ["Topic"], "timestamp": test_timestamp})

        self.assertEqual(len(nodes), 1)
        self.assertEqual(nodes[0]["subject_id"], "dc/2")

    def test_filter_and_convert_nodes(self):
        nodes = [
            {"subject_id": "dc/1", "name": "Node 1", "types": ["Topic"]},
            {"subject_id": "dc/2", "name": None, "types": ["StatisticalVariable"]},
            {"subject_id": "dc/3", "name": "Node 3", "types": ["Topic", "StatisticalVariable"]},
            {"subject_id": "dc/4", "name": "", "types": ["StatisticalVariable"]}
        ]

        converted = list(filter_and_convert_nodes(nodes))
        self.assertEqual(len(converted), 2)
        self.assertEqual(converted[0], ("dc/1", "Node 1", ["Topic"]))
        self.assertEqual(converted[1], ("dc/3", "Node 3", ["Topic", "StatisticalVariable"]))

    @patch('embedding_utils._BATCH_SIZE', 2)
    def test_generate_embeddings_partitioned(self):
        mock_database = MagicMock()

        nodes = [
            ("dc/1", "Node 1", ["Topic"]),
            ("dc/2", "Node 2", ["Topic"]),
            ("dc/3", "Node 3", ["Topic"]),
            ("dc/4", "Node 4", ["Topic"]),
            ("dc/5", "Node 5", ["Topic"]),
            ("dc/6", "Node 6", ["Topic"]),
            ("dc/7", "Node 7", ["Topic"]),
            ("dc/8", "Node 8", ["Topic"])
        ]

        transactions = []

        def side_effect(func):
            mock_transaction = MagicMock()
            mock_transaction.execute_update.return_value = 2
            transactions.append(mock_transaction)
            return func(mock_transaction)

        mock_database.run_in_transaction.side_effect = side_effect

        affected_rows = generate_embeddings_partitioned(mock_database, nodes)
        self.assertEqual(affected_rows, 8)
        self.assertEqual(mock_database.run_in_transaction.call_count, 4)

        # Verify execute_update calls
        self.assertEqual(len(transactions), 4)
        for i, tx in enumerate(transactions):
            tx.execute_update.assert_called_once()
            args, kwargs = tx.execute_update.call_args
            self.assertIn("INSERT OR UPDATE INTO NodeEmbeddings", args[0])

            # Verify batch content
            batch = kwargs["params"]["nodes"]
            self.assertEqual(len(batch), 2)
            self.assertEqual(batch[0][0], f"dc/{i*2 + 1}")
            self.assertEqual(batch[1][0], f"dc/{i*2 + 2}")


if __name__ == '__main__':
    unittest.main()

import-automation/workflow/ingestion-helper/main.py

Lines changed: 23 additions & 0 deletions
@@ -1,6 +1,7 @@
 import functions_framework
 from spanner_client import SpannerClient
 from storage_client import StorageClient
+from embedding_utils import get_latest_lock_timestamp, get_updated_nodes, filter_and_convert_nodes, generate_embeddings_partitioned
 import logging
 import os
 from absl import flags
@@ -33,6 +34,9 @@
     'enable_embeddings',
     os.environ.get('ENABLE_EMBEDDINGS', 'false').lower() == 'true',
     'Enable embeddings')
+flags.DEFINE_list(
+    'node_types', ['StatisticalVariable', 'Topic'],
+    'Node types to generate embeddings for')
 
 if not FLAGS.is_parsed():
     FLAGS(['ingestion_helper'])
@@ -214,5 +218,24 @@ def ingestion_helper(request):
                                              FLAGS.enable_embeddings)
         spanner.initialize_database(enable_embeddings=enable_embeddings)
         return ('OK', 200)
+    elif actionType == 'embedding_ingestion':
+        logging.info("Action: embedding_ingestion")
+        enable_embeddings = request_json.get('enableEmbeddings',
+                                             FLAGS.enable_embeddings)
+        if not enable_embeddings:
+            logging.info("Embeddings not enabled, skipping.")
+            return ('Invalid request on embedding ingestion.', 400)
+
+        node_types = FLAGS.node_types
+        try:
+            logging.info(f"Job started. Fetching all nodes for types: {node_types}")
+            timestamp = get_latest_lock_timestamp(spanner.database)
+            nodes = get_updated_nodes(spanner.database, timestamp, node_types)
+            converted_nodes = filter_and_convert_nodes(nodes)
+            affected_rows = generate_embeddings_partitioned(spanner.database, converted_nodes)
+            return (f"OK [Affected rows: {affected_rows}]", 200)
+        except Exception as e:
+            logging.error(f"Embedding ingestion failed: {e}")
+            return (f"Error: {e}", 500)
     else:
         return (f'Unknown actionType: {actionType}', 400)
