From 43629f073ce8f1147feaa19ae5c067e218cfe880 Mon Sep 17 00:00:00 2001 From: Nick Blumberg Date: Wed, 18 Mar 2026 13:49:02 -0700 Subject: [PATCH 1/8] Create an endpoint for facet enrichment with metadata, and update the various aspects of the application that use it to pass up the entities. --- server/routes/shared_api/metadata.py | 341 ++++++++++++++---- static/js/tools/download/page.tsx | 8 +- static/js/tools/map/compute/facets.ts | 4 +- static/js/tools/scatter/chart_loader.tsx | 7 +- .../tools/scatter/compute/facet_metadata.ts | 20 +- .../js/tools/shared/facet_choice_fetcher.ts | 21 +- .../tools/shared/metadata/metadata_fetcher.ts | 100 ++--- static/js/tools/timeline/chart.tsx | 7 +- 8 files changed, 327 insertions(+), 181 deletions(-) diff --git a/server/routes/shared_api/metadata.py b/server/routes/shared_api/metadata.py index 71444a62db..402640a614 100644 --- a/server/routes/shared_api/metadata.py +++ b/server/routes/shared_api/metadata.py @@ -15,6 +15,7 @@ import asyncio import collections import logging +import re from typing import Any from flask import Blueprint @@ -22,6 +23,7 @@ from flask import request from flask import Response +from server.lib import fetch from server.services import datacommons as dc bp = Blueprint("metadata", __name__, url_prefix='/api/metadata') @@ -34,9 +36,30 @@ # should be hidden, because it is flawed or not meaningful. MEASUREMENT_METHODS_SUPPRESSION_PROVENANCES: set[str] = {"WikipediaStatsData"} +# TODO (nick-nlb): merge the below constants with series.py. + +# Maximum number of concurrent series the server will fetch in a single chunk +_MAX_BATCH_SIZE = 2000 + +# Maps enclosed place type -> places with too many of the enclosed type. +# Determines when to pre-resolve expressions into entities for batching. +_BATCHED_CALL_PLACES = { + "CensusTract": [ + "geoId/06", # California + "geoId/12", # Florida + "geoId/36", # New York (State) + "geoId/48", # Texas + ], + "City": ["country/USA"], + "County": ["country/USA"] +} + def title_case(string: str) -> str: - return " ".join([word.capitalize() for word in string.split("_")]) + """Replaces underscores with spaces, capitalizes the first letter, and preserves acronyms.""" + return " ".join([ + word[0].upper() + word[1:] if word else "" for word in string.split("_") + ]) def _get_arc_nodes(data_dict: dict[str, Any], node_id: str, @@ -74,6 +97,101 @@ def _extract_active_facets( return list(set(active_facets)) +async def _fetch_node_data(dcids: set[str], prop: str) -> dict[str, Any]: + """Helper to fetch node data only if the list of DCIDs is not empty.""" + if not dcids: + return {} + return await asyncio.to_thread(dc.v2node, list(dcids), prop) + + +def _extract_facet_date_ranges( + obs_resp: dict, stat_vars: list[str]) -> dict[str, dict[str, str]]: + """Extracts min/max dates for facets.""" + facet_date_ranges = collections.defaultdict(dict) + by_variable = obs_resp.get('byVariable', {}) + + for sv in stat_vars: + by_entity = by_variable.get(sv, {}).get('byEntity', {}) + for ent_data in by_entity.values(): + # Aggregate Date Ranges + for f in ent_data.get('orderedFacets', []): + fid = f.get('facetId') + if not fid: + continue + + earliest, latest = f.get('earliestDate'), f.get('latestDate') + + if earliest and (not facet_date_ranges[fid].get('earliestDate') or + earliest < facet_date_ranges[fid]['earliestDate']): + facet_date_ranges[fid]['earliestDate'] = earliest + + if latest and (not facet_date_ranges[fid].get('latestDate') or + latest > facet_date_ranges[fid]['latestDate']): + facet_date_ranges[fid]['latestDate'] = latest + + return facet_date_ranges + + +async def _fetch_secondary_metadata( + provenance_endpoints: set[str], measurement_methods: set[str], + units: set[str]) -> tuple[dict, dict, dict, dict]: + """Shared helper to resolve human-readable strings from the Knowledge Graph.""" + # Look up names and descriptions of provenances, measurement methods and units + try: + prov_res, mm_res, unit_res = await asyncio.gather( + _fetch_node_data(provenance_endpoints, '->*'), + _fetch_node_data(measurement_methods, '->description'), + _fetch_node_data(units, '->name')) + except Exception: + logging.exception("Failed to fetch secondary metadata from DC") + prov_res, mm_res, unit_res = {}, {}, {} + + # Process secondary lookups + prov_map: dict[str, dict[str, Any]] = {} + linked_prov_dcids: set[str] = set() + + if 'data' in prov_res: + for dcid in prov_res['data']: + prov_map[dcid] = { + 'source': _get_arc_nodes(prov_res, dcid, 'source'), + 'isPartOf': _get_arc_nodes(prov_res, dcid, 'isPartOf'), + 'name': _get_arc_nodes(prov_res, dcid, 'name'), + 'url': _get_arc_nodes(prov_res, dcid, 'url'), + 'licenseType': _get_arc_nodes(prov_res, dcid, 'licenseType'), + } + # Collect DCIDs of linked entities for human-readable resolution + for n in prov_map[dcid]['source'] + prov_map[dcid]['isPartOf'] + prov_map[ + dcid]['licenseType']: + if 'dcid' in n: + linked_prov_dcids.add(n['dcid']) + + linked_names_map: dict[str, str] = {} + if linked_prov_dcids: + try: + linked_names_resp = await asyncio.to_thread(dc.v2node, + list(linked_prov_dcids), + '->name') + for n_dcid in linked_prov_dcids: + n_arcs = _get_arc_nodes(linked_names_resp, n_dcid, 'name') + if n_arcs: + linked_names_map[n_dcid] = n_arcs[0].get('value') + except Exception: + logging.exception("Failed to resolve linked provenance names") + + mm_map: dict[str, str] = { + mm: _get_arc_nodes(mm_res, mm, 'description')[0].get('value') + for mm in measurement_methods + if _get_arc_nodes(mm_res, mm, 'description') + } + unit_map: dict[str, str] = { + u: _get_arc_nodes(unit_res, u, 'name')[0].get('value') + for u in units + if _get_arc_nodes(unit_res, u, 'name') + } + + return prov_map, linked_names_map, mm_map, unit_map + + def _traverse_to_top_category(node: str, parent_map: dict[str, list[str]], visited: set[str], top_nodes: set[str], original_sv: str) -> None: @@ -276,13 +394,9 @@ def _build_metadata_payload( return metadata_map -async def _fetch_node_data(dcids: set[str], prop: str) -> dict[str, Any]: - """Helper to fetch node data only if the list of DCIDs is not empty.""" - if not dcids: - return {} - return await asyncio.to_thread(dc.v2node, list(dcids), prop) - - +# ============================================================================== +# Metadata Endpoint +# ============================================================================== @bp.route('', methods=['POST']) async def get_metadata() -> tuple[Response, int] | Response: # Input Validation @@ -342,7 +456,6 @@ async def get_metadata() -> tuple[Response, int] | Response: facets = obs_resp.get('facets', {}) - facet_date_ranges: dict[str, dict[str, str]] = collections.defaultdict(dict) provenance_endpoints: set[str] = set() measurement_methods: set[str] = set() units: set[str] = set() @@ -358,78 +471,158 @@ async def get_metadata() -> tuple[Response, int] | Response: if finfo.get('importName'): provenance_endpoints.add(f"dc/base/{finfo['importName']}") - # Aggregate Date Ranges - by_entity = obs_resp.get('byVariable', {}).get(sv, {}).get('byEntity', {}) - for ent_data in by_entity.values(): - for f in ent_data.get('orderedFacets', []): - if f.get('facetId') != fid: - continue + facet_date_ranges = _extract_facet_date_ranges(obs_resp, stat_vars) - earliest, latest = f.get('earliestDate'), f.get('latestDate') - if earliest and (not facet_date_ranges[fid].get('earliestDate') or - earliest < facet_date_ranges[fid]['earliestDate']): - facet_date_ranges[fid]['earliestDate'] = earliest - if latest and (not facet_date_ranges[fid].get('latestDate') or - latest > facet_date_ranges[fid]['latestDate']): - facet_date_ranges[fid]['latestDate'] = latest + prov_map, linked_names_map, mm_map, unit_map = await _fetch_secondary_metadata( + provenance_endpoints, measurement_methods, units) - # Look up names and descriptions of provenances, measurement methods and units - try: - prov_res, mm_res, unit_res = await asyncio.gather( - _fetch_node_data(provenance_endpoints, '->*'), - _fetch_node_data(measurement_methods, '->description'), - _fetch_node_data(units, '->name')) - except Exception: - logging.exception("Failed to fetch secondary metadata from DC") - return jsonify({'error': 'Failed to resolve secondary node data'}), 502 + # Assemble and return the final response + metadata_map = _build_metadata_payload(stat_vars, stat_var_names, + category_map, sv_active_facets, facets, + facet_date_ranges, prov_map, + linked_names_map, mm_map, unit_map) - # Process secondary lookups - prov_map: dict[str, dict[str, Any]] = {} - linked_prov_dcids: set[str] = set() + return jsonify({'metadata': metadata_map, 'statVarList': stat_var_list}) - if 'data' in prov_res: - for dcid in prov_res['data']: - prov_map[dcid] = { - 'source': _get_arc_nodes(prov_res, dcid, 'source'), - 'isPartOf': _get_arc_nodes(prov_res, dcid, 'isPartOf'), - 'name': _get_arc_nodes(prov_res, dcid, 'name'), - 'url': _get_arc_nodes(prov_res, dcid, 'url'), - 'licenseType': _get_arc_nodes(prov_res, dcid, 'licenseType'), - } - # Collect DCIDs of linked entities for human-readable resolution - for n in prov_map[dcid]['source'] + prov_map[dcid]['isPartOf'] + prov_map[ - dcid]['licenseType']: - if 'dcid' in n: - linked_prov_dcids.add(n['dcid']) - linked_names_map: dict[str, str] = {} - if linked_prov_dcids: +# ============================================================================== +# Facet Metadata Enrichment +# ============================================================================== +@bp.route('/facets', methods=['POST']) +async def enrich_facets() -> tuple[Response, int] | Response: + """Endpoint to enrich a dictionary of facets with dates and metadata.""" + req_data = request.get_json(silent=True) + if not req_data: + return jsonify({'error': 'Must provide a valid JSON body'}), 400 + + facets = req_data.get('facets', {}) + stat_vars = req_data.get('statVars', []) + entities = req_data.get('entities', []) + entity_expression = req_data.get('entityExpression', '') + + if not facets: + return jsonify({}) + + obs_resp = {'byVariable': {}} + if stat_vars and (entities or entity_expression): + + # 1. Resolve large expressions explicitly so we can safely chunk entities + if entity_expression: + match = re.search(r'(.*)<-containedInPlace\+\{typeOf:(.*)\}', + entity_expression) + if match: + parent_entity = match.group(1) + child_type = match.group(2) + if parent_entity in _BATCHED_CALL_PLACES.get(child_type, []): + try: + logging.info( + f"Resolving massive entity expression for batching: {entity_expression}" + ) + child_places_resp = await asyncio.to_thread(fetch.descendent_places, + [parent_entity], + child_type) + entities = child_places_resp.get(parent_entity, []) + entity_expression = "" + except Exception: + logging.exception( + "Failed to resolve descendent places for batching") + + # 2. Establish base query kwargs + base_kwargs = {'select': ['entity', 'variable', 'facet']} + + all_fids = [] + for sv_facets in facets.values(): + all_fids.extend(list(sv_facets.keys())) + all_fids = list(set(all_fids)) + if all_fids: + base_kwargs['filter'] = {'facetIds': all_fids} + + tasks = [] + + # 3. Create observation tasks based on whether we are batching + if entity_expression: + # At this stage, we know this is an entity expression not found in the batch constants + kwargs = base_kwargs.copy() + kwargs['entity'] = {'expression': entity_expression} + kwargs['variable'] = {'dcids': stat_vars} + tasks.append(asyncio.to_thread(dc.v2observation, **kwargs)) + + elif entities: + # We have explicit entities + # Chunk them so len(entities) * len(stat_vars) <= _MAX_BATCH_SIZE + ent_batch_size = max(1, _MAX_BATCH_SIZE // max(1, len(stat_vars))) + for i in range(0, len(entities), ent_batch_size): + kwargs = base_kwargs.copy() + kwargs['entity'] = {'dcids': entities[i:i + ent_batch_size]} + kwargs['variable'] = {'dcids': stat_vars} + tasks.append(asyncio.to_thread(dc.v2observation, **kwargs)) + + # 4. Run tasks and then merge try: - linked_names_resp = await asyncio.to_thread(dc.v2node, - list(linked_prov_dcids), - '->name') - for n_dcid in linked_prov_dcids: - n_arcs = _get_arc_nodes(linked_names_resp, n_dcid, 'name') - if n_arcs: - linked_names_map[n_dcid] = n_arcs[0].get('value') + if tasks: + chunk_results = await asyncio.gather(*tasks) + + for res in chunk_results: + if not res or 'byVariable' not in res: + continue + for sv, sv_data in res.get('byVariable', {}).items(): + if sv not in obs_resp['byVariable']: + obs_resp['byVariable'][sv] = {'byEntity': {}} + for ent, ent_data in sv_data.get('byEntity', {}).items(): + if ent not in obs_resp['byVariable'][sv]['byEntity']: + obs_resp['byVariable'][sv]['byEntity'][ent] = { + 'orderedFacets': [] + } + obs_resp['byVariable'][sv]['byEntity'][ent][ + 'orderedFacets'].extend(ent_data.get('orderedFacets', [])) except Exception: - logging.exception("Failed to resolve linked provenance names") + logging.exception( + "v2observation failed in enrich_facets. Date ranges may be missing.") + obs_resp = {} - mm_map: dict[str, str] = { - mm: _get_arc_nodes(mm_res, mm, 'description')[0].get('value') - for mm in measurement_methods - if _get_arc_nodes(mm_res, mm, 'description') - } - unit_map: dict[str, str] = { - u: _get_arc_nodes(unit_res, u, 'name')[0].get('value') - for u in units - if _get_arc_nodes(unit_res, u, 'name') - } + facet_date_ranges = _extract_facet_date_ranges(obs_resp, stat_vars) - # Assemble and return the final response - metadata_map = _build_metadata_payload(stat_vars, stat_var_names, - category_map, sv_active_facets, facets, - facet_date_ranges, prov_map, - linked_names_map, mm_map, unit_map) + provenance_endpoints = set() + measurement_methods = set() + units = set() - return jsonify({'metadata': metadata_map, 'statVarList': stat_var_list}) + for sv, sv_facets in facets.items(): + for fid, finfo in sv_facets.items(): + if finfo.get('importName'): + provenance_endpoints.add(f"dc/base/{finfo['importName']}") + if finfo.get('measurementMethod'): + measurement_methods.add(finfo['measurementMethod']) + if finfo.get('unit'): + units.add(finfo['unit']) + + prov_map, linked_names_map, mm_map, unit_map = await _fetch_secondary_metadata( + provenance_endpoints, measurement_methods, units) + + for sv, sv_facets in facets.items(): + for fid, finfo in sv_facets.items(): + dr = facet_date_ranges.get(fid, {}) + if dr.get('earliestDate'): + finfo['dateRangeStart'] = dr.get('earliestDate') + if dr.get('latestDate'): + finfo['dateRangeEnd'] = dr.get('latestDate') + + import_name = finfo.get('importName') + if import_name: + prov_id = f"dc/base/{import_name}" + pdata = prov_map.get(prov_id) + if pdata: + finfo['sourceName'] = _get_node_name(pdata.get('source', []), + linked_names_map) + finfo['provenanceName'] = _get_node_name(pdata.get('isPartOf', []), linked_names_map) or \ + _get_node_name(pdata.get('name', []), linked_names_map) or import_name + + mm = finfo.get('measurementMethod') + if mm and finfo.get( + 'provenanceName') not in MEASUREMENT_METHODS_SUPPRESSION_PROVENANCES: + finfo['measurementMethodDescription'] = mm_map.get(mm) or title_case(mm) + + unit = finfo.get('unit') + if unit: + finfo['unitDisplayName'] = unit_map.get(unit) or unit.replace('_', ' ') + + return jsonify(facets) diff --git a/static/js/tools/download/page.tsx b/static/js/tools/download/page.tsx index c215b22e64..483769dd4c 100644 --- a/static/js/tools/download/page.tsx +++ b/static/js/tools/download/page.tsx @@ -186,10 +186,10 @@ export function Page(props: PagePropType): ReactElement { } } - const enrichedFacets = await fetchFacetsWithMetadata( - baseFacets, - dataCommonsClient - ); + const entityExpression = `${selectedOptions.selectedPlace.dcid}<-containedInPlace+{typeOf:${selectedOptions.enclosedPlaceType}}`; + const enrichedFacets = await fetchFacetsWithMetadata(baseFacets, { + entityExpression, + }); const sourceSelectorFacetList = []; for (const sv in enrichedFacets) { diff --git a/static/js/tools/map/compute/facets.ts b/static/js/tools/map/compute/facets.ts index 4835958219..ea94d16ce3 100644 --- a/static/js/tools/map/compute/facets.ts +++ b/static/js/tools/map/compute/facets.ts @@ -72,7 +72,9 @@ export function useComputeFacetList(chartStore: ChartStore): { } } - fetchFacetsWithMetadata(baseFacets, dataCommonsClient) + fetchFacetsWithMetadata(baseFacets, { + entities: data ? Object.keys(data) : [], + }) .then((enrichedMap) => { if (!enrichedMap[svDcid]) { setFacetList([]); diff --git a/static/js/tools/scatter/chart_loader.tsx b/static/js/tools/scatter/chart_loader.tsx index 5fd38fc692..29d081b461 100644 --- a/static/js/tools/scatter/chart_loader.tsx +++ b/static/js/tools/scatter/chart_loader.tsx @@ -99,8 +99,13 @@ export function ChartLoader(): ReactElement { const cache = useCache(); const chartData = useChartData(cache); + const entityExpression = + place.value.enclosingPlace.dcid && place.value.enclosedPlaceType + ? `${place.value.enclosingPlace.dcid}<-containedInPlace+{typeOf:${place.value.enclosedPlaceType}}` + : undefined; + const { facetSelectorMetadata, facetListLoading, facetListError } = - useFacetMetadata(cache?.baseFacets || null); + useFacetMetadata(cache?.baseFacets || null, { entityExpression }); const containerRef = useRef(null); const embedModalElement = useRef(null); diff --git a/static/js/tools/scatter/compute/facet_metadata.ts b/static/js/tools/scatter/compute/facet_metadata.ts index 2d4ac6a56f..60be3afa4c 100644 --- a/static/js/tools/scatter/compute/facet_metadata.ts +++ b/static/js/tools/scatter/compute/facet_metadata.ts @@ -21,8 +21,6 @@ import _ from "lodash"; import { useEffect, useState } from "react"; -import { WEBSITE_SURFACE } from "../../../shared/constants"; -import { getDataCommonsClient } from "../../../utils/data_commons_client"; import { FacetResponse } from "../../../utils/data_fetch_utils"; import { fetchFacetsWithMetadata } from "../../shared/metadata/metadata_fetcher"; @@ -44,7 +42,8 @@ type FacetMetadataReturn = { * error state. */ export function useFacetMetadata( - baseFacets: FacetResponse | null + baseFacets: FacetResponse | null, + entityContext: { entities?: string[]; entityExpression?: string } = {} ): FacetMetadataReturn { const [facetMetadata, setFacetMetadata] = useState({ facetSelectorMetadata: {}, @@ -52,6 +51,10 @@ export function useFacetMetadata( facetListError: false, }); + // Extract the primitive values for safe dependency tracking + const { entities, entityExpression } = entityContext; + const entitiesString = entities?.join(","); + useEffect(() => { if (_.isEmpty(baseFacets)) return; @@ -64,10 +67,11 @@ export function useFacetMetadata( })); try { - const resp = await fetchFacetsWithMetadata( - baseFacets, - getDataCommonsClient(null, WEBSITE_SURFACE) - ); + // Pass the extracted values back into the fetcher + const resp = await fetchFacetsWithMetadata(baseFacets, { + entities: entitiesString ? entitiesString.split(",") : undefined, + entityExpression, + }); if (cancelled) return; @@ -92,7 +96,7 @@ export function useFacetMetadata( return () => { cancelled = true; }; - }, [baseFacets]); + }, [baseFacets, entitiesString, entityExpression]); return facetMetadata; } diff --git a/static/js/tools/shared/facet_choice_fetcher.ts b/static/js/tools/shared/facet_choice_fetcher.ts index 3090a79113..cdfc6147e0 100644 --- a/static/js/tools/shared/facet_choice_fetcher.ts +++ b/static/js/tools/shared/facet_choice_fetcher.ts @@ -20,7 +20,6 @@ import { WEBSITE_SURFACE } from "../../shared/constants"; import { FacetSelectorFacetInfo } from "../../shared/facet_selector/facet_selector"; -import { getDataCommonsClient } from "../../utils/data_commons_client"; import { getFacets, getFacetsWithin } from "../../utils/data_fetch_utils"; import { fetchFacetsWithMetadata } from "./metadata/metadata_fetcher"; @@ -41,17 +40,15 @@ export async function fetchFacetChoices( placeDcids: string[], statVars: { dcid: string; name?: string }[] ): Promise { - const dataCommonsClient = getDataCommonsClient(null, WEBSITE_SURFACE); const baseFacets = await getFacets( "", placeDcids, statVars.map((sv) => sv.dcid), WEBSITE_SURFACE ); - const enrichedFacets = await fetchFacetsWithMetadata( - baseFacets, - dataCommonsClient - ); + const enrichedFacets = await fetchFacetsWithMetadata(baseFacets, { + entities: placeDcids, + }); return statVars.map((sv) => ({ dcid: sv.dcid, name: sv.name || sv.dcid, @@ -73,7 +70,6 @@ export async function fetchFacetChoicesWithin( enclosedPlaceType: string, statVars: { dcid: string; name?: string; date?: string }[] ): Promise { - const dataCommonsClient = getDataCommonsClient(null, WEBSITE_SURFACE); const facetPromises = statVars.map((sv) => getFacetsWithin( "", @@ -85,10 +81,13 @@ export async function fetchFacetChoicesWithin( ) ); const baseFacets = Object.assign({}, ...(await Promise.all(facetPromises))); - const enrichedFacets = await fetchFacetsWithMetadata( - baseFacets, - dataCommonsClient - ); + + // Construct the expression to allow accurate date resolution on the backend + const entityExpression = `${parentPlace}<-containedInPlace+{typeOf:${enclosedPlaceType}}`; + + const enrichedFacets = await fetchFacetsWithMetadata(baseFacets, { + entityExpression, + }); return statVars.map((sv) => ({ dcid: sv.dcid, name: sv.name || sv.dcid, diff --git a/static/js/tools/shared/metadata/metadata_fetcher.ts b/static/js/tools/shared/metadata/metadata_fetcher.ts index e250a05fc0..83d3850910 100644 --- a/static/js/tools/shared/metadata/metadata_fetcher.ts +++ b/static/js/tools/shared/metadata/metadata_fetcher.ts @@ -290,94 +290,38 @@ async function fetchNodeProperty( * Fetches and enriches facet metadata with human-readable and supplementary * information like source names, date ranges, and descriptions. * @param facets The basic facet response from an API call like getFacets. - * @param dataCommonsClient Client for Data Commons API calls. + * @param entityContext The specific entities or expression used to determine dates. * @param apiRoot Optional API root URL for requests. * @returns The facets with enriched StatMetadata. */ export async function fetchFacetsWithMetadata( facets: FacetResponse, - dataCommonsClient: DataCommonsClient, + entityContext: { entities?: string[]; entityExpression?: string }, apiRoot = "" ): Promise { const statVars = Object.keys(facets); - if (!statVars.length) return {}; - - const measurementMethods = new Set(); - const units = new Set(); - const statVarSet = new Set(); - for (const sv in facets) { - statVarSet.add(sv); - for (const facetId in facets[sv]) { - const facet = facets[sv][facetId]; - if (facet.measurementMethod) { - measurementMethods.add(facet.measurementMethod); - } - if (facet.unit) { - units.add(facet.unit); - } - } - } - - const [provenanceMap, variableData, measurementMethodMap, unitMap] = - await Promise.all([ - fetchProvenanceInfo(statVars, facets, undefined, apiRoot), - fetchStatVarProvenanceSummaries(statVars, apiRoot), - fetchNodeProperty(measurementMethods, "description", dataCommonsClient), - fetchNodeProperty(units, "name", dataCommonsClient), - ]); - - const enrichedFacets: FacetResponse = {}; - for (const statVarId of statVars) { - enrichedFacets[statVarId] = {}; - for (const facetId in facets[statVarId]) { - const facetInfo = facets[statVarId][facetId]; - const newFacetInfo = { ...facetInfo }; - - if (facetInfo.importName) { - const provenanceId = `dc/base/${facetInfo.importName}`; - const provenanceData = provenanceMap[provenanceId]; - if (provenanceData) { - newFacetInfo.sourceName = provenanceData.source?.[0]?.name; - newFacetInfo.provenanceName = - provenanceData.isPartOf?.[0]?.name || - provenanceData.name?.[0]?.value || - facetInfo.importName; - } - const seriesList = - variableData[statVarId]?.provenanceSummary?.[provenanceId] - ?.seriesSummary; - const matchedSeries = Array.isArray(seriesList) - ? matchSeriesByFacet(seriesList, facetInfo) - : undefined; - if ( - matchedSeries && - !DATE_RANGE_SUPPRESSION_PROVENANCES.includes( - newFacetInfo.provenanceName - ) - ) { - newFacetInfo.dateRangeStart = matchedSeries.earliestDate; - newFacetInfo.dateRangeEnd = matchedSeries.latestDate; - } - } - - if ( - facetInfo.measurementMethod && - !MEASUREMENT_METHODS_SUPPRESSION_PROVENANCES.includes( - newFacetInfo.provenanceName - ) - ) { - newFacetInfo.measurementMethodDescription = - measurementMethodMap[facetInfo.measurementMethod] || - startCase(facetInfo.measurementMethod.replace(/_/g, " ")); - } - if (facetInfo.unit) { - newFacetInfo.unitDisplayName = - unitMap[facetInfo.unit] || facetInfo.unit.replace(/_/g, " "); - } - enrichedFacets[statVarId][facetId] = newFacetInfo; + if (!statVars.length) return facets; + + try { + const response = await fetch(`${apiRoot}/api/metadata/facets`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + facets, + statVars, + entities: entityContext.entities, + entityExpression: entityContext.entityExpression, + }), + }); + if (!response.ok) { + console.error("Failed to enrich facets via API"); + return facets; } + return await response.json(); + } catch (e) { + console.error("Error enriching facets:", e); + return facets; } - return enrichedFacets; } /** diff --git a/static/js/tools/timeline/chart.tsx b/static/js/tools/timeline/chart.tsx index 2a0e7c091a..e9411bd007 100644 --- a/static/js/tools/timeline/chart.tsx +++ b/static/js/tools/timeline/chart.tsx @@ -466,10 +466,9 @@ class Chart extends Component { metadataMap: Record> ): Promise { try { - const enriched = await fetchFacetsWithMetadata( - metadataMap, - this.dataCommonsClient - ); + const enriched = await fetchFacetsWithMetadata(metadataMap, { + entities: Object.keys(this.props.placeNameMap), + }); const facetList = this.getFacetList(statVars, enriched); this.setState({ facetList, facetListLoading: false }); } catch { From 16a795269ab2104eff5eea99cd433d771d3376d1 Mon Sep 17 00:00:00 2001 From: Nick Blumberg Date: Wed, 18 Mar 2026 14:05:39 -0700 Subject: [PATCH 2/8] Send the enclosed place type and parent place explicitly into the facet endpoint rather than constructing the string in the frontend. --- server/routes/shared_api/metadata.py | 39 +++++++++---------- static/js/tools/download/page.tsx | 4 +- static/js/tools/scatter/chart_loader.tsx | 11 ++---- .../tools/scatter/compute/facet_metadata.ts | 13 +++++-- .../js/tools/shared/facet_choice_fetcher.ts | 6 +-- .../tools/shared/metadata/metadata_fetcher.ts | 9 ++++- 6 files changed, 42 insertions(+), 40 deletions(-) diff --git a/server/routes/shared_api/metadata.py b/server/routes/shared_api/metadata.py index 402640a614..c4b1a7c2af 100644 --- a/server/routes/shared_api/metadata.py +++ b/server/routes/shared_api/metadata.py @@ -498,34 +498,31 @@ async def enrich_facets() -> tuple[Response, int] | Response: facets = req_data.get('facets', {}) stat_vars = req_data.get('statVars', []) entities = req_data.get('entities', []) - entity_expression = req_data.get('entityExpression', '') + parent_place = req_data.get('parentPlace', '') + enclosed_place_type = req_data.get('enclosedPlaceType', '') if not facets: return jsonify({}) obs_resp = {'byVariable': {}} - if stat_vars and (entities or entity_expression): + entity_expression = "" + + if stat_vars and (entities or (parent_place and enclosed_place_type)): # 1. Resolve large expressions explicitly so we can safely chunk entities - if entity_expression: - match = re.search(r'(.*)<-containedInPlace\+\{typeOf:(.*)\}', - entity_expression) - if match: - parent_entity = match.group(1) - child_type = match.group(2) - if parent_entity in _BATCHED_CALL_PLACES.get(child_type, []): - try: - logging.info( - f"Resolving massive entity expression for batching: {entity_expression}" - ) - child_places_resp = await asyncio.to_thread(fetch.descendent_places, - [parent_entity], - child_type) - entities = child_places_resp.get(parent_entity, []) - entity_expression = "" - except Exception: - logging.exception( - "Failed to resolve descendent places for batching") + if parent_place and enclosed_place_type: + if parent_place in _BATCHED_CALL_PLACES.get(enclosed_place_type, []): + try: + child_places_resp = await asyncio.to_thread(fetch.descendent_places, + [parent_place], + enclosed_place_type) + entities = child_places_resp.get(parent_place, []) + entity_expression = "" + except Exception: + logging.exception("Failed to resolve descendent places for batching") + entity_expression = f"{parent_place}<-containedInPlace+{{typeOf:{enclosed_place_type}}}" + else: + entity_expression = f"{parent_place}<-containedInPlace+{{typeOf:{enclosed_place_type}}}" # 2. Establish base query kwargs base_kwargs = {'select': ['entity', 'variable', 'facet']} diff --git a/static/js/tools/download/page.tsx b/static/js/tools/download/page.tsx index 483769dd4c..13a71ea0cd 100644 --- a/static/js/tools/download/page.tsx +++ b/static/js/tools/download/page.tsx @@ -186,9 +186,9 @@ export function Page(props: PagePropType): ReactElement { } } - const entityExpression = `${selectedOptions.selectedPlace.dcid}<-containedInPlace+{typeOf:${selectedOptions.enclosedPlaceType}}`; const enrichedFacets = await fetchFacetsWithMetadata(baseFacets, { - entityExpression, + parentPlace: selectedOptions.selectedPlace.dcid, + enclosedPlaceType: selectedOptions.enclosedPlaceType, }); const sourceSelectorFacetList = []; diff --git a/static/js/tools/scatter/chart_loader.tsx b/static/js/tools/scatter/chart_loader.tsx index 29d081b461..dbf74c8286 100644 --- a/static/js/tools/scatter/chart_loader.tsx +++ b/static/js/tools/scatter/chart_loader.tsx @@ -98,14 +98,11 @@ export function ChartLoader(): ReactElement { const { x, y, place, display } = useContext(Context); const cache = useCache(); const chartData = useChartData(cache); - - const entityExpression = - place.value.enclosingPlace.dcid && place.value.enclosedPlaceType - ? `${place.value.enclosingPlace.dcid}<-containedInPlace+{typeOf:${place.value.enclosedPlaceType}}` - : undefined; - const { facetSelectorMetadata, facetListLoading, facetListError } = - useFacetMetadata(cache?.baseFacets || null, { entityExpression }); + useFacetMetadata(cache?.baseFacets || null, { + parentPlace: place.value.enclosingPlace.dcid, + enclosedPlaceType: place.value.enclosedPlaceType, + }); const containerRef = useRef(null); const embedModalElement = useRef(null); diff --git a/static/js/tools/scatter/compute/facet_metadata.ts b/static/js/tools/scatter/compute/facet_metadata.ts index 60be3afa4c..f7492f82a8 100644 --- a/static/js/tools/scatter/compute/facet_metadata.ts +++ b/static/js/tools/scatter/compute/facet_metadata.ts @@ -43,7 +43,11 @@ type FacetMetadataReturn = { */ export function useFacetMetadata( baseFacets: FacetResponse | null, - entityContext: { entities?: string[]; entityExpression?: string } = {} + entityContext: { + entities?: string[]; + parentPlace?: string; + enclosedPlaceType?: string; + } = {} ): FacetMetadataReturn { const [facetMetadata, setFacetMetadata] = useState({ facetSelectorMetadata: {}, @@ -52,7 +56,7 @@ export function useFacetMetadata( }); // Extract the primitive values for safe dependency tracking - const { entities, entityExpression } = entityContext; + const { entities, parentPlace, enclosedPlaceType } = entityContext; const entitiesString = entities?.join(","); useEffect(() => { @@ -70,7 +74,8 @@ export function useFacetMetadata( // Pass the extracted values back into the fetcher const resp = await fetchFacetsWithMetadata(baseFacets, { entities: entitiesString ? entitiesString.split(",") : undefined, - entityExpression, + parentPlace, + enclosedPlaceType, }); if (cancelled) return; @@ -96,7 +101,7 @@ export function useFacetMetadata( return () => { cancelled = true; }; - }, [baseFacets, entitiesString, entityExpression]); + }, [baseFacets, entitiesString, parentPlace, enclosedPlaceType]); return facetMetadata; } diff --git a/static/js/tools/shared/facet_choice_fetcher.ts b/static/js/tools/shared/facet_choice_fetcher.ts index cdfc6147e0..f693a30e27 100644 --- a/static/js/tools/shared/facet_choice_fetcher.ts +++ b/static/js/tools/shared/facet_choice_fetcher.ts @@ -82,11 +82,9 @@ export async function fetchFacetChoicesWithin( ); const baseFacets = Object.assign({}, ...(await Promise.all(facetPromises))); - // Construct the expression to allow accurate date resolution on the backend - const entityExpression = `${parentPlace}<-containedInPlace+{typeOf:${enclosedPlaceType}}`; - const enrichedFacets = await fetchFacetsWithMetadata(baseFacets, { - entityExpression, + parentPlace, + enclosedPlaceType, }); return statVars.map((sv) => ({ dcid: sv.dcid, diff --git a/static/js/tools/shared/metadata/metadata_fetcher.ts b/static/js/tools/shared/metadata/metadata_fetcher.ts index 83d3850910..5074c85883 100644 --- a/static/js/tools/shared/metadata/metadata_fetcher.ts +++ b/static/js/tools/shared/metadata/metadata_fetcher.ts @@ -296,7 +296,11 @@ async function fetchNodeProperty( */ export async function fetchFacetsWithMetadata( facets: FacetResponse, - entityContext: { entities?: string[]; entityExpression?: string }, + entityContext: { + entities?: string[]; + parentPlace?: string; + enclosedPlaceType?: string; + }, apiRoot = "" ): Promise { const statVars = Object.keys(facets); @@ -310,7 +314,8 @@ export async function fetchFacetsWithMetadata( facets, statVars, entities: entityContext.entities, - entityExpression: entityContext.entityExpression, + parentPlace: entityContext.parentPlace, + enclosedPlaceType: entityContext.enclosedPlaceType, }), }); if (!response.ok) { From 6839c0126a0aea08c2fc8a6266edc0bcf6740ee0 Mon Sep 17 00:00:00 2001 From: Nick Blumberg Date: Wed, 18 Mar 2026 14:09:51 -0700 Subject: [PATCH 3/8] Add gate to prevent too many stat vars being sent into the request (very unlikely) --- server/routes/shared_api/metadata.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/server/routes/shared_api/metadata.py b/server/routes/shared_api/metadata.py index c4b1a7c2af..5d6f81cdd1 100644 --- a/server/routes/shared_api/metadata.py +++ b/server/routes/shared_api/metadata.py @@ -412,6 +412,12 @@ async def get_metadata() -> tuple[Response, int] | Response: if not isinstance(entities, list) or not isinstance(stat_vars, list): return jsonify({'error': 'entities and statVars must be lists'}), 400 + if len(stat_vars) > _MAX_BATCH_SIZE: + return jsonify({ + 'error': + f'Too many Statistical Variables requested. Maximum allowed is {_MAX_BATCH_SIZE}' + }), 400 + if not entities or not stat_vars: return jsonify({'metadata': {}, 'statVarList': []}) @@ -501,6 +507,12 @@ async def enrich_facets() -> tuple[Response, int] | Response: parent_place = req_data.get('parentPlace', '') enclosed_place_type = req_data.get('enclosedPlaceType', '') + if len(stat_vars) > _MAX_BATCH_SIZE: + return jsonify({ + 'error': + f'Too many Statistical Variables requested. Maximum allowed is {_MAX_BATCH_SIZE}' + }), 400 + if not facets: return jsonify({}) From 1fa4087ea5aec6de12752fbe4bd5131d2e949404 Mon Sep 17 00:00:00 2001 From: Nick Blumberg Date: Wed, 18 Mar 2026 14:17:16 -0700 Subject: [PATCH 4/8] Clean up dictionary merging. --- server/routes/shared_api/metadata.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/server/routes/shared_api/metadata.py b/server/routes/shared_api/metadata.py index 5d6f81cdd1..a01b466c30 100644 --- a/server/routes/shared_api/metadata.py +++ b/server/routes/shared_api/metadata.py @@ -575,15 +575,11 @@ async def enrich_facets() -> tuple[Response, int] | Response: if not res or 'byVariable' not in res: continue for sv, sv_data in res.get('byVariable', {}).items(): - if sv not in obs_resp['byVariable']: - obs_resp['byVariable'][sv] = {'byEntity': {}} + entity_dict = obs_resp['byVariable'].setdefault(sv, {}).setdefault('byEntity', {}) for ent, ent_data in sv_data.get('byEntity', {}).items(): - if ent not in obs_resp['byVariable'][sv]['byEntity']: - obs_resp['byVariable'][sv]['byEntity'][ent] = { - 'orderedFacets': [] - } - obs_resp['byVariable'][sv]['byEntity'][ent][ - 'orderedFacets'].extend(ent_data.get('orderedFacets', [])) + entity_dict.setdefault(ent, {'orderedFacets': []})['orderedFacets'].extend( + ent_data.get('orderedFacets', []) + ) except Exception: logging.exception( "v2observation failed in enrich_facets. Date ranges may be missing.") From 623bb59c060b4540d54108b67b18a86b28fb8455 Mon Sep 17 00:00:00 2001 From: Nick Blumberg Date: Wed, 18 Mar 2026 14:46:26 -0700 Subject: [PATCH 5/8] Remove unused endpoint. --- server/routes/shared_api/metadata.py | 1 - 1 file changed, 1 deletion(-) diff --git a/server/routes/shared_api/metadata.py b/server/routes/shared_api/metadata.py index a01b466c30..46c6dce3ae 100644 --- a/server/routes/shared_api/metadata.py +++ b/server/routes/shared_api/metadata.py @@ -15,7 +15,6 @@ import asyncio import collections import logging -import re from typing import Any from flask import Blueprint From cc014e1617db09d6170fd72b330d06330f1540f3 Mon Sep 17 00:00:00 2001 From: Nick Blumberg Date: Wed, 18 Mar 2026 14:55:32 -0700 Subject: [PATCH 6/8] Simplify enclosed place logic in endpoint. --- server/routes/shared_api/metadata.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/server/routes/shared_api/metadata.py b/server/routes/shared_api/metadata.py index 46c6dce3ae..c8460abb48 100644 --- a/server/routes/shared_api/metadata.py +++ b/server/routes/shared_api/metadata.py @@ -522,18 +522,21 @@ async def enrich_facets() -> tuple[Response, int] | Response: # 1. Resolve large expressions explicitly so we can safely chunk entities if parent_place and enclosed_place_type: + # Default to using an entity expression. + entity_expression = f"{parent_place}<-containedInPlace+{{typeOf:{enclosed_place_type}}}" + # If the place is known to have many children, pre-fetch entities for batching. if parent_place in _BATCHED_CALL_PLACES.get(enclosed_place_type, []): try: child_places_resp = await asyncio.to_thread(fetch.descendent_places, [parent_place], enclosed_place_type) entities = child_places_resp.get(parent_place, []) + # If successful, clear the expression to use the explicit entity list. entity_expression = "" except Exception: - logging.exception("Failed to resolve descendent places for batching") - entity_expression = f"{parent_place}<-containedInPlace+{{typeOf:{enclosed_place_type}}}" - else: - entity_expression = f"{parent_place}<-containedInPlace+{{typeOf:{enclosed_place_type}}}" + logging.exception( + "Failed to resolve descendent places for batching. Falling back to entity expression." + ) # 2. Establish base query kwargs base_kwargs = {'select': ['entity', 'variable', 'facet']} From 9adeab4543120904f4ec012b46458ab820b9c7f3 Mon Sep 17 00:00:00 2001 From: Nick Blumberg Date: Wed, 18 Mar 2026 15:10:40 -0700 Subject: [PATCH 7/8] Lint --- server/routes/shared_api/metadata.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/server/routes/shared_api/metadata.py b/server/routes/shared_api/metadata.py index c8460abb48..2980c1235f 100644 --- a/server/routes/shared_api/metadata.py +++ b/server/routes/shared_api/metadata.py @@ -577,11 +577,11 @@ async def enrich_facets() -> tuple[Response, int] | Response: if not res or 'byVariable' not in res: continue for sv, sv_data in res.get('byVariable', {}).items(): - entity_dict = obs_resp['byVariable'].setdefault(sv, {}).setdefault('byEntity', {}) + entity_dict = obs_resp['byVariable'].setdefault(sv, {}).setdefault( + 'byEntity', {}) for ent, ent_data in sv_data.get('byEntity', {}).items(): - entity_dict.setdefault(ent, {'orderedFacets': []})['orderedFacets'].extend( - ent_data.get('orderedFacets', []) - ) + entity_dict.setdefault(ent, {'orderedFacets': [ + ]})['orderedFacets'].extend(ent_data.get('orderedFacets', [])) except Exception: logging.exception( "v2observation failed in enrich_facets. Date ranges may be missing.") From 5e2778ed3dbb5676c4d57ab8b3ed7905a0a7360c Mon Sep 17 00:00:00 2001 From: Nick Blumberg Date: Thu, 19 Mar 2026 09:03:01 -0700 Subject: [PATCH 8/8] Improve commenting in the metadata.py file. --- server/routes/shared_api/metadata.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/server/routes/shared_api/metadata.py b/server/routes/shared_api/metadata.py index 2980c1235f..c2dbfb6e89 100644 --- a/server/routes/shared_api/metadata.py +++ b/server/routes/shared_api/metadata.py @@ -36,6 +36,7 @@ MEASUREMENT_METHODS_SUPPRESSION_PROVENANCES: set[str] = {"WikipediaStatsData"} # TODO (nick-nlb): merge the below constants with series.py. +# TODO (nick-nlb): factor out the commonly used functions into a separate file # Maximum number of concurrent series the server will fetch in a single chunk _MAX_BATCH_SIZE = 2000 @@ -120,6 +121,8 @@ def _extract_facet_date_ranges( earliest, latest = f.get('earliestDate'), f.get('latestDate') + # If this date would expand the earliest or latest date boundaries of the facet + # then we expand the facet date boundaries to match. if earliest and (not facet_date_ranges[fid].get('earliestDate') or earliest < facet_date_ranges[fid]['earliestDate']): facet_date_ranges[fid]['earliestDate'] = earliest @@ -593,6 +596,7 @@ async def enrich_facets() -> tuple[Response, int] | Response: measurement_methods = set() units = set() + # Collect unique references across all facets to batch-fetch their secondary metadata. for sv, sv_facets in facets.items(): for fid, finfo in sv_facets.items(): if finfo.get('importName'): @@ -605,14 +609,17 @@ async def enrich_facets() -> tuple[Response, int] | Response: prov_map, linked_names_map, mm_map, unit_map = await _fetch_secondary_metadata( provenance_endpoints, measurement_methods, units) + # Enrich the original facets with the resolved metadata and calculated date boundaries. for sv, sv_facets in facets.items(): for fid, finfo in sv_facets.items(): dr = facet_date_ranges.get(fid, {}) + # Apply the expanded min/max date boundaries. if dr.get('earliestDate'): finfo['dateRangeStart'] = dr.get('earliestDate') if dr.get('latestDate'): finfo['dateRangeEnd'] = dr.get('latestDate') + # Resolve human-readable source and publisher names from the provenance. import_name = finfo.get('importName') if import_name: prov_id = f"dc/base/{import_name}" @@ -623,6 +630,7 @@ async def enrich_facets() -> tuple[Response, int] | Response: finfo['provenanceName'] = _get_node_name(pdata.get('isPartOf', []), linked_names_map) or \ _get_node_name(pdata.get('name', []), linked_names_map) or import_name + # Attach descriptive display names for the measurement method and unit. mm = finfo.get('measurementMethod') if mm and finfo.get( 'provenanceName') not in MEASUREMENT_METHODS_SUPPRESSION_PROVENANCES: