Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 40 additions & 32 deletions scripts/us_census/acs5yr/subject_tables/common/generate_col_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,23 +166,38 @@ def __init__(self, spec_dict={}, column_list=[], delimiter='!!'):

def _find_and_replace_column_names(self, column):
"""
if spec has find_and_replace defined, this function updates column names
Final robust version: Handles long keys containing delimiters
and multiple individual token replacements.
"""
if 'find_and_replace' in self.features['preprocess']:
find_and_replace_dict = self.features['preprocess'][
'find_and_replace']
# replace entire column name
if column in find_and_replace_dict:
return find_and_replace_dict[column]
# replace a token in the column name
else:
# TODO: Support the find_and_replace of more than one token
part_list = column.split(self.delimiter)
for idx, part in enumerate(part_list):
if part in find_and_replace_dict:
part_list[idx] = find_and_replace_dict[part]
return self.delimiter.join(part_list)
return column
if 'find_and_replace' not in self.features.get('preprocess', {}):
return column

find_and_replace_dict = self.features['preprocess']['find_and_replace']
new_column = column

        # 1. Handle long keys / partial strings that may span multiple tokens
# We sort by length (longest first) so we don't accidentally replace
# a small part of a larger key.
sorted_keys = sorted(find_and_replace_dict.keys(), key=len, reverse=True)

for key in sorted_keys:
if key in new_column:
new_column = new_column.replace(key, find_and_replace_dict[key])

# 2. Token-based replacement (as a backup for exact token matches)
# This ensures that if 'INCOME' is a key, it only matches 'INCOME'
# and not 'INCOMES'
parts = new_column.split(self.delimiter)
modified_tokens = False
for idx, part in enumerate(parts):
if part in find_and_replace_dict:
parts[idx] = find_and_replace_dict[part]
modified_tokens = True

if modified_tokens:
new_column = self.delimiter.join(parts)

return new_column
Comment on lines 167 to +200
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The logic in _find_and_replace_column_names is flawed. It performs two sequential replacement passes: a greedy substring replacement followed by a token-based replacement on the already modified string. This can lead to incorrect substitutions (e.g., replacing "INCOME" within "INCOMES") and chained replacements (e.g., A -> B, then B -> C). This could corrupt column names and cause downstream processing errors. A single, more robust replacement mechanism that correctly handles both multi-token keys and single tokens without these side effects should be used.


def _generate_stat_vars_from_spec(self):
"""generates stat_var nodes for each column in column list and
Expand All @@ -203,23 +218,16 @@ def _generate_stat_vars_from_spec(self):
# len((set(self.features['ignoreColumns']) &
# set(col.split(self.delimiter)) > 0:
for col in self.column_list:
# TODO: Replace the type of ignore_token_count to boolean
ignore_token_count = 0
for part in col.split(self.delimiter):
for token in self.features['ignoreColumns']:
if part == token:
ignore_token_count = 1
if token == col:
ignore_token_count = 1

# if no tokens of the columns are in ignoreColumns of the spec
if ignore_token_count == 0:
# Check if any string in ignoreColumns exists within the current header
is_ignored = False
for ignore_pattern in self.features['ignoreColumns']:
if ignore_pattern in col:
is_ignored = True
break
Comment on lines +221 to +226
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The logic for ignoring columns has been changed from exact token matching to substring matching (if ignore_pattern in col:). This is a significant change that could lead to incorrect behavior. For example, if an ignore pattern is "Total", it would match columns like "Total_Households", which might not be the intention. The previous implementation was more precise by checking for exact token matches. This change could cause valid data columns to be skipped from processing. Consider reverting to a more precise matching logic, like the one used previously, perhaps with performance improvements.


# If not ignored, proceed to find_and_replace and statvar generation
if not is_ignored:
renamed_col = self._find_and_replace_column_names(col)
# TODO: Before calling the column_to_statvar method,
# remove the base class or generalization token in the
# column name from the enumSpecialization section of the
# spec.
                # TODO: Should we generate an error if _column_to_statvar() returns an empty statvar?
self.column_map[col] = self._column_to_statvar(renamed_col)

# TODO: Deprecate this function, since enumSpecialization are used to
Expand Down
14 changes: 10 additions & 4 deletions scripts/us_census/api_utils/census_api_data_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,9 @@

module_dir_ = os.path.dirname(os.path.realpath(__file__))
path.insert(1, os.path.join(module_dir_, '../../../'))

from .download_utils import download_url_list_iterations
from download_utils import download_url_list_iterations
from tools.download_utils.requests_wrappers import request_url_json
from .status_file_utils import sync_status_list
from status_file_utils import sync_status_list
Comment on lines +37 to +39
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The imports for download_utils and status_file_utils have been changed from relative to absolute. While this might make the script runnable directly, it reduces modularity and makes the code dependent on sys.path manipulation. It's best practice to use relative imports for modules within the same package to maintain encapsulation.

Suggested change
from download_utils import download_url_list_iterations
from tools.download_utils.requests_wrappers import request_url_json
from .status_file_utils import sync_status_list
from status_file_utils import sync_status_list
from .download_utils import download_url_list_iterations
from tools.download_utils.requests_wrappers import request_url_json
from .status_file_utils import sync_status_list


FLAGS = flags.FLAGS

Expand Down Expand Up @@ -165,6 +164,7 @@ def download_table(dataset: str,
url_list = get_table_url_list(dataset, table_id, q_variable, year_list,
output_path, api_key, s_level_list,
force_fetch_config, force_fetch_data)
print(url_list)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

This print() statement appears to be for debugging purposes. Similar debug prints are present on lines 458 and 461. These should be removed before merging to avoid polluting the output with potentially large amounts of data.


status_path = os.path.join(output_path, 'download_status.json')

Expand Down Expand Up @@ -292,7 +292,11 @@ def consolidate_files(dataset: str,
df = pd.DataFrame()
for csv_file in csv_files_list[year]:
cur_csv_path = os.path.join(output_path, csv_file)
df2 = pd.read_csv(cur_csv_path, low_memory=False)
try:
df2 = pd.read_csv(cur_csv_path, low_memory=False)
except pd.errors.EmptyDataError:
logging.warning('Skipping empty file: %s', cur_csv_path)
continue
print("Collecting", csv_file)
# remove extra columns
drop_list = []
Expand Down Expand Up @@ -451,8 +455,10 @@ def download_table_variables(dataset, table_id, year_list, geo_url_map_path,

def main(argv):
year_list_int = list(range(FLAGS.start_year, FLAGS.end_year + 1))
print("#########################",year_list_int)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

This print() statement appears to be for debugging purposes and should be removed before merging.

year_list = [str(y) for y in year_list_int]
out_path = os.path.expanduser(FLAGS.output_path)
print("#####",FLAGS.summary_levels)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

This print() statement appears to be for debugging purposes and should be removed before merging.

if FLAGS.summary_levels:
s_list = FLAGS.summary_levels
else:
Expand Down