3 changes: 2 additions & 1 deletion census_getter/__init__.py
@@ -5,4 +5,5 @@
__doc__ = 'census_getter'
__all__ = ['setup_data_structures']

-from . import util
+from . import util
+from . import census_helpers
311 changes: 311 additions & 0 deletions census_getter/census_helpers.py
@@ -0,0 +1,311 @@
# Copyright (c) 2016, UrbanSim Inc. All rights reserved.

# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:

# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.

# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.

# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software without
# specific prior written permission.

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import census
import pandas as pd
import numpy as np
import us
import requests
# the Census API fails intermittently, so mount a session that retries
sess = requests.Session()
adapter = requests.adapters.HTTPAdapter(max_retries=100)
sess.mount('https://', adapter)

class Census:
    """Thin wrapper around the Census ACS 5-year API plus PUMS downloads.

    Queries ACS tables at the tract and block group level, scales
    tract-level variables down to block groups, maps tracts to PUMAs,
    and downloads (and caches) the matching PUMS population and
    household files from remote storage.
    """

def __init__(self, key, acsyear=2016):
self.c = census.Census(key, session=sess)
self.base_url = synthpop_config(acsyear).pums_storage()
self.support_files = geog_changes_path(acsyear).geog_change_storage()
self.acsyear_files = acsyear
self.pums_relationship_file_url = self.support_files + "tract10_to_puma.csv"
self.pums_relationship_df = None
self.pums10_population_base_url = \
self.base_url + "puma10_p_%s_%s.csv"
self.pums10_household_base_url = \
self.base_url + "puma10_h_%s_%s.csv"
self.pums00_population_base_url = \
self.base_url + "puma00_p_%s_%s.csv"
self.pums00_household_base_url = \
self.base_url + "puma00_h_%s_%s.csv"
self.pums_population_state_base_url = \
self.base_url + "puma_p_%s.csv"
self.pums_household_state_base_url = \
self.base_url + "puma_h_%s.csv"
self.fips_url = self.base_url + "national_county.txt"
self.fips_df = None
self.pums_cache = {}

# df1 is the disaggregate data frame (e.g. block groups)
# df2 is the aggregate data frame (e.g. tracts)
# need to scale down df2 variables to df1 level totals
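    # e.g. (illustrative numbers) a tract value of 90 for some variable,
    # with a tract size total of 100, is assigned to a block group whose
    # size total is 40 as 90 / 100 * 40 = 36 (before the int cast below)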
def _scale_and_merge(self, df1, tot1, df2, tot2, columns_to_scale,
merge_columns, suffixes):
df = pd.merge(df1, df2, left_on=merge_columns, right_on=merge_columns,
suffixes=suffixes)

# going to scale these too so store current values
tot2, tot1 = df[tot2], df[tot1]
        # if the aggregate total is 0, the disaggregate values must also be 0
        # (the resulting 0/0 NaNs are handled by the fillna below)
assert np.all(tot1[tot2 == 0] == 0)

for col in columns_to_scale:
df[col] = df[col] / tot2 * tot1
            # note: astype('int') truncates toward zero rather than rounding
df[col] = df[col].fillna(0).astype('int')
return df

def block_group_query(self, census_columns, state, county, year=2016,
tract=None, id=None):
if id is None:
id = "*"
return self._query(census_columns, state, county,
forstr="block group:%s" % id,
tract=tract, year=year)

def tract_query(self, census_columns, state, county, year=2016,
tract=None):
if tract is None:
tract = "*"
return self._query(census_columns, state, county,
forstr="tract:%s" % tract,
year=year)

def _query(self, census_columns, state, county, forstr,
year, tract=None):
c = self.c

#state, county = self.try_fips_lookup(state, county)

if tract is None:
in_str = 'state:%s county:%s' % (state, county)
else:
in_str = 'state:%s county:%s tract:%s' % (state, county, tract)

dfs = []

        # the API accepts at most 50 variables per request, so batch the
        # columns, leaving room for a few extra id columns
        def chunks(l, n):
            """Yield successive n-sized chunks from l,
            e.g. chunks([1, 2, 3, 4, 5], 2) -> [1, 2], [3, 4], [5].
            """
            for i in range(0, len(l), n):
                yield l[i:i + n]

for census_column_batch in chunks(census_columns, 45):
census_column_batch = list(census_column_batch)
d = c.acs5.get(['NAME'] + census_column_batch,
geo={'for': forstr,
'in': in_str}, year=year)
df = pd.DataFrame(d)
df[census_column_batch] = df[census_column_batch].astype('int')
dfs.append(df)

assert len(dfs) >= 1
df = dfs[0]
for mdf in dfs[1:]:
df = pd.merge(df, mdf, on="NAME", suffixes=("", "_ignore"))
drop_cols = list(filter(lambda x: "_ignore" in x, df.columns))
df = df.drop(drop_cols, axis=1)

return df

def block_group_and_tract_query(self, block_group_columns,
tract_columns, state, county,
merge_columns, block_group_size_attr,
tract_size_attr, year=2016, tract=None):
df2 = self.tract_query(tract_columns, state, county, tract=tract,
year=year)
df1 = self.block_group_query(block_group_columns, state, county,
tract=tract, year=year)

df = self._scale_and_merge(df1, block_group_size_attr, df2,
tract_size_attr, tract_columns,
merge_columns, suffixes=("", "_ignore"))
drop_cols = list(filter(lambda x: "_ignore" in x, df.columns))
df = df.drop(drop_cols, axis=1)

return df

def update_geographies(self, df):
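        """Apply recorded Census geography changes to the tract->PUMA table.

        geog_changes.csv lists old/new geography ids by year: tract-level
        changes (ids longer than 5 digits) append a new row carrying over
        the old tract's PUMA assignments, while county-level changes
        duplicate the old county's rows under the new county code. A
        county rename such as Shannon County, SD (46113) -> Oglala Lakota
        County (46102, effective 2015) is the kind of change the county
        branch handles (illustrative example, not necessarily in the file).
        """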
acsyear = self.acsyear_files
changes = pd.read_csv(self.support_files + 'geog_changes.csv',
dtype={'new_geog': 'str', 'old_geog': 'str'})
for year in range(2011, acsyear):
year_change = changes[changes['year'] == year].copy()
if len(year_change) > 0:
for index, row in year_change.iterrows():
new = row['new_geog']
old = row['old_geog']
state_new = new[:2]
state_old = old[:2]
county_new = new[2:5]
county_old = old[2:5]
if len(new) > 5:
tract_new = new[5:]
tract_old = old[5:]
idx = df.index.max() + 1
df.loc[idx, 'statefp'] = state_new
df.loc[idx, 'countyfp'] = county_new
df.loc[idx, 'tractce'] = tract_new
old_puma10 = df[(df['statefp'] == state_old) &
(df['countyfp'] == county_old) &
(df['tractce'] == tract_old)]['puma10_id'].values[0]
old_puma00 = df[(df['statefp'] == state_old) &
(df['countyfp'] == county_old) &
(df['tractce'] == tract_old)]['puma00_id'].values[0]
df.loc[idx, 'puma10_id'] = old_puma10
df.loc[idx, 'puma00_id'] = old_puma00
else:
df_change = df[(df['statefp'] == state_old) &
(df['countyfp'] == county_old)].copy()
df_change.loc[:, 'countyfp'] = county_new
df = pd.concat([df, df_change])
return df

def _get_pums_relationship(self):
if self.pums_relationship_df is None:
self.pums_relationship_df = \
pd.read_csv(self.pums_relationship_file_url, dtype={
"statefp": "object",
"countyfp": "object",
"tractce": "object",
"puma10_id": "object",
"puma00_id": "object",
})
self.pums_relationship_df = self.update_geographies(self.pums_relationship_df)
return self.pums_relationship_df

def _get_fips_lookup(self):
if self.fips_df is None:
self.fips_df = pd.read_csv(
self.fips_url,
dtype={
"State ANSI": "object",
"County ANSI": "object"
},
index_col=["State",
"County Name"]
)
del self.fips_df["ANSI Cl"]
return self.fips_df

def tract_to_puma(self, state, county, tract):

state, county = self.try_fips_lookup(state, county)

df = self._get_pums_relationship()
q = "statefp == '%s' and countyfp == '%s' and tractce == '%s'" % (state, county, tract)
r = df.query(q)
return r["puma10_id"].values[0], r["puma00_id"].values[0]

def _read_csv(self, loc, **kargs):
if loc not in self.pums_cache:
pums_df = pd.read_csv(loc, dtype={
"PUMA10": "object",
"PUMA00": "object",
"ST": "object",
"SERIALNO": 'str',
"serialno": 'str',
}, **kargs)
pums_df = pums_df.rename(columns={
'PUMA10': 'puma10',
'PUMA00': 'puma00',
'SERIALNO': 'serialno'
})
self.pums_cache[loc] = pums_df
return self.pums_cache[loc]

def download_population_pums(self, state, puma10=None, puma00=None, **kargs):
state = self.try_fips_lookup(state)
        if puma10 is None and puma00 is None:
return self._read_csv(self.pums_population_state_base_url % (state), **kargs)
pums = self._read_csv(self.pums10_population_base_url % (state, puma10), **kargs)
        if puma00 is not None and self.acsyear_files < 2018:
pums00 = self._read_csv(self.pums00_population_base_url % (state, puma00), **kargs)
pums = pd.concat([pums, pums00], ignore_index=True)
return pums

def download_household_pums(self, state, puma10=None, puma00=None, **kargs):
state = self.try_fips_lookup(state)
        if puma10 is None and puma00 is None:
return self._read_csv(self.pums_household_state_base_url % (state), **kargs)
pums = self._read_csv(self.pums10_household_base_url % (state, puma10), **kargs)
        if puma00 is not None and self.acsyear_files < 2018:
pums00 = self._read_csv(self.pums00_household_base_url % (state, puma00), **kargs)
pums = pd.concat([pums, pums00], ignore_index=True)

        # keep only occupied housing-unit records: RT == 'H' (housing record),
        # NP > 0 (at least one person), TYPE == 1 (housing unit, i.e. not
        # group quarters or vacant)
        pums = pums[(pums.RT == 'H') & (pums.NP > 0) & (pums.TYPE == 1)]

return pums

    def try_fips_lookup(self, state, county=None):
        df = self._get_fips_lookup()

        if county is None:
            # e.g. 'WA' -> '53'; values that are already FIPS codes fall through
            try:
                return getattr(us.states, state).fips
            except AttributeError:
                return state

        try:
            return df.loc[(state, county)]
        except KeyError:
            return state, county

class synthpop_config:

def __init__(self, acsyear=2013):
self.acsyear = acsyear

def pums_storage(self):
if self.acsyear >= 2018:
storage = "https://storage.googleapis.com/synthpop-public/PUMS2018/pums_2018_acs5/"
else:
storage = "https://s3-us-west-1.amazonaws.com/synthpop-data2/"
return storage

def __call__(self):
return self.pums_storage()


class geog_changes_path:
def __init__(self, acsyear):
self.acsyear = acsyear

def geog_change_storage(self):
storage = "https://storage.googleapis.com/synthpop-public/support_files/"
return storage

def __call__(self):
return self.geog_change_storage()
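
A minimal sketch of how the new helper might be driven, assuming a valid api.census.gov key; the column, tract, and FIPS codes are illustrative placeholders (B01003_001E is the ACS total-population variable; 53/033 is King County, WA):

```python
from census_getter.census_helpers import Census

c = Census("YOUR_CENSUS_API_KEY", acsyear=2016)

# block-group-level total population for one county
bg = c.block_group_query(["B01003_001E"], "53", "033", year=2016)

# resolve a tract to its 2010 and 2000 PUMAs, then pull the matching
# household PUMS file (cached after the first download)
puma10, puma00 = c.tract_to_puma("53", "033", "000100")
hh = c.download_household_pums("53", puma10=puma10, puma00=puma00)
```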
2 changes: 1 addition & 1 deletion census_getter/steps/apply_acs_shares.py
@@ -12,7 +12,7 @@

from .. util import setting, create_block_group_id, create_full_block_group_id
#from census_getter.util import create_block_group_id
-from synthpop.census_helpers import Census
+from ..census_helpers import Census

logger = logging.getLogger(__name__)

2 changes: 1 addition & 1 deletion census_getter/steps/get_acs_data.py
@@ -10,7 +10,7 @@
from activitysim.core import pipeline

from .. util import setting, create_full_block_group_id, create_block_group_id
-from synthpop.census_helpers import Census
+from .. census_helpers import Census

logger = logging.getLogger(__name__)

2 changes: 1 addition & 1 deletion census_getter/steps/prepare_pums.py
@@ -10,7 +10,7 @@
from activitysim.core import assign

from .. util import setting, create_block_group_id
-from synthpop.census_helpers import Census
+from ..census_helpers import Census

logger = logging.getLogger(__name__)

2 changes: 1 addition & 1 deletion census_getter/util.py
@@ -83,4 +83,4 @@ def data_dir_from_settings():
data_dir = inject.get_injectable('data_dir')

logger.info("data_dir: %s" % data_dir)
-    return data_dir
+    return data_dir
File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -1,6 +1,6 @@
import os
import logging
-os.chdir(r'E:\census_getter\census_getter\example_psrc')
+#os.chdir(r'E:\census_getter\census_getter\example_psrc')
working_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(working_dir, os.pardir))
os.sys.path.append(os.path.join(parent_dir))
25 changes: 25 additions & 0 deletions pyproject.toml
@@ -0,0 +1,25 @@
[project]
name = "census-getter"
version = "0.0.1"
description = "ACS Data to Pandas Dataframes"
authors = [
{name = "PSRC", email = "[email protected]"}
]
license = {text = "MIT"}
readme = "README.md"
requires-python = ">=3.10, <3.12"
dependencies = [
"activitysim==1.2.1",
"census",
"numpy==1.23.4",
"setuptools",
"us"
]

[project.scripts]
census-getter = "census_getter.cli.main:main"

[tool.uv]
package = true
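
With `package = true` set, `uv sync` should build and install census-getter itself into the project environment (not just its dependencies), so the `census-getter` entry point defined above can be run via `uv run census-getter` — assuming uv is the intended workflow here.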

