Skip to content

Commit f68e335

Browse files
committed
rest of new fn layout. adds CLI
1 parent 3492573 commit f68e335

File tree

5 files changed

+82
-77
lines changed

5 files changed

+82
-77
lines changed

src/acquisition/rvdss/constants.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,8 +102,9 @@
102102
DASHBOARD_UPDATE_DATE_FILE = "RVD_UpdateDate.csv"
103103
DASHBOARD_DATA_FILE = "RVD_WeeklyData.csv"
104104

105-
RESP_COUNTS_OUTPUT_FILE = "respiratory_detections.csv"
105+
RESP_DETECTIONS_OUTPUT_FILE = "respiratory_detections.csv"
106106
POSITIVE_TESTS_OUTPUT_FILE = "positive_tests.csv"
107+
COUNTS_OUTPUT_FILE = "number_of_detections.csv"
107108

108109
LAST_WEEK_OF_YEAR = 35
109110

src/acquisition/rvdss/pull_current.py

Lines changed: 0 additions & 42 deletions
This file was deleted.

src/acquisition/rvdss/pull_historic.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from delphi.epidata.acquisition.rvdss.constants import (
1818
DASHBOARD_BASE_URLS_2023_2024_SEASON, HISTORIC_SEASON_URLS,
1919
ALTERNATIVE_SEASON_BASE_URL, SEASON_BASE_URL, LAST_WEEK_OF_YEAR,
20-
RESP_COUNTS_OUTPUT_FILE, POSITIVE_TESTS_OUTPUT_FILE
20+
RESP_DETECTIONS_OUTPUT_FILE, POSITIVE_TESTS_OUTPUT_FILE
2121
)
2222
from delphi.epidata.acquisition.rvdss.utils import (
2323
abbreviate_virus, abbreviate_geo, create_geo_types, check_date_format,
@@ -548,6 +548,10 @@ def fetch_one_season_from_report(url):
548548
"count": all_number_tables,
549549
}
550550

551+
def fetch_archived_dashboard_urls():
    """Return the list of archived dashboard URLs (not yet implemented).

    Currently a stub that returns None.
    ## TODO: paste in Christine's code for scraping this list
    ## https://health-infobase.canada.ca/respiratory-virus-detections/archive.html
    """
    pass
554+
551555
def fetch_report_data():
552556
# Scrape each season.
553557
dict_list = [fetch_one_season_from_report(url) for url in HISTORIC_SEASON_URLS]
@@ -560,10 +564,6 @@ def fetch_historical_dashboard_data():
560564
dict_list = [{} for url in included_urls]
561565

562566
for i, base_url in enumerate(included_urls):
563-
# Get weekly dashboard data
564-
## TODO: what to do with this "2023"? Need to parse the start year of the season from the URL
565-
## TODO: how to "weekly" and "positive" correspond to the dict keys from historical reports?
566-
dict_list[i]["weekly"] = get_weekly_data(base_url,2023).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
567-
dict_list[i]["positive"] = get_revised_data(base_url)
567+
dict_list[i]["weekly"] = fetch_dashboard_data(url, 2023)
568568

569569
return dict_list

src/acquisition/rvdss/run.py

Lines changed: 54 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,45 +1,72 @@
1+
"""
2+
Defines command line interface for the rvdss indicator. Current data (covering the most recent epiweek) and historical data (covering all data before the most recent epiweek) can be generated together or separately.
3+
4+
Defines top-level functions to fetch data and save to disk or DB.
5+
"""
6+
17
import pandas as pd
8+
import os
9+
10+
from delphi.epidata.acquisition.rvdss.utils import get_weekly_data, get_revised_data, get_dashboard_update_date
11+
from delphi.epidata.acquisition.rvdss.constants import DASHBOARD_BASE_URL, RESP_DETECTIONS_OUTPUT_FILE, POSITIVE_TESTS_OUTPUT_FILE, COUNTS_OUTPUT_FILE
12+
13+
14+
def update_current_data():
    """Fetch the current season's dashboard data and write/merge it to disk.

    Pulls the most recent weekly tables from the live dashboard, then for each
    table type either creates its CSV under ``base_path`` or appends the new
    rows to the existing CSV (skipping rows whose index is already present).
    """
    ## TODO: what is the base path for these files?
    base_path = "."

    # NOTE(review): the 2024 start-year is hard-coded here, like the dashboard
    # URL — see the TODO on get_weekly_data about deriving it instead.
    data_dict = fetch_dashboard_data(DASHBOARD_BASE_URL, 2024)

    table_types = {
        "respiratory_detection": RESP_DETECTIONS_OUTPUT_FILE,
        "positive": POSITIVE_TESTS_OUTPUT_FILE,
        # "count": COUNTS_OUTPUT_FILE, # Dashboards don't contain this data.
    }
    for tt, filename in table_types.items():
        # BUGFIX: fetch this table type's DataFrame. The original indexed with
        # the whole `table_types` dict (`data_dict[table_types]`), which raises
        # TypeError (unhashable) at runtime.
        data = data_dict[tt]

        # Write the tables to separate csvs
        path = base_path + "/" + filename

        # Since this function generates new data weekly, we need to combine it
        # with the existing data, if it exists.
        if not os.path.exists(path):
            data.to_csv(path, index=True)
        else:
            old_data = pd.read_csv(path).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])

            # If index already exists in the data on disk, don't add the new
            # data -- we may have already run the weekly data fetch.
            ## TODO: The check on index maybe should be stricter? Although we
            ## do deduplication upstream, so this probably won't find true duplicates
            if not data.index.isin(old_data.index).any():
                old_data = pd.concat([old_data, data], axis=0)
                old_data.to_csv(path, index=True)

    # ## TODO
    # update_database(data)
1445

15-
def update_current_data(start_date, end_date):
16-
data = fetch_current_dashboard_data()
17-
update_database(data)
1846

1947
def update_historical_data():
    """Fetch all historical data and write one CSV per table type.

    Combines per-season tables scraped from the historical reports with tables
    from the archived dashboards, concatenates them per table type, and writes
    each result under ``base_path``.
    """
    ## TODO: what is the base path for these files?
    base_path = "."

    report_dict_list = fetch_report_data()
    dashboard_dict_list = fetch_historical_dashboard_data()

    table_types = {
        "respiratory_detection": RESP_DETECTIONS_OUTPUT_FILE,
        "positive": POSITIVE_TESTS_OUTPUT_FILE,
        "count": COUNTS_OUTPUT_FILE,
    }
    for tt, filename in table_types.items():
        # Merge tables together from dashboards and reports for each table type.
        dashboard_data = [elem.get(tt, None) for elem in dashboard_dict_list]
        report_data = [elem.get(tt, None) for elem in report_dict_list]

        # BUGFIX: plain Python lists have no .concat() method, so the original
        # `[report_data, dashboard_data].concat()` raised AttributeError.
        # Flatten both lists, drop sources that don't provide this table type
        # (e.g. dashboards have no "count" data), and concatenate with pandas.
        frames = [df for df in report_data + dashboard_data if df is not None]
        if not frames:
            # No source produced this table type; nothing to write.
            continue
        data = pd.concat(frames)

        # Write the tables to separate csvs
        data.to_csv(base_path + "/" + filename, index=True)

    # ## TODO
    # update_database(data)
4370

4471

4572
def main():

src/acquisition/rvdss/utils.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ def get_revised_data(base_url,headers,update_date):
9494

9595
return(df)
9696

97+
## TODO: the `start_year` arg is making calling this complicated. If we know that LAST_WEEK_OF_YEAR (really, of the season) is always 35, then we should be able to derive `start_year` from `update_date`.
9798
def get_weekly_data(base_url,start_year,headers,update_date):
9899
# Get current week and year
99100
summary_url = base_url + "RVD_SummaryText.csv"
@@ -137,4 +138,22 @@ def get_weekly_data(base_url,start_year,headers,update_date):
137138
# if df_weekly.columns.isin(["weekorder","date","week"]).all():
138139
# df_weekly=df_weekly.drop(["weekorder","date","week"],axis=1)
139140

140-
return(df_weekly)
141+
return(df_weekly.set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value']))
142+
143+
def fetch_dashboard_data(url, start_year):
    """Get data from current or archived dashboard"""
    # Present a browser User-Agent so the dashboard endpoints serve us.
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
    }

    issue_date = get_dashboard_update_date(url, browser_headers)

    ## TODO: how to "weekly" and "positive" correspond to the dict keys ("respiratory_detection", "positive", "count") from historical reports? Need to make sure keys used here are from the same set.
    tables = {}
    tables["respiratory_detection"] = get_weekly_data(url, start_year, browser_headers, issue_date)  ## TODO: ?
    tables["positive"] = get_revised_data(url, browser_headers, issue_date)
    # "count" key omitted: dashboards don't contain this data.
    return tables

0 commit comments

Comments
 (0)