Skip to content

Commit f68e335

Browse files
committed
rest of new fn layout. adds CLI
1 parent 3492573 commit f68e335

File tree

5 files changed

+82
-77
lines changed

5 files changed

+82
-77
lines changed

src/acquisition/rvdss/constants.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,8 +102,9 @@
102102
DASHBOARD_UPDATE_DATE_FILE = "RVD_UpdateDate.csv"
103103
DASHBOARD_DATA_FILE = "RVD_WeeklyData.csv"
104104

105-
RESP_COUNTS_OUTPUT_FILE = "respiratory_detections.csv"
105+
RESP_DETECTIONS_OUTPUT_FILE = "respiratory_detections.csv"
106106
POSITIVE_TESTS_OUTPUT_FILE = "positive_tests.csv"
107+
COUNTS_OUTPUT_FILE = "number_of_detections.csv"
107108

108109
LAST_WEEK_OF_YEAR = 35
109110

src/acquisition/rvdss/pull_current.py

Lines changed: 0 additions & 42 deletions
This file was deleted.

src/acquisition/rvdss/pull_historic.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from delphi.epidata.acquisition.rvdss.constants import (
1818
DASHBOARD_BASE_URLS_2023_2024_SEASON, HISTORIC_SEASON_URLS,
1919
ALTERNATIVE_SEASON_BASE_URL, SEASON_BASE_URL, LAST_WEEK_OF_YEAR,
20-
RESP_COUNTS_OUTPUT_FILE, POSITIVE_TESTS_OUTPUT_FILE
20+
RESP_DETECTIONS_OUTPUT_FILE, POSITIVE_TESTS_OUTPUT_FILE
2121
)
2222
from delphi.epidata.acquisition.rvdss.utils import (
2323
abbreviate_virus, abbreviate_geo, create_geo_types, check_date_format,
@@ -548,6 +548,10 @@ def fetch_one_season_from_report(url):
548548
"count": all_number_tables,
549549
}
550550

551+
def fetch_archived_dashboard_urls():
    """Return the list of archived dashboard URLs (not yet implemented).

    Currently a stub that returns None.
    ## TODO: paste in Christine's code for scraping this list
    ## https://health-infobase.canada.ca/respiratory-virus-detections/archive.html
    """
    pass
554+
551555
def fetch_report_data():
552556
# Scrape each season.
553557
dict_list = [fetch_one_season_from_report(url) for url in HISTORIC_SEASON_URLS]
@@ -560,10 +564,6 @@ def fetch_historical_dashboard_data():
560564
dict_list = [{} for url in included_urls]
561565

562566
for i, base_url in enumerate(included_urls):
563-
# Get weekly dashboard data
564-
## TODO: what to do with this "2023"? Need to parse the start year of the season from the URL
565-
## TODO: how to "weekly" and "positive" correspond to the dict keys from historical reports?
566-
dict_list[i]["weekly"] = get_weekly_data(base_url,2023).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
567-
dict_list[i]["positive"] = get_revised_data(base_url)
567+
dict_list[i]["weekly"] = fetch_dashboard_data(url, 2023)
568568

569569
return dict_list

src/acquisition/rvdss/run.py

Lines changed: 54 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,45 +1,72 @@
1+
"""
2+
Defines command line interface for the rvdss indicator. Current data (covering the most recent epiweek) and historical data (covering all data before the most recent epiweek) can be generated together or separately.
3+
4+
Defines top-level functions to fetch data and save to disk or DB.
5+
"""
6+
17
import pandas as pd
8+
import os
9+
10+
from delphi.epidata.acquisition.rvdss.utils import get_weekly_data, get_revised_data, get_dashboard_update_date
11+
from delphi.epidata.acquisition.rvdss.constants import DASHBOARD_BASE_URL, RESP_DETECTIONS_OUTPUT_FILE, POSITIVE_TESTS_OUTPUT_FILE, COUNTS_OUTPUT_FILE
12+
13+
14+
def update_current_data():
    """Fetch the current season's dashboard data and write/merge it to disk.

    Pulls the most recent weekly tables from the live dashboard, then for each
    table type either creates its CSV under ``base_path`` or appends the new
    rows to the existing CSV (skipping rows whose index is already present).
    """
    ## TODO: what is the base path for these files?
    base_path = "."

    # NOTE(review): the 2024 start-year is hard-coded here, like the dashboard
    # URL — see the TODO on get_weekly_data about deriving it instead.
    data_dict = fetch_dashboard_data(DASHBOARD_BASE_URL, 2024)

    table_types = {
        "respiratory_detection": RESP_DETECTIONS_OUTPUT_FILE,
        "positive": POSITIVE_TESTS_OUTPUT_FILE,
        # "count": COUNTS_OUTPUT_FILE, # Dashboards don't contain this data.
    }
    for tt, filename in table_types.items():
        # BUGFIX: fetch this table type's DataFrame. The original indexed with
        # the whole `table_types` dict (`data_dict[table_types]`), which raises
        # TypeError (unhashable) at runtime.
        data = data_dict[tt]

        # Write the tables to separate csvs
        path = base_path + "/" + filename

        # Since this function generates new data weekly, we need to combine it
        # with the existing data, if it exists.
        if not os.path.exists(path):
            data.to_csv(path, index=True)
        else:
            old_data = pd.read_csv(path).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])

            # If index already exists in the data on disk, don't add the new
            # data -- we may have already run the weekly data fetch.
            ## TODO: The check on index maybe should be stricter? Although we
            ## do deduplication upstream, so this probably won't find true duplicates
            if not data.index.isin(old_data.index).any():
                old_data = pd.concat([old_data, data], axis=0)
                old_data.to_csv(path, index=True)

    # ## TODO
    # update_database(data)
1445

15-
def update_current_data(start_date, end_date):
16-
data = fetch_current_dashboard_data()
17-
update_database(data)
1846

1947
def update_historical_data():
    """Fetch all historical data and write one CSV per table type.

    Combines per-season tables scraped from the historical reports with tables
    from the archived dashboards, concatenates them per table type, and writes
    each result under ``base_path``.
    """
    ## TODO: what is the base path for these files?
    base_path = "."

    report_dict_list = fetch_report_data()
    dashboard_dict_list = fetch_historical_dashboard_data()

    table_types = {
        "respiratory_detection": RESP_DETECTIONS_OUTPUT_FILE,
        "positive": POSITIVE_TESTS_OUTPUT_FILE,
        "count": COUNTS_OUTPUT_FILE,
    }
    for tt, filename in table_types.items():
        # Merge tables together from dashboards and reports for each table type.
        dashboard_data = [elem.get(tt, None) for elem in dashboard_dict_list]
        report_data = [elem.get(tt, None) for elem in report_dict_list]

        # BUGFIX: plain Python lists have no .concat() method, so the original
        # `[report_data, dashboard_data].concat()` raised AttributeError.
        # Flatten both lists, drop sources that don't provide this table type
        # (e.g. dashboards have no "count" data), and concatenate with pandas.
        frames = [df for df in report_data + dashboard_data if df is not None]
        if not frames:
            # No source produced this table type; nothing to write.
            continue
        data = pd.concat(frames)

        # Write the tables to separate csvs
        data.to_csv(base_path + "/" + filename, index=True)

    # ## TODO
    # update_database(data)
4370

4471

4572
def main():

src/acquisition/rvdss/utils.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ def get_revised_data(base_url,headers,update_date):
9494

9595
return(df)
9696

97+
## TODO: the `start_year` arg is making calling this complicated. If we know that LAST_WEEK_OF_YEAR (really, of the season) is always 35, then we should be able to derive `start_year` from `update_date`.
9798
def get_weekly_data(base_url,start_year,headers,update_date):
9899
# Get current week and year
99100
summary_url = base_url + "RVD_SummaryText.csv"
@@ -137,4 +138,22 @@ def get_weekly_data(base_url,start_year,headers,update_date):
137138
# if df_weekly.columns.isin(["weekorder","date","week"]).all():
138139
# df_weekly=df_weekly.drop(["weekorder","date","week"],axis=1)
139140

140-
return(df_weekly)
141+
return(df_weekly.set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value']))
142+
143+
def fetch_dashboard_data(url, start_year):
    """Get data from current or archived dashboard"""
    # Present a browser User-Agent so the dashboard endpoints serve us.
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
    }

    issue_date = get_dashboard_update_date(url, browser_headers)

    ## TODO: how to "weekly" and "positive" correspond to the dict keys ("respiratory_detection", "positive", "count") from historical reports? Need to make sure keys used here are from the same set.
    tables = {}
    tables["respiratory_detection"] = get_weekly_data(url, start_year, browser_headers, issue_date)  ## TODO: ?
    tables["positive"] = get_revised_data(url, browser_headers, issue_date)
    # "count" key omitted: dashboards don't contain this data.
    return tables

0 commit comments

Comments
 (0)