-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathAdams_Project_Data_Explained.R
189 lines (155 loc) · 6.61 KB
/
Adams_Project_Data_Explained.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
#Since I'm loading many of my data sources from the web, rather than
#files bundled with the app code, I decided it made sense to include this
#script. The code here loads in all required data sets and lists the
#transformations or modifications made to each.
library(tidyverse)
library(vroom)
# COVID-19 Case/Death Data --------------------------------------------
#Data Source: NY Times COVID-19 GitHub
covid <-
vroom(
'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv'
) %>%
# Pivot and rename the data
pivot_longer('cases':'deaths',
names_to = 'metric',
values_to = 'n')
#The `vroom` package from the tidyverse makes it possible to read in large
#files very quickly. From there, I apply the same transformations used in
#previous versions of this app; convert cases and deaths to `metric` and
#values to `n`. The other columns are "fips" (my primary key across
#different data sets), county name, and date.
# Public Health Department Social Media Data --------------------------
# Read in my project data from my project GitHub
tw_fb_data <-
vroom(
'https://raw.githubusercontent.com/aadams149/ppol683_fall2021_project/main/data/raw/counties_with_tweets.csv'
) %>%
#Change the fips column to character so it plays nice with the other data sets
mutate(fips =
as.character(fips)) %>%
#Add leading zeroes where necessary
mutate(
fips =
str_pad(fips,
5,
pad = '0'),
#Create a column with county names and state abbreviations together
#(This is just for aesthetic benefit later.)
place =
paste0(name, ", ", state_full),
#Create a column recording the presence of a facebook and/or twitter
#(I ran into some issues trying to get tmap/leaflet to play nice
#with multiple polygon sets, so this ended up being an adequate
#alternative solution)
socmed = case_when(
facebookYN == 1 & twitterYN == 1 ~ "Both Facebook and Twitter",
facebookYN == 0 &
twitterYN == 0 ~ "No Facebook or Twitter",
facebookYN == 1 &
twitterYN == 0 ~ "Facebook but no Twitter",
facebookYN == 0 &
twitterYN == 1 ~ "Twitter but no Facebook"
)
)
#Relevant columns:
# -fips: primary key
# -name: county name
# -state: state abbreviation
# -state_full: full state name
# -place: [name, state_full] (used for map labels)
# -twitterYN: Is there a twitter for this county's health department
# -facebookYN: Is there a facebook for the department
# -twitter_active_COVID: was twitter active post-declaration of COVID as a pandemic
# -twitter_active_last60: was twitter active w/in last 60 days (defined relative to 2021-12-01)
# (the most recent day I scraped the accounts)
# -date: the date of the most recent retrievable tweet (note: not all accounts had
# retrievable tweets)
# -tweet: the content of the tweet
# -link: the link to the tweet
# -likes_count/retweets_count/replies_count: # of that kind of engagement metric on the retrieved tweet
# -socmed: used for map labeling, constructed from twitterYN and facebookYN
# Race Data -----------------------------------------------------------
#Source: U.S. Census Bureau, 2020 U.S. Census
racedata <-
vroom('data/raw/county_race_data.csv') %>%
select(id,
Total_Hispanic:Total_Multiracial) %>%
mutate(fips =
str_sub(id,-5,-1)) %>%
select(!id)
#Relevant columns:
# -fips: primary key
# -Total_[Racial Group]: # of people identifying as that racial group in the
# county
#I divide each racial column by the population variable listed below to
#generate a proportion, which is the main racial attribute used in my code.
# Population Data -----------------------------------------------------
#Source: Provided in class/U.S. Census Bureau, 2020 U.S. Census
population_counties <-
vroom('data/raw/co-est2020.csv') %>%
#Exclude state total populations
filter(COUNTY != '000') %>%
#Rename population for convenience
rename('population' = POPESTIMATE2020) %>%
#Create a fips column for easy merging
mutate('fips' = paste0(STATE,COUNTY)) %>%
select(fips, STNAME, population)
#Relevant columns:
# -fips: primary key
# -STNAME: state name (made redundant by state_full above)
# -population: county population as of the 2020 census
# Vaccination Data ----------------------------------------------------
#Source: Centers for Disease Control and Prevention
#Read in vaccination data from CDC API
#I'm only incorporating the most recent vax numbers in this project
#This data set updates each day, same as the COVID-19 one from the NYTimes
#Rows are imported in reverse chronological order
#I use the system date to check if there are at least 3143 rows where the date
#is today (the number of counties in the US). If so, subset the data to
#rows from today. Otherwise, subtract 1 from the date and subset to that date
#(yesterday). This way, no matter when the app is run, it should have
#1 vaccine data observation for each county that is no more than 24 hours old
vax_data <-
vroom('https://data.cdc.gov/resource/8xkx-amqh.csv?$limit=7000')
today <- Sys.Date()
if (length(vax_data[vax_data$date == today,]) >= 3143){
vax_data = vax_data %>%
filter(date == today)
}else{
vax_data = vax_data %>%
filter(date == today-1)
}
vax_data <-
vax_data %>%
select(fips,
date,
series_complete_pop_pct,
administered_dose1_pop_pct
)
#Relevant columns:
# -fips: primary key
# -date: the date of the data
# -series_complete_pop_pct: the percent of the county which has received
# a full vaccination sequence
# -administered_dose1_pop_pct: the percent of the county which has received
# at least one COVID-19 vaccine dose
#There are other variables included in this data set which break down rates
#and proportions by different age groups. I'm not sure if I'll have time to
#fully incorporate that into my project, especially since I don't have
#age data for any other metrics.
#I'm considering implementing more than the most recent day's worth of data
#for this set of variables, but there are limitations to what I can
#retrieve using the API like this.
# Shapefile -----------------------------------------------------------
#Shapefile of U.S. counties. Original CRS is 5070 (NAD83)
us_counties <-
st_read('data/spatial/us_counties.shp') %>%
#Rename GEOID to fips for clarity
rename(fips = GEOID) %>%
#Transform to 4326 so it plays nice w/ tmap
st_transform(crs = 4326) %>%
#Maybe this left join is unnecessary.
left_join(population_counties,
by = 'fips')
#GEOID is equivalent to fips, and renaming it makes matching easier.