-
Notifications
You must be signed in to change notification settings - Fork 15
/
prep CACI vulnerability data.r
246 lines (183 loc) · 8.38 KB
/
prep CACI vulnerability data.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
##
## Prepare/analyse data on vulnerabilities provided by CACI (caci.co.uk) - we can't share the underlying data publicly, though
##
library(tidyverse)
library(readxl)
library(janitor)
library(GGally)
source("functions.r")
data_dir = "C:/Users/040026704/Documents/Data science/Data/CACI"
##
## load data (that we can't share publicly)
##
# people living alone
single_occ_houses = read_csv(file.path(data_dir, "CCI_British Red Cross - UK Postcodes with Counts of Single Occupancy Households - 2020.csv"), col_types = "ci")
single_occ_houses = single_occ_houses %>% filter(!is.na(`Household_size_:_1_person`))
# vulnerability indicators - deciles
vuln = read_csv(file.path(data_dir, "CCI_British Red Cross - UK Postcodes with Vunerability Indicators.csv"))
# vulnerability indicators - raw scores
vuln_raw = read_csv(file.path(data_dir, "vul_zscores.csv"))
wellbeing = read_csv(file.path(data_dir, "CCI_British Red Cross - Wellbeing Acorn Directory UK Formatted.csv"), col_types = "clliic")
##
## load data (that can be shared publicly)
##
postcodes = read_csv("data/postcodes/Data/ONSPD_FEB_2020_UK.csv")
# the ONS data truncates 7-character postcodes to remove spaces (e.g. CM99 1AB --> CM991AB); get rid of all spaces in both datasets to allow merging
postcodes$Postcode2 = gsub(" ", "", postcodes$pcd)
single_occ_houses$Postcode2 = gsub(" ", "", single_occ_houses$Postcode)
vuln$Postcode2 = gsub(" ", "", vuln$Postcode)
vuln_raw$Postcode2 = gsub(" ", "", vuln_raw$postcode)
wellbeing$Postcode2 = gsub(" ", "", wellbeing$Postcode)
# make postcode to LSOA/MSOA lookup tables
pc_msoa = postcodes %>% select(Postcode2, msoa11)
pc_lsoa = postcodes %>% select(Postcode2, lsoa11)
pc_ward = postcodes %>% select(Postcode2, osward)
###############################################################################
## Living alone
##
##
## calculate numbers of people living alone in MSOAs
##
living_alone_lsoa = single_occ_houses %>%
left_join(pc_lsoa, by="Postcode2") %>%
rename(LSOA11CD = lsoa11) %>%
group_by(LSOA11CD) %>%
summarise(`No. people living alone` = sum(`Household_size_:_1_person`, na.rm = TRUE))
living_alone_msoa = single_occ_houses %>%
left_join(pc_msoa, by="Postcode2") %>%
rename(MSOA11CD = msoa11) %>%
group_by(MSOA11CD) %>%
summarise(`No. people living alone` = sum(`Household_size_:_1_person`, na.rm = TRUE))
living_alone_ward = single_occ_houses %>%
left_join(pc_ward, by="Postcode2") %>%
rename(WD20CD = osward) %>%
group_by(WD20CD) %>%
summarise(`No. people living alone` = sum(`Household_size_:_1_person`, na.rm = TRUE))
write_csv(living_alone_lsoa, "data/CACI/living-alone-lsoa.csv")
write_csv(living_alone_msoa, "data/CACI/living-alone-msoa.csv")
write_csv(living_alone_ward, "data/CACI/living-alone-ward.csv")
###############################################################################
## Digital vulnerability
##
##
## Aggregate postcode-level vulnerability into LSOAs and MSOAs by calculating the 20% most vulnerable postcodes in each LSOA/MSOA based on the deciles
##
vuln_msoa = vuln %>%
left_join(pc_msoa, by="Postcode2") %>%
rename(MSOA11CD = msoa11) %>%
# run this code to see what the distributions of deciles within some random MSOAs look like
# filter(MSOA11CD %in% sample(pc_msoa$msoa11, 4)) %>%
# ggplot(aes(x = `Digital_combined_-_Decile`)) +
# geom_histogram(binwidth = 1) +
# facet_wrap(~ MSOA11CD) %>%
# group_by(MSOA11CD) %>%
# summarise_if(is.numeric, median, na.rm = TRUE) %>%
mutate(Top20 = ifelse(`Digital_combined_-_Decile` <= 2, "Top20", "Other")) %>%
tabyl(MSOA11CD, Top20) %>%
mutate(`Proportion of postcodes in 20% most digitally excluded` = Top20 / (Top20 + Other))
# do the same for LSOAs
vuln_lsoa = vuln %>%
left_join(pc_lsoa, by="Postcode2") %>%
rename(LSOA11CD = lsoa11) %>%
# group_by(LSOA11CD) %>%
# summarise_if(is.numeric, median, na.rm = TRUE) %>%
mutate(Top20 = ifelse(`Digital_combined_-_Decile` <= 2, "Top20", "Other")) %>%
tabyl(LSOA11CD, Top20) %>%
mutate(`Proportion of postcodes in 20% most digitally excluded` = Top20 / (Top20 + Other))
# save
write_csv(vuln_msoa, "data/CACI/digital-exclusion-proportions-msoa.csv")
write_csv(vuln_lsoa, "data/CACI/digital-exclusion-proportions-lsoa.csv")
# write_csv(vuln_lsoa, "data/CACI/vulnerability-lsoa.csv")
# write_csv(vuln_msoa, "data/CACI/vulnerability-msoa.csv")
##
## Aggregate postcode-level digital vulnerability into LSOAs and MSOAs by taking the mean scores withing MSOAs for each indicator then combining indicators
##
# run this code to see what the distributions of digital vulnerabilities within some random MSOAs look like
# vuln_msoa = vuln_raw %>%
# left_join(pc_msoa, by="Postcode2") %>%
# rename(MSOA11CD = msoa11) %>%
# filter(MSOA11CD %in% sample(pc_msoa$msoa11, 1))
# pairs(~ vul_dig_buyonl + vul_dig_manca_net + vul_dig_netusr + vul_dig_mobnon + vul_dig_brdbnd + vul_dig_confuse, data = vuln_msoa)
# hist(vuln_msoa$vul_dig_netusr)
vuln_msoa = vuln_raw %>%
left_join(pc_msoa, by="Postcode2") %>%
select(MSOA11CD = msoa11, contains("_dig_")) %>%
group_by(MSOA11CD) %>%
summarise_if(is.numeric, mean, na.rm = TRUE) %>%
# Use MLFA model to combine indicators
weighted_domain_scores(model = "MLFA",
domain = "Digital") %>%
select(MSOA11CD, contains("Vulnerability"))
write_csv(vuln_msoa, "data/CACI/digital-exclusion-msoa.csv")
# do the same for LSOAs
vuln_lsoa = vuln_raw %>%
left_join(pc_lsoa, by="Postcode2") %>%
select(LSOA11CD = lsoa11, contains("_dig_")) %>%
group_by(LSOA11CD) %>%
summarise_if(is.numeric, mean, na.rm = TRUE) %>%
# Use MLFA model to combine indicators
weighted_domain_scores(model = "MLFA",
domain = "Digital") %>%
select(LSOA11CD, contains("Vulnerability"))
write_csv(vuln_lsoa, "data/CACI/digital-exclusion-lsoa.csv")
# and for wards
vuln_ward = vuln_raw %>%
left_join(pc_ward, by="Postcode2") %>%
select(WD20CD = osward, contains("_dig_")) %>%
group_by(WD20CD) %>%
summarise_if(is.numeric, mean, na.rm = TRUE) %>%
# Use MLFA model to combine indicators
weighted_domain_scores(model = "MLFA",
domain = "Digital") %>%
select(WD20CD, contains("Vulnerability"))
write_csv(vuln_ward, "data/CACI/digital-exclusion-ward.csv")
###############################################################################
## Financial vulnerability
##
##
## Aggregate postcode-level digital vulnerability into LSOAs and MSOAs by taking the mean scores withing MSOAs for each indicator then combining indicators
##
# run this code to see what the distributions of digital vulnerabilities within some random MSOAs look like
vuln_msoa = vuln_raw %>%
left_join(pc_msoa, by="Postcode2") %>%
rename(MSOA11CD = msoa11) %>%
filter(MSOA11CD %in% sample(pc_msoa$msoa11, 1)) %>%
select(contains("_fin_"))
ggpairs(vuln_msoa)
fin_inds = c("vul_fin_cur_std", "vul_fin_debt", "vul_fin_dep", "vul_fin_eqinc", "vul_fin_loan", "vul_fin_sav") # which indicators to use
# get data for MSOAs
vuln_msoa = vuln_raw %>%
left_join(pc_msoa, by="Postcode2") %>%
select(MSOA11CD = msoa11, !!fin_inds) %>%
group_by(MSOA11CD) %>%
summarise_if(is.numeric, mean, na.rm = TRUE) %>%
# Use MLFA model to combine indicators
weighted_domain_scores(model = "MLFA",
domain = "Financial") %>%
select(MSOA11CD, contains("Vulnerability"))
write_csv(vuln_msoa, "data/CACI/financial-msoa.csv")
# do the same for LSOAs
vuln_lsoa = vuln_raw %>%
left_join(pc_lsoa, by="Postcode2") %>%
select(LSOA11CD = lsoa11, !!fin_inds) %>%
group_by(LSOA11CD) %>%
summarise_if(is.numeric, mean, na.rm = TRUE) %>%
# Use MLFA model to combine indicators
weighted_domain_scores(model = "MLFA",
domain = "Financial") %>%
select(LSOA11CD, contains("Vulnerability"))
write_csv(vuln_lsoa, "data/CACI/financial-lsoa.csv")
# and for wards
vuln_ward = vuln_raw %>%
left_join(pc_ward, by="Postcode2") %>%
select(WD20CD = osward, !!fin_inds) %>%
group_by(WD20CD) %>%
summarise_if(is.numeric, mean, na.rm = TRUE) %>%
# Use MLFA model to combine indicators
weighted_domain_scores(model = "MLFA",
domain = "Financial") %>%
select(WD20CD, contains("Vulnerability"))
write_csv(vuln_ward, "data/CACI/financial-ward.csv")
# check countries we have data for
# unique(str_sub(vuln_msoa$MSOA11CD, 1, 1))
# unique(str_sub(vuln_lsoa$LSOA11CD, 1, 1))