# Packages assumed to be attached by the calling script; this minimal set is based on
# the functions used below (classInt and janitor are called with `::` where needed)
library(dplyr)
library(purrr)
library(tibble)
library(stringr)
#' Add two numbers, treating NA as zero
#' If both numbers are zero or NA, return zero
#'
#' @param x First number
#' @param y Second number
#'
#' @examples
#' 1 %++% 1 # == 2
#' 1 %++% NA # == 1
`%++%` = function(x, y) {
  if ( (is.na(x) | x == 0) & (is.na(y) | y == 0) ) {
    0
  } else {
    ifelse(is.na(x), 0, x) + ifelse(is.na(y), 0, y)
  }
}
#' Calculate risk quantiles
#'
#' @param risk.col The data to quantise
#' @param quants Number of quantiles (default: 5)
#' @param highest.quantile.is.worst Should a risk score of 1 represent the highest/worst number in the data (FALSE) or the lowest/best (TRUE)?
#' @param style Method to use for calculating quantiles (passed to classIntervals; default: "fisher"). One of "fixed", "sd", "equal", "pretty", "quantile", "kmeans", "hclust", "bclust", "fisher", "jenks" or "dpih"
#' @param samp_prop The proportion of samples to use, if slicing using "fisher" or "jenks" (passed to classIntervals; default: 100%)
#' @return A vector containing the risk quantiles
#'
calc_risk_quantiles = function(risk.col, quants = 5, highest.quantile.is.worst = TRUE, style = "fisher", samp_prop = 1) {
  if (length(unique(risk.col)) > 1) { # only calculate quantiles if there's more than one unique value
    # calculate the quantile breaks
    q_breaks = classInt::classIntervals(risk.col, quants, style = style, samp_prop = samp_prop, largeN = length(risk.col))
    q = as.integer(cut(risk.col, breaks = q_breaks$brks, include.lowest = TRUE)) # create a column with the risk quantiles as a number (e.g. from 1 to 5, if using quintiles)
    if (!highest.quantile.is.worst) {
      max_quant = max(q, na.rm = TRUE) # get the max. quantile in the dataset (won't always be equal to `quants`, e.g. if nrows(d) < quants)
      q = (max_quant + 1) - q # reverse the quantile scoring so 1 = highest risk
    }
    q # return the quantiles
  } else {
    1 # all values are identical, so everything falls into the first quantile
  }
}
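# A minimal usage sketch with hypothetical data (`risks` is not defined elsewhere in
# this file); by default the highest quintile (5) is the worst:
# risks = runif(100)
# calc_risk_quantiles(risks, quants = 5)
# calc_risk_quantiles(risks, quants = 10, highest.quantile.is.worst = FALSE) # 1 = highest risk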
#' Invert deciles
#' @param x Deciles
#'
reverse_decile = function(x) 11 - x
#' Invert deciles, ranks, percentiles etc.
#' e.g. with deciles, a score of 10 --> 1 and a score of 1 --> 10
#' @param x Vector of data to invert
invert_this = function(x) (max(x, na.rm = TRUE) + 1) - x
# Wrap long ggplot2 titles
ggplot_title_wrapper <- function(x, ...) {
  paste(strwrap(x, ...), collapse = "\n")
}
#########################################################################################################
## functions for calculating the vulnerability index, in the style of the Index of Multiple Deprivation
##
#' Transform data to an exponential distribution
#' using the exponential transformation function in the Welsh IMD technical report - see Appendix A: https://gov.wales/sites/default/files/statistics-and-research/2020-02/welsh-index-multiple-deprivation-2019-technical-report.pdf
#'
#' @param x The data to transform (a scaled rank between 0 and 1)
#'
exp_transform = function(x) -23 * log(1 - x * (1 - exp(-100/23)))
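# A quick sanity check of the transformation's range (assuming x is a scaled rank in [0, 1]):
# exp_transform(0) # == 0 (least vulnerable)
# exp_transform(1) # == 100 (most vulnerable); the exponential stretch spreads out the worst scores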
#' Normalise ranks to a range between 0 and 1
#'
#' @param x Vector of ranks
#'
scale_ranks = function(x) (x - 1) / (length(x) - 1)
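# For example (hypothetical input; assumes x contains the complete ranks 1:length(x)):
# scale_ranks(1:5) # == 0.00 0.25 0.50 0.75 1.00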
#' Rank indicators but put NAs first (i.e. least-worst)
#' @param x Data to rank
#'
rank2 = function(x) rank(x, na.last = FALSE)
#' Inverse ranking with NAs first (i.e. 1 = worst)
#' @param x Data to rank
#'
inverse_rank = function(x) (length(x) + 1) - rank(x, na.last = FALSE)
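# A small illustration of the two ranking helpers (hypothetical input):
# rank2(c(3, NA, 1)) # == c(3, 1, 2): the NA gets the least-worst rank, 1
# inverse_rank(c(3, NA, 1)) # == c(1, 3, 2): the highest value gets rank 1 (worst)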
#' Calculate domain scores, ranks and deciles
#' This function will calculate over all numeric variables in a dataframe
#'
#' @param d Dataframe containing underlying indicators
#' @param domain Name of the vulnerability domain - if not blank, prepend this string to the vulnerability score/rank/decile columns
#' @param rank.indicators Rank the underlying indicators before calculating scores etc.?
#' @param keep.interim.indicators If TRUE, keep the scores and ranks calculated for each underlying indicator (default: FALSE)
#' @param bespoke.domains If TRUE, ignore the four weight parameters (below) and just add equally across all the variables passed into this function
#' @param clinical_weight Weight (between 0 and 1, inclusive) to assign to clinical vulnerability domain
#' @param health_weight Weight (between 0 and 1, inclusive) to assign to health/wellbeing vulnerability domain
#' @param economic_weight Weight (between 0 and 1, inclusive) to assign to economic vulnerability domain
#' @param social_weight Weight (between 0 and 1, inclusive) to assign to social vulnerability domain
#'
calc_domain_scores = function(d,
                              domain = NULL,
                              rank.indicators = TRUE,
                              keep.interim.indicators = FALSE,
                              bespoke.domains = FALSE,
                              clinical_weight = 0.25,
                              health_weight = 0.25,
                              economic_weight = 0.25,
                              social_weight = 0.25) {
  # Check that the weights sum to 1
  # (not all weights need to be supplied - unsupplied weights keep their 0.25 defaults)
  if (sum(clinical_weight,
          health_weight,
          economic_weight,
          social_weight) != 1) {
    stop("The supplied weights don't sum to 1!")
  }
  if (rank.indicators) {
    d = d %>%
      mutate_if(is.numeric, list(rank = rank2)) # convert indicators to ranks
  }
  d = d %>%
    # normalise the ranks so they're between 0 and 1
    mutate_at(vars(ends_with("rank")), list(scaled = scale_ranks)) %>%
    # transform to an exponential distribution
    mutate_at(vars(ends_with("scaled")), exp_transform)
  # calculate overall domain score
  if (bespoke.domains) {
    # combine with equal weights to get domain score
    d = d %>%
      mutate(`Vulnerability score` = reduce(select(., ends_with("scaled")), `+`)) # source: https://stackoverflow.com/a/54527609
  } else {
    d = d %>%
      # Add row-wise grouping to sum weighted ranks across rows
      rowwise() %>%
      # combine with the supplied weights to get domain score
      mutate(`Vulnerability score` = sum(`Clinical Vulnerability rank_scaled` * clinical_weight,
                                         `Health/Wellbeing Vulnerability rank_scaled` * health_weight,
                                         `Economic Vulnerability rank_scaled` * economic_weight,
                                         `Social Vulnerability rank_scaled` * social_weight)) %>%
      # Remove row-wise grouping
      ungroup()
  }
  # calculate domain ranks and deciles
  d = d %>%
    mutate(`Vulnerability rank` = rank(`Vulnerability score`)) %>%
    mutate(`Vulnerability decile` = calc_risk_quantiles(`Vulnerability rank`, quants = 10))
  if (is.character(domain))
    names(d)[ grepl("^Vulnerability", names(d)) ] = paste(domain, grep("^Vulnerability", names(d), value = T), sep = " ")
  if (!keep.interim.indicators)
    # keep only original indicators and domain vulnerability score/rank/decile
    d = d %>% select(-ends_with("_rank"), -ends_with("scaled"))
  d
}
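# A minimal usage sketch with hypothetical indicator columns. With bespoke.domains = TRUE
# every numeric column is ranked, scaled, exponentially transformed and summed equally:
# d = tibble(`Unemployment rate` = runif(100), `Overcrowded households` = runif(100))
# calc_domain_scores(d, domain = "Economic", bespoke.domains = TRUE)
# The default (bespoke.domains = FALSE) instead expects columns that yield
# `Clinical Vulnerability rank_scaled` etc. after ranking/scaling.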
# ----- Domain Scores (Weighted Indicators) -----
# Calculate vulnerability domain scores with weighted indicators
# Methodology
# 1. Rank each indicator, then scale to mean = 0, SD = 1
# 2. Perform either PCA or MLFA and extract weights for that domain
# (use model argument to specify)
# 3. Multiply model weights by respective column to get weighted indicators
# 4. Sum weighted indicators
# 5. Rank and quantise into deciles
# Additional Libraries Required
library(broom)
# Helper that standardises a vector to mean = 0 and SD = 1
standardised = function(x) (x - mean(x)) / sd(x)
# Create weighted domain function
weighted_domain_scores <- function(d,
                                   model = c("PCA", "MLFA"),
                                   domain = NULL,
                                   keep.interim.indicators = FALSE) {
  # Rank and normalise indicators to mean 0, SD 1
  d <- d %>%
    mutate_if(is.numeric, list(scaled = function(x) standardised(rank2(x))))
  # Evaluate model choice
  model <- match.arg(model)
  # Extract weights
  if (model == "PCA") {
    d_weights <- d %>%
      select(ends_with("_scaled")) %>%
      prcomp(center = FALSE, scale = FALSE) %>%
      pluck("rotation") %>%
      as.data.frame() %>%
      rownames_to_column(var = "variable") %>%
      as_tibble() %>%
      select(variable, weights = PC1) %>%
      mutate(weights = weights^2)
  } else {
    d_weights <- d %>%
      select(ends_with("_scaled")) %>%
      factanal(factors = 1) %>%
      tidy() %>%
      select(-uniqueness, weights = fl1) %>%
      mutate(weights = abs(weights),
             weights = weights / sum(weights))
  }
  # Multiply model weights by respective column to get weighted indicators
  d_weighted_ind <- d %>%
    select(all_of(d_weights$variable)) %>%
    map2_dfc(d_weights$weights, `*`) %>%
    select_all(list(~ str_remove(., "_scaled"))) %>%
    select_all(list(~ str_c(., "_weighted")))
  # Combine weighted indicators with original data
  d <- bind_cols(d, d_weighted_ind)
  # Sum weighted indicators
  d <- d %>%
    mutate(`Vulnerability score` = reduce(select(., ends_with("_weighted")), `+`))
  # Rank and quantise into deciles
  d <- d %>%
    mutate(`Vulnerability rank` = rank(`Vulnerability score`)) %>%
    mutate(`Vulnerability decile` = calc_risk_quantiles(`Vulnerability rank`, quants = 10))
  # Add domain prefix if supplied
  if (is.character(domain))
    names(d)[ grepl("^Vulnerability", names(d)) ] = paste(domain, grep("^Vulnerability", names(d), value = T), sep = " ")
  # Keep/drop interim indicators
  if (!keep.interim.indicators)
    # keep only original indicators and domain vulnerability score/rank/decile
    d = d %>% select(-ends_with("_scaled"), -ends_with("_weighted"))
  # Return data
  return(d)
}
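# A minimal usage sketch with hypothetical data. PCA weights are the squared PC1
# loadings; MLFA weights are normalised absolute factor loadings (factanal needs more
# rows than indicators):
# d = tibble(a = rnorm(100), b = rnorm(100), c = rnorm(100))
# weighted_domain_scores(d, model = "PCA", domain = "Health/Wellbeing")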
# ----- Aggregate Vulnerability Index into higher geographies (e.g. Local Authorities) -----
#' Calculate the Extent and population-weighted average scores for small areas (MSOAs)
#'
#' "Extent" is the proportion of the local population that live in areas classified as among the most deprived in the higher geography.
#' To calculate this, we need to first calculate a weighted score based on the population in the most deprived 30% of areas
#'
#' @param d Dataframe containing Vulnerability Index ranks, scores and population estimates
#' @param domain Which Vulnerability domain to calculate population-weighted scores for
#' @param aggregate_by Name of the column to use for higher-level aggregation (passed in by `aggregate_scores`; currently only used by the commented-out grouping code below)
#' @param population_col Name of the column containing population estimates
#' @param max_rank Optionally set the highest rank of the domain/indicator (defaults to the maximum rank found in the data)
#'
pop_weighted_scores = function(d, domain = NULL, aggregate_by, population_col = "No. people", max_rank = NULL) {
  rank_col = ifelse(is.null(domain), "Vulnerability rank", paste0(domain, " Vulnerability rank"))
  score_col = ifelse(is.null(domain), "Vulnerability score", paste0(domain, " Vulnerability score"))
  if (is.null(max_rank))
    max_rank = max(d[[rank_col]])
  d %>%
    # group_by(!!sym(aggregate_by)) %>%
    # mutate(Percentile = round((!!sym(rank_col) / max_rank) * 100, 0)) %>%
    mutate(Percentile = ntile(!!sym(rank_col), 100)) %>%
    # invert percentiles because, in the Vulnerability Index, higher percentiles mean higher vulnerability - but the extent score calculation below assumes lower percentiles mean higher vulnerability
    mutate(Percentile = invert_this(Percentile)) %>%
    # calculate extent: a weighted score based on the population in the most deprived 30% of areas
    # from the IMD technical report Appendix N:
    # "The population living in the most deprived 11 to 30 per cent of Lower-layer Super Output Areas
    # receive a sliding weight, ranging from 0.95 for those in the most deprived eleventh percentile,
    # to 0.05 for those in the most deprived thirtieth percentile.
    # In practice this means that the weight starts from 0.95 in the most deprived eleventh percentile,
    # and then decreases by (0.95-0.05)/19 for each of the subsequent nineteen percentiles
    # until it reaches 0.05 for the most deprived thirtieth percentile, and zero for areas outside the most deprived 30 per cent"
    mutate(Extent = case_when(
      Percentile <= 10 ~ !!sym(population_col),
      Percentile == 11 ~ !!sym(population_col) * 0.95,
      Percentile > 11 & Percentile <= 30 ~ !!sym(population_col) * (0.95 - ((0.9/19) * (Percentile - 11))),
      TRUE ~ 0
    )) %>%
    # calculate population-weighted average scores
    mutate(Score = !!sym(score_col) * !!sym(population_col)) %>%
    ungroup()
}
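# The sliding weight implied by the case_when above, per (inverted) percentile - a
# worked check against the quoted Appendix N rule:
# percentile 1-10 -> weight 1, percentile 11 -> 0.95,
# percentile 20 -> 0.95 - (0.9/19) * 9 ~= 0.52, percentile 30 -> 0.05, percentile 31+ -> 0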
#' Aggregate Vulnerability Index into higher-level geographies, calculating:
#' - proportion of highly vulnerable areas
#' - extent (proportion of the local population that live in the most vulnerable areas)
#' - population-weighted average score
#'
#' @param d Dataframe containing Vulnerability Index ranks, scores and population estimates
#' @param domain Which Vulnerability domain to calculate population-weighted scores for
#' @param aggregate_by Name of the column to use for higher-level aggregation (e.g. "LAD19CD")
#' @param population_col Name of the column containing population estimates
#' @param max_rank Optionally set the highest rank of the domain/indicator (defaults to the maximum rank found in the data)
#'
aggregate_scores = function(d, domain = NULL, aggregate_by, population_col = "No. people", max_rank = NULL) {
  decile_col = ifelse(is.null(domain), "Vulnerability decile", paste0(domain, " Vulnerability decile"))
  # calculate proportions of highly vulnerable MSOAs in the higher-level geography
  d_props = d %>%
    # label MSOAs by whether they're in the top 20% most vulnerable, then summarise by this label
    mutate(Top20 = ifelse(!!sym(decile_col) >= 9, "Top20", "Other")) %>%
    janitor::tabyl(!!sym(aggregate_by), Top20) %>%
    # calculate proportion of most vulnerable areas
    mutate(Proportion = Top20 / (Top20 + Other)) %>%
    select(!!sym(aggregate_by), Proportion)
  # calculate population-weighted scores and extent for the higher-level geography
  d_scores = d %>%
    pop_weighted_scores(domain, aggregate_by, population_col = population_col, max_rank = max_rank) %>%
    group_by(!!sym(aggregate_by)) %>%
    summarise(Extent = sum(Extent) / sum(!!sym(population_col)),
              Score = sum(Score) / sum(!!sym(population_col)))
  # combine and return all aggregated measures
  left_join(d_props, d_scores, by = aggregate_by)
}
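# A minimal usage sketch (hypothetical data frame `vi` holding small-area ranks, scores,
# deciles and populations; "LAD19CD" and "No. people" are the column names this file's
# defaults and docs suggest):
# aggregate_scores(vi, aggregate_by = "LAD19CD", population_col = "No. people")
# aggregate_scores(vi, domain = "Clinical", aggregate_by = "LAD19CD")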