-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse_speeches.r
72 lines (62 loc) · 1.9 KB
/
parse_speeches.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
suppressMessages(library(tidyverse))
suppressMessages(library(rvest))
suppressMessages(library(parallel))
set.seed(1)
files <- dir('./data/',full.names=TRUE)
#files <- files[sample(1:length(files),size=1e4)]
cores <- detectCores()
#returns the xml data without the header
get_xscript <- function(x){
x %>%
read_file %>%
read_xml %>%
xml_children %>%
.[2]
}
talkers <- mclapply(mc.cores = cores,files,function(x){tryCatch({get_xscript(x) %>%
xml_find_all('//name') %>%
xml_text},error=function(e)NA_character_)}
)
speeches <- mclapply(mc.cores = cores,files,function(x){
tryCatch({speeches <- x %>%
get_xscript %>%
xml_find_all('//body') %>%
xml_children
map(speeches,xml_text) %>%
keep(~.x!='')
},error=function(e)NA_character_)
}
)
names(speeches) <- files
speeches <- speeches[map_lgl(talkers,~length(.x)>0)]
parliamentarian_name_regex <- '^(Reverend the Hon.|Mr|Ms|Mrs|The Hon.|The Hon. Dr|Dr|The) [A-Za-z\\.]+.*:'
process_fragment <- function(x){
topic <- x[[1]]
x[[1]] <- NULL
statements <- list()
i <- 1
current_speaker <- NA
for (paragraph in x){
paragraph <- str_trim(paragraph) # some start with multiple whitespaces
speaker <- str_extract(paragraph,parliamentarian_name_regex)
if (is.na(speaker)){
speaker <- current_speaker
if (length(statements)==i){
statements[[i]] <- paste(statements[[i]],paragraph,sep=rep('\n',100))
} else {
statements[[i]] <- paragraph
names(statements)[i] <- speaker
}
} else {
i <- i+1
statements[[i]] <- paragraph
names(statements)[i] <- speaker
}
current_speaker <- speaker
}
statements <- keep(statements,~!is.null(.x))
tibble(topic = rep(topic,length(statements)),speaker=names(statements),statement=unlist(statements))
}
speeches <- mclapply(mc.cores=cores,speeches,process_fragment) %>%
bind_rows %>%
feather::write_feather('speeches.feather')