-
Notifications
You must be signed in to change notification settings - Fork 0
/
write_clauses_json.py
158 lines (78 loc) · 2.59 KB
/
write_clauses_json.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import json
import os
import sys

import spacy
def flag_terminals(ls, depth):
    """Wrap each bare element that has exactly ``depth`` children in a list.

    Elements that are already lists are passed through untouched, as are
    bare elements whose number of ``children`` differs from ``depth``.

    Args:
        ls: Sequence mixing bare elements (objects exposing an iterable
            ``children`` attribute) and lists of such elements.
        depth: Number of children a bare element must have to be wrapped.

    Returns:
        A new list of the same length as ``ls``; qualifying bare elements
        are replaced by single-element lists containing them.
    """

    def _flag(item):
        if isinstance(item, list):
            return item
        # ``children`` may be a one-shot iterable, so materialize it to count.
        if len(list(item.children)) == depth:
            return [item]
        return item

    return [_flag(item) for item in ls]
def words_are_siblings(word1, word2):
    """Report whether one of the two words is the ``head`` of the other.

    Despite the name, this does not test for a shared parent: it is true
    when ``word1`` equals ``word2.head`` or ``word2`` equals ``word1.head``.

    Args:
        word1: Object exposing a ``head`` attribute.
        word2: Object exposing a ``head`` attribute.

    Returns:
        The result of the first comparison if truthy, otherwise the result
        of the second.
    """
    first_is_head_of_second = word1 == word2.head
    return first_is_head_of_second or word2 == word1.head
def lists_are_siblings(ls1, ls2):
    """Report whether any word of ``ls1`` is directly linked to any of ``ls2``.

    Every pair ``(a, b)`` with ``a`` from ``ls1`` and ``b`` from ``ls2`` is
    checked with ``words_are_siblings``; the search stops at the first hit.

    Args:
        ls1: Iterable of words.
        ls2: Iterable of words (re-iterated once per element of ``ls1``).

    Returns:
        True if at least one pair satisfies ``words_are_siblings``, else
        False (including when either input is empty).
    """
    return any(
        words_are_siblings(left, right)
        for left in ls1
        for right in ls2
    )
def group_by_siblings(ls, size):
    """Merge runs of adjacent, related sublists without exceeding ``size``.

    Walks ``ls`` left to right. A sublist is appended to the group currently
    being built when the previous output item is a group, the two are related
    according to ``lists_are_siblings`` and their combined length is at most
    ``size``. Otherwise the sublist starts a new group. Any element that is
    not a list is copied through unchanged and ends the current group.

    Unlike a delete-while-iterating approach, neither ``ls`` nor the sublists
    it contains are modified; the returned groups are fresh lists.

    Args:
        ls: Sequence mixing bare elements and lists of words.
        size: Maximum number of words allowed in a merged group.

    Returns:
        A new list with the merged groups and the untouched bare elements,
        in their original order.
    """
    output = []
    # True only while the last item in ``output`` is a group that the next
    # sublist is allowed to join (i.e. nothing bare came in between).
    can_extend = False
    for chunk in ls:
        if not isinstance(chunk, list):
            output.append(chunk)
            can_extend = False
            continue
        if (
            can_extend
            and lists_are_siblings(output[-1], chunk)
            and len(output[-1]) + len(chunk) <= size
        ):
            output[-1].extend(chunk)
        else:
            # Copy so later merges never write into the caller's sublist.
            output.append(list(chunk))
            can_extend = True
    return output
def is_list_of_lists(ls):
    """Tell whether every element of ``ls`` is a ``list`` instance.

    Args:
        ls: Iterable to inspect.

    Returns:
        True when all elements are lists (vacuously True for an empty
        input), False as soon as a non-list element is found.
    """
    return all(isinstance(item, list) for item in ls)
def pair_wise_conglomerate(chunks, size):
    """Greedily join neighbouring chunks while the result fits in ``size``.

    Walks ``chunks`` left to right. A chunk is appended to the group being
    built when the combined length is at most ``size``; otherwise it starts
    a new group. A chunk longer than ``size`` is kept as its own group.

    Neither ``chunks`` nor its sublists are modified; the returned groups
    are fresh lists.

    Args:
        chunks: Sequence of lists.
        size: Maximum length of a merged group.

    Returns:
        A new list of merged groups, preserving the original element order.
    """
    merged = []
    for chunk in chunks:
        if merged and len(merged[-1]) + len(chunk) <= size:
            merged[-1].extend(chunk)
        else:
            # Copy so later merges never write into the caller's sublist.
            merged.append(list(chunk))
    return merged
def convert_chunks_to_segmented_strings(chunks, sentence):
    """Turn each chunk of words into the text it spans in the parent document.

    For every chunk, the ``i`` index of its first and last word is used to
    slice the document that ``sentence`` belongs to (``sentence.doc``), and
    the ``text`` of that slice is collected. The document is taken from the
    ``sentence`` argument rather than from a module-level global, so the
    function works for any sentence passed in.

    Args:
        chunks: Sequence of non-empty lists of words; each word exposes an
            integer ``i`` (presumably its position in the parent document,
            as with spaCy tokens -- confirm against the caller).
        sentence: Object exposing ``doc``, the sliceable parent document.

    Returns:
        A list with one text segment per chunk, in order.

    Raises:
        IndexError: If a chunk is empty.
    """
    parent_doc = sentence.doc
    return [
        # The slice end is exclusive, hence the +1 to include the last word.
        parent_doc[chunk[0].i:chunk[-1].i + 1].text
        for chunk in chunks
    ]
def return_clausal_segmentation(sentence, size):
    """Split ``sentence`` into text segments of at most ``size`` words each.

    The words start out ungrouped. In successive rounds, words with 0, 1,
    2, ... children are wrapped into single-word groups and adjacent related
    groups are merged, until every word belongs to a group. A final pass
    joins neighbouring groups whenever the result still fits in ``size``,
    and the groups are converted to text.

    Args:
        sentence: Iterable of words that is also accepted by
            ``convert_chunks_to_segmented_strings``.
        size: Maximum number of words per merged group.

    Returns:
        The list of segment strings, in sentence order.
    """
    groups = list(sentence)
    # TODO: punctuation should be accumulated before any of the grouping below.
    child_count = 0
    while True:
        if is_list_of_lists(groups):
            break
        flagged = flag_terminals(groups, child_count)
        groups = group_by_siblings(flagged, size)
        child_count += 1
    packed = pair_wise_conglomerate(groups, size)
    return convert_chunks_to_segmented_strings(packed, sentence)
# Script body: read the text file named on the command line, segment every
# sentence into clauses of at most five words, and write the result as JSON
# next to the input file.

# Fail early with a usage hint instead of a bare IndexError.
if len(sys.argv) < 2:
    sys.exit(f"usage: {sys.argv[0]} <text-file>")
fn = sys.argv[1]

nlp = spacy.load("de_core_news_sm")
# Raise the pipeline's character limit so larger input files are accepted.
nlp.max_length = 2000000

# Context managers guarantee both file handles are closed (and the JSON
# flushed) even if an error occurs.
with open(fn) as f:
    text = f.read()

# NOTE: ``doc`` is deliberately kept as a module-level name.
doc = nlp(text)
sentences = [sentence for sentence in doc.sents if not sentence.text.isspace()]

clauses = []
for sentence in sentences:
    clauses.extend(return_clausal_segmentation(sentence, 5))

data = {"current_loc": 0, "segments": clauses}

# Swap the real extension (whatever its length) for ".json"; slicing off the
# last three characters only works for three-letter extensions.
with open(os.path.splitext(fn)[0] + ".json", "w") as f:
    json.dump(data, f)