-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_controller.py
113 lines (104 loc) · 4.87 KB
/
data_controller.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import spacy
import json
import time
from spacy.pipeline import EntityRuler
from custom_pattern_override import fix_imsi_imei
# Sample paragraph used as the default ``raw_text`` of ``document_to_dict``.
# It mixes person and organisation names, e-mail placeholders, phone numbers,
# a URL, IPv4/IPv6 addresses and IMSI/IMEI numbers so the pipeline has
# something to tag. The typos are part of the sample and are kept as-is.
TEST = """Facebook CEO Mark Zuckerburg, owner of the popular website myspace.com, emailed from [email protected] to give me his phone number +15717890987, but also noted his secrtary's number is 398-498-4847.
This was quite unexpected since just last Wednesday Mark noted Sheila was not interested in meeting us for lunch. I will email Mark back from [email protected].
He also sent me a the links for our dashboard https://dfsfsfsff.ru/dashboard/overview:4040. He said there was something fishy going on.
It had the normal IP he was used to 192.156.3.345, but the ipv6 was completely off (2001:0db8:85a3:0000:0000:8a2e:0370:7334). This is something we need to look into.
Oh, also. Steve mentioned in may be in Germany this week. He plans to travel back to the US in a few days.
The phone number is zzzzzzzzzzzz, the imsi is 334384724034918, and the imei is 572393739374467.
"""
def document_to_dict(raw_text=TEST, pattern_help=True):
    """Run the spaCy pipeline on ``raw_text`` and index the result by offsets.

    Args:
        raw_text: Text to analyse. Defaults to the module-level ``TEST``
            sample.
        pattern_help: When true, an ``EntityRuler`` filled from
            ``configs/custom_patterns.json`` (created with
            ``overwrite_ents=True``) is inserted before the ``ner``
            component.

    Returns:
        A dict with three keys:

        * ``"text"``: one ``{"id", "word", "start", "end"}`` dict per token;
          ``id`` is the token's start offset as an int and ``word`` is the
          slice of the document text between ``start`` and ``end``.
        * ``"spans"``: entity label -> list of ``"start:end"`` strings.
        * ``"deps"``: dependency label -> list of ``"start:end"`` strings.

    Side effects:
        Loads the ``en_core_web_md`` model on every call and writes the
        document's JSON form to ``spacy_json_example.json`` in the current
        working directory.
    """
    nlp = spacy.load("en_core_web_md")
    if pattern_help:
        ruler = EntityRuler(nlp, overwrite_ents=True)
        # matcher = Matcher(nlp.vocab)
        # it may need to get more complicated than this
        with open("configs/custom_patterns.json", "r", encoding="utf-8") as f:
            patterns = json.load(f)
        # "^\\+?\\(?0{0,2}((?:9[679]|8[035789]|6[789]|5[90]|42|3[578]|2[1-689])|9[0-58]|8[1246]|6[0-6]|5[1-8]|4[013-9]|3[0-469]|2[70]|7|1)(?:\\W*\\d){0,13}\\d$"
        # Register every custom label in the vocabulary's string store
        # before the patterns are handed to the ruler.
        for pattern in patterns:
            nlp.vocab.strings.add(pattern["label"])
        ruler.add_patterns(patterns)
        # NOTE(review): passing the component object is the spaCy v2 calling
        # convention; confirm the pinned spaCy version before upgrading.
        nlp.add_pipe(ruler, before='ner')

    doc = nlp(raw_text)
    json_doc = doc.to_json()

    # Dump of the raw spaCy JSON, overwritten on every call.
    with open("spacy_json_example.json", "w", encoding="utf-8") as f:
        json.dump(json_doc, f, indent=4)

    text = json_doc["text"]
    dataset = {"text": [], "spans": {}, "deps": {}}

    for token in json_doc["tokens"]:
        start, end = token["start"], token["end"]
        dataset["text"].append({
            "id": int(start),
            "word": text[int(start):int(end)],
            "start": start,
            "end": end,
        })
        # Tokens without a dependency label (missing key or empty string)
        # are simply left out of the "deps" index.
        dep = token.get("dep", "")
        if dep != "":
            dataset["deps"].setdefault(dep, []).append(
                "{}:{}".format(start, end))

    for ent in json_doc["ents"]:
        label = ent["label"]
        start, end = ent["start"], ent["end"]
        if label in ("IMEI", "IMSI"):
            # The project helper may return a corrected start offset for
            # these labels; a falsy result (including 0) means "no
            # correction".
            # NOTE(review): the entity's own start is used as a fallback in
            # that case — confirm this matches the intended behaviour, as
            # the alternative reading is to drop the entity entirely.
            new_start = fix_imsi_imei(end, json_doc)
            if new_start:
                start = new_start
        dataset["spans"].setdefault(label, []).append(
            "{}:{}".format(start, end))

    return dataset
# for sent in json_doc["sents"]:
# sent_dict = {"uuid": uuid}
# start = sent["start"]
# end = sent["end"]
#
# sent_list = []
# for token in json_doc["tokens"]:
# if token["end"] > end:
# break
# elif token["start"] >= start:
# sent_list.append(
# {'id': i, 'word': json_doc["text"][token["start"]:token["end"]]})
# i += 1
#
# spans = {}
# for ents in json_doc["ents"]:
# if ents["end"] > end:
# break
# elif ents["start"] >= start:
# if ents["label"] in spans.keys():
# spans[ents["label"]].append(
# str(ents["start"] - start) + ":" + str(ents["end"] - start))
# else:
# spans.update(
# {ents["label"]: [str(ents["start"] - start) + ":" + str(ents["end"] - start)]})
# deps = {}
# for token in json_doc["tokens"]:
# if token["end"] > end:
# break
# elif token["start"] >= start:
# if token["dep"] in deps.keys():
# deps[token["dep"]].append(
# str(token["start"] - start) + ":" + str(token["end"] - start))
# else:
# deps.update(
# {token["dep"]: [str(token["start"] - start) + ":" + str(token["end"] - start)]})
# sent_dict.update(
# {"sentence": sent_list, "spans": spans, "deps": deps})
# DATASET.append(sent_dict)
# uuid += 1
# print(DATASET)
def convert_to_training(tags):
    """Placeholder: ignores ``tags`` and only prints an empty line."""
    # NOTE(review): appears to be an unfinished stub — confirm before relying on it.
    print()