-
Notifications
You must be signed in to change notification settings - Fork 3
/
parser.js
131 lines (111 loc) · 3.1 KB
/
parser.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
const allCities = require("./data/newAllCities.json");
const resourceTypes = require("./data/resources.json");
const categoriesObj = require("./data/categories.json");
/**
 * Canonicalizes text for keyword matching: lowercases it and removes every
 * space, newline, tab, period and comma, leaving one unbroken string.
 *
 * @param {string} text - Raw text to canonicalize.
 * @returns {string} Lowercased text with the separator characters removed.
 */
const normalize = (text) => {
  const lowered = text.toLowerCase();
  // Deleting each separator gives the same result as splitting on it,
  // dropping the empty pieces and joining the rest back together.
  return lowered.replace(/ |\n|\t|\.|,/g, "");
};
/**
 * Returns the keys of `values` that have at least one keyword occurring in
 * `text`.
 *
 * NOTE(review): each keyword is handed to String.prototype.search, which
 * interprets it as a regular-expression pattern rather than literal text —
 * confirm the keyword data contains no unintended regex metacharacters.
 *
 * @param {string} text - Text to scan.
 * @param {Object} values - Lookup table mapping each key to an iterable of
 *   keywords.
 * @returns {string[]} Matching keys, each listed once, in the enumeration
 *   order of `values`. Empty when nothing matches.
 */
const find = (text, values) => {
  const matchedKeys = [];
  // Object.entries visits own properties only, so keys inherited through the
  // prototype chain can never show up as matches.
  for (const [key, words] of Object.entries(values)) {
    for (const word of words) {
      if (text.search(word) !== -1) {
        matchedKeys.push(key);
        // One hit is enough for this key; skip its remaining keywords.
        break;
      }
    }
  }
  return matchedKeys;
};
/**
 * Looks up which resource types are mentioned in `text` by running `find`
 * against the module-level `resourceTypes` table.
 *
 * @param {string} text - Text to scan.
 * @returns {string[]} Keys of `resourceTypes` whose keywords occur in `text`.
 */
const findResourceType = (text) => find(text, resourceTypes);
/**
 * Finds every city from the module-level `allCities` table that is mentioned
 * in `text`.
 *
 * `allCities` is read as a map of state name to a list of city records, each
 * carrying a `name` and a list of `keywords`. A city counts as mentioned when
 * any one of its keywords occurs in `text`.
 *
 * NOTE(review): keywords are passed to String.prototype.search and are
 * therefore treated as regular-expression patterns — confirm the data holds
 * no unintended metacharacters.
 *
 * @param {string} text - Text to scan.
 * @returns {{state: string, city: string}[]} One entry per distinct
 *   state/city pair, in table order. Empty when nothing matches.
 */
const findLocation = (text) => {
  // Objects in a Set are compared by identity, so uniqueness is tracked with
  // a string key instead of relying on the Set to dedupe the records.
  const seen = new Set();
  const locations = [];
  for (const [state, cities] of Object.entries(allCities)) {
    for (const city of cities) {
      let mentioned = false;
      for (const keyword of city.keywords) {
        if (text.search(keyword) !== -1) {
          mentioned = true;
          // A single matching keyword is enough to report this city.
          break;
        }
      }
      if (!mentioned) {
        continue;
      }
      const key = JSON.stringify([state, city.name]);
      if (!seen.has(key)) {
        seen.add(key);
        locations.push({ state, city: city.name });
      }
    }
  }
  return locations;
};
// Matches phone-number-like runs: an optional "+", a digit, 8 to 12 further
// digits/spaces/hyphens, then a closing digit. The leading negative lookahead
// rejects any match that would begin at a day-month-year date written with
// ".", "/" or "-" separators.
const phoneRegex =
  /(?!([0]?[1-9]|[1|2][0-9]|[3][0|1])[./-]([0]?[1-9]|[1][0-2])[./-]([0-9]{4}|[0-9]{2}))(\+?\d[\d -]{8,12}\d)/g;
// Matches e-mail addresses anywhere inside a larger text. The pattern is
// deliberately not anchored with ^ and $: it is used with String.match to
// extract addresses from free text, and an anchored pattern would only match
// when the entire text is a single address.
const emailRegex =
  /(([^<>()[\]\.,;:\s@\"]+(\.[^<>()[\]\.,;:\s@\"]+)*)|(\".+\"))@(([^<>()[\]\.,;:\s@\"]+\.)+[^<>()[\]\.,;:\s@\"]{2,})/g;
/**
 * Extracts the distinct phone numbers found in `text`.
 *
 * Candidates are collected in two passes with `phoneRegex`: once over the
 * text as given, and once over a copy in which every whitespace run is
 * replaced by "@" so that a match cannot span whitespace. Each candidate then
 * has its whitespace and hyphens removed and is trimmed as follows:
 *   - exactly 10 characters: kept as is;
 *   - longer than 10 and starting with "0": the first 11 characters are kept;
 *   - anything else: the last 10 characters are kept (shorter candidates are
 *     kept whole).
 *
 * NOTE(review): the first pass can match across the gap between two adjacent
 * numbers, which produces an extra entry besides the two real ones — confirm
 * whether that is intended.
 *
 * @param {string} text - Text to scan.
 * @returns {string[]} Unique numbers in first-seen order; empty when none.
 */
const parsePhoneNumbers = (text) => {
  const looseMatches = text.match(phoneRegex) || [];
  const boundedMatches = text.replace(/\s+/g, "@").match(phoneRegex) || [];

  const unique = new Set();
  for (const candidate of [...looseMatches, ...boundedMatches]) {
    const compact = candidate.replace(/\s+|-/g, "");
    let phone;
    if (compact.length === 10) {
      phone = compact;
    } else if (compact.length > 10 && compact[0] === "0") {
      phone = compact.substring(0, 11);
    } else {
      phone = compact.substring(compact.length - 10);
    }
    unique.add(phone);
  }
  return Array.from(unique).filter((phone) => phone);
};
/**
 * Extracts structured data from the raw text of a tweet.
 *
 * Resource types and locations are detected on the normalized text; phone
 * numbers and e-mail addresses are detected on the raw text, where the
 * separators that normalization strips are still present.
 *
 * @param {string} raw_text - Tweet text exactly as received.
 * @returns {{
 *   categories: Array,
 *   resource_types: string[],
 *   phone_numbers: string[],
 *   emails: string[],
 *   locations: {state: string, city: string}[]
 * }} Extracted fields; every field is an array and is empty when nothing was
 *   found.
 */
const parseTweet = (raw_text) => {
  const text = normalize(raw_text);
  const resource_types = findResourceType(text);
  const categories = resource_types
    .map((type) => categoriesObj[type])
    // A resource type with no entry in categoriesObj must not leak an
    // `undefined` element into the result.
    .filter((entry) => entry != null)
    .flat();
  return {
    categories,
    resource_types,
    phone_numbers: parsePhoneNumbers(raw_text),
    emails: raw_text.match(emailRegex) || [],
    locations: findLocation(text),
  };
};
/**
 * Splits a text that lists several contacts into one record per phone number.
 *
 * The text is cut at every verbatim occurrence of a phone number returned by
 * `parsePhoneNumbers`. The segment that precedes each number is analyzed for
 * resource types, e-mail addresses and locations and attached to that number.
 * Text after the last number is ignored. A segment that mentions no resource
 * type inherits the resource types of the previous contact.
 *
 * NOTE(review): `categories` keeps one element per resource type (it is not
 * flattened the way `parseTweet` flattens it) — confirm which shape callers
 * expect.
 *
 * @param {string} raw_text - Text containing one or more contacts.
 * @returns {{
 *   categories: Array,
 *   resource_types: string[],
 *   phone: string,
 *   emails: string[],
 *   locations: Array
 * }[]} One record per phone occurrence, in text order; empty when the text
 *   contains no phone number.
 */
const parseContacts = (raw_text) => {
  const phones = parsePhoneNumbers(raw_text);
  if (!phones || phones.length === 0) {
    return [];
  }

  // String.prototype.split does not accept an array of separators, so build a
  // single alternation. Longer numbers go first so that a number which is a
  // prefix of another cannot win the match; metacharacters such as "+" are
  // escaped so every number is matched literally.
  const alternatives = [...phones]
    .sort((a, b) => b.length - a.length)
    .map((phone) => phone.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"));
  // The capture group makes split keep the matched numbers, yielding
  // [segment, phone, segment, phone, ..., tail].
  const pieces = raw_text.split(new RegExp(`(${alternatives.join("|")})`));

  const contacts = [];
  for (let i = 0; i + 1 < pieces.length; i += 2) {
    const segment = pieces[i];
    const phone = pieces[i + 1];
    const text = normalize(segment);

    let resource_types = findResourceType(text);
    if (resource_types.length === 0 && contacts.length > 0) {
      // Nothing mentioned in this segment: carry over the previous contact's
      // resource types (copied so the two records do not share one array).
      resource_types = [...contacts[contacts.length - 1].resource_types];
    }

    contacts.push({
      categories: resource_types.map((type) => categoriesObj[type]),
      resource_types,
      phone,
      emails: segment.match(emailRegex) || [],
      locations: findLocation(text),
    });
  }
  return contacts;
};
// Public API of the parser module: the loaded lookup tables (categoriesObj,
// resourceTypes, allCities), the text helpers (normalize, find) and the
// extraction functions built on top of them.
module.exports = {
categoriesObj,
resourceTypes,
normalize,
find,
parseTweet,
parseContacts,
allCities,
parsePhoneNumbers,
findLocation,
findResourceType,
};