-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathdata_tool.py
127 lines (104 loc) · 3.53 KB
/
data_tool.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import json
import os
from glob import glob
from collections import defaultdict
class DataTool:
    """
    Load, merge, validate, summarize and write NLQ dataset entries stored as JSON.

    The working data set lives in ``self.data`` and is expected to be a list
    of entry dicts (see ``validate_entry`` for the entry template).
    """

    def __init__(self):
        """
        Start with an empty data set.
        """
        self.data = []

    def read(self, file_path: str) -> None:
        """
        Read JSON file into this object.

        The parsed content replaces whatever ``self.data`` held before. No
        shape check is done here; call ``validate`` afterwards.
        """
        with open(file_path, 'r', encoding='utf-8') as file:
            self.data = json.load(file)
        print(f"Data read from {file_path}.")

    def merge(self, data_dir: str) -> None:
        """
        Merge all JSON files in data_dir into this object.

        Only files whose top-level JSON value is a list are merged; any other
        file is reported and skipped.
        """
        # glob() returns paths in arbitrary order; sort so the merged result
        # is reproducible across runs and platforms.
        for file_path in sorted(glob(os.path.join(data_dir, '*.json'))):
            with open(file_path, 'r', encoding='utf-8') as file:
                print(f"Reading {file_path}...")
                file_data = json.load(file)
            if not isinstance(file_data, list):
                print(f"Unsupported data type in file: {file_path}")
                # Skip the file: extending with a dict would add its keys
                # (plain strings) to the data set.
                continue
            self.data.extend(file_data)
        print("Data merged.")

    def write(self, output_file: str) -> None:
        """
        Write the current data to output_file as indented UTF-8 JSON.

        Non-ASCII characters are written as-is rather than escaped.
        """
        with open(output_file, 'w', encoding='utf-8') as outfile:
            json.dump(self.data, outfile, ensure_ascii=False, indent=2)
        print(f"Data written to {output_file}.")

    def stats(self) -> None:
        """
        Some basic statistics

        Prints the number of NLQs per domain, the number of distinct domains
        and the total number of NLQs. Every entry must have "domain" and
        "NLQs" keys (a missing key raises KeyError), so run ``validate``
        first on untrusted data.
        """
        nlq_count = defaultdict(int)
        for entry in self.data:
            domain = entry["domain"]
            # Several entries may share a domain, so accumulate.
            nlq_count[domain] += len(entry["NLQs"])
        print("NLQ count per domain:")
        for domain, count in nlq_count.items():
            print(f"{domain}: {count}")
        print(f"Total domains: {len(nlq_count)}")
        print(f"Total NLQs: {sum(nlq_count.values())}")

    def validate_entry(self, entry) -> bool:
        """
        Validate entry data.

        Returns True when the entry matches the template below, otherwise
        prints the first problem found and returns False. Only the presence
        of the keys and the container types are checked, not the value types.

        Here is the template for each entry in the data:
        ```
        {
            "domain": string,
            "mapping": {
                "properties": json_object,
                "_meta": json_object
            },
            "NLQs": [
                {
                    "NLQ": string,
                    "query": json_object
                },
                ...
                {
                    "NLQ": string,
                    "query": json_object
                }
            ]
        }
        ```
        """
        print("Validating mapping")
        if not isinstance(entry, dict):
            print(f"Expected a dict type but got {type(entry)}: {entry}")
            return False
        if "domain" not in entry:
            print(f"Missing domain: {entry}")
            return False
        if "mapping" not in entry:
            print(f"Missing mapping: {entry}")
            return False
        if "NLQs" not in entry:
            print(f"Missing NLQs: {entry}")
            return False
        if not isinstance(entry["NLQs"], list):
            print(f"NLQs is not a list: {entry}")
            return False
        for nlq in entry["NLQs"]:
            # Without this check the `in` tests below would raise TypeError
            # on a number and do a substring search on a string.
            if not isinstance(nlq, dict):
                print(f"NLQ item is not a dict: {nlq}")
                return False
            if "NLQ" not in nlq:
                print(f"Missing NLQ: {nlq}")
                return False
            if "query" not in nlq:
                print(f"Missing query: {nlq}")
                return False
        print(f"Valid mapping: {entry['domain']=}")
        return True

    def validate(self) -> bool:
        """
        Validate the whole data set.

        Returns True when ``self.data`` is a list and every entry passes
        ``validate_entry`` (an empty list is valid), otherwise False. Every
        entry is checked, so all problems are reported in one run.
        """
        if not isinstance(self.data, list):
            print("Data is not a list.")
            return False
        valid = True
        for entry in self.data:
            # Do not short-circuit: keep checking after the first failure so
            # later invalid entries are reported too.
            if not self.validate_entry(entry):
                valid = False
        if valid:
            print("Data is valid.")
        else:
            print("Data is invalid.")
        return valid