-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathpurge.py
18 lines (15 loc) · 838 Bytes
/
purge.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
import pandas as pd
# Convert data/walmart.json to a CSV that holds only the cleaned "label" column.
INPUT_PATH = "data/walmart.json"
OUTPUT_PATH = "data/walmart_partial.csv"

# Regex passes applied to every label, in this exact order; each match is
# replaced with the empty string. Order matters: later passes see the output
# of earlier ones, and the overlapping end-anchored passes can each strip one
# more stacked suffix, so none of them is dropped.
# NOTE(review): the last alternative of the second pattern (", , . $") has an
# unescaped "." and therefore matches any character there -- kept as-is;
# confirm whether a literal "\." was intended.
_LABEL_CLEANUP_PATTERNS = (
    # Promo text ("Select for details", "Rollback", "Save"/"save") through the
    # end of the label, and dollar amounts such as "$3" or "$3.99".
    r'(Select for details|Rollback|Save|save).*|\$\d+(\.\d+)?',
    # Leftover ", , ." / ", ." fragments and a trailing ", ".
    r', , \.|, \.|, $|, , . $',
    # Trailing ", , lb. ." suffix.
    r', , lb\. \.$',
    # Trailing ", , lb. ." or ", N for or X each ." suffix.
    r', , lb\. \.$|, \d+ for or \d+(\.\d+)? each \.$',
    # Trailing ", N for or X each ." suffix (one more pass).
    r', \d+ for or \d+(\.\d+)? each \.$',
)


def clean_labels(labels: pd.Series) -> pd.Series:
    """Return a cleaned copy of a Series of label strings.

    Surrounding whitespace is stripped first, then any trailing commas, and
    finally every pattern in ``_LABEL_CLEANUP_PATTERNS`` is removed in order.
    The input Series is not modified.

    Args:
        labels: Series of label strings (accessed through ``.str``).

    Returns:
        A new Series with the same index holding the cleaned labels.
    """
    cleaned = labels.str.strip().str.rstrip(',')
    for pattern in _LABEL_CLEANUP_PATTERNS:
        cleaned = cleaned.str.replace(pattern, '', regex=True)
    return cleaned


def main(input_path: str = INPUT_PATH, output_path: str = OUTPUT_PATH) -> None:
    """Read the JSON file, clean its ``label`` column and write it as CSV.

    Args:
        input_path: JSON file to load with ``pd.read_json``.
        output_path: Destination CSV path; written without the index column.
    """
    df = pd.read_json(input_path)
    # Keep only the label column. Take an explicit copy so the assignment
    # below writes to an independent frame, not to a selection of the
    # loaded one.
    df = df[['label']].copy()
    df['label'] = clean_labels(df['label'])
    df.to_csv(output_path, index=False)


if __name__ == "__main__":
    main()