-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbreak_down_database.py
91 lines (79 loc) · 4.09 KB
/
break_down_database.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import pandas as pd
from pathlib import Path
from tools import subset_of_columns
THIS_FILE = Path(__file__)
THIS_DIR = THIS_FILE.parent
INTERIM_DATA = THIS_DIR.joinpath("../../data/interim/")
PROCESSED_DATA = THIS_DIR.joinpath("../../data/processed/")
FULL_DB = "ripa-2018.csv"
data = pd.read_csv(INTERIM_DATA.joinpath(FULL_DB))
race_ethinicity = subset_of_columns('RAE_', data.columns)
gender = subset_of_columns('G_', data.columns)
disability = subset_of_columns('PD_', data.columns)
reason_for_stop = subset_of_columns('RFS_', data.columns)
action_taken = subset_of_columns('ADS_', data.columns)
basis_for_search = subset_of_columns('BFS_', data.columns)
contraband_evidence_discovered = subset_of_columns('CED_', data.columns)
basis_for_property_seizure = subset_of_columns('BPS_', data.columns)
type_of_property_seized = subset_of_columns('TPS_', data.columns)
result_of_stop = subset_of_columns('ROS_', data.columns)
# Create unique numeric id
data['UNIQUE_ID'] = data['DOJ_RECORD_ID'] + data['PERSON_NUMBER'].astype(str).str.zfill(2)
unique_id_mapping = {}
for idx,value in enumerate(data['UNIQUE_ID'].values, start=1_000_000):
unique_id_mapping[value] = idx
data['UNIQUE_ID'] = data['UNIQUE_ID'].map(unique_id_mapping)
identifiers = ['UNIQUE_ID']
## Build dfs
race_ethinicity_df = data[identifiers + race_ethinicity].copy()
gender_df = data[identifiers + gender].copy()
disability_df = data[identifiers + disability].copy()
reason_for_stop_df = data[identifiers + reason_for_stop].copy()
action_taken_df = data[identifiers + action_taken].copy()
basis_for_search_df = data[identifiers + basis_for_search].copy()
contraband_evidence_discovered_df = data[identifiers + contraband_evidence_discovered].copy()
basis_for_property_seizure_df = data[identifiers + basis_for_property_seizure].copy()
type_of_property_seized_df = data[identifiers + type_of_property_seized].copy()
result_of_stop_df = data[identifiers + result_of_stop].copy()
columns_left = [
col for col in data.columns if
col not in race_ethinicity and
col not in gender and
col not in disability and
col not in reason_for_stop and
col not in action_taken and
col not in basis_for_search and
col not in contraband_evidence_discovered and
col not in basis_for_property_seizure and
col not in type_of_property_seized and
col not in result_of_stop
]
base = data[columns_left].copy()
# Cleaning ADS
for col in action_taken_df.columns:
if action_taken_df[col].dtype == 'object':
action_taken_df[col] = action_taken_df[col].astype(str)
## Saving
base.set_index("UNIQUE_ID", inplace = True)
race_ethinicity_df.set_index("UNIQUE_ID", inplace = True)
gender_df.set_index("UNIQUE_ID", inplace = True)
disability_df.set_index("UNIQUE_ID", inplace = True)
reason_for_stop_df.set_index("UNIQUE_ID", inplace = True)
action_taken_df.set_index("UNIQUE_ID", inplace = True)
basis_for_search_df.set_index("UNIQUE_ID", inplace = True)
contraband_evidence_discovered_df.set_index("UNIQUE_ID", inplace = True)
basis_for_property_seizure_df.set_index("UNIQUE_ID", inplace = True)
type_of_property_seized_df.set_index("UNIQUE_ID", inplace = True)
result_of_stop_df .set_index("UNIQUE_ID", inplace = True)
print("Saving datasets...")
base.to_csv(PROCESSED_DATA / "aa_main_table.csv", encoding='utf-8')
race_ethinicity_df.to_csv(PROCESSED_DATA / "race_ethnicity.csv", encoding='utf-8')
gender_df.to_csv(PROCESSED_DATA / "gender.csv", encoding='utf-8')
disability_df.to_csv(PROCESSED_DATA / "disability.csv", encoding='utf-8')
reason_for_stop_df.to_csv(PROCESSED_DATA / "reason_for_stop.csv", encoding='utf-8')
action_taken_df.to_csv(PROCESSED_DATA / "action_taken.csv", encoding='utf-8')
basis_for_search_df.to_csv(PROCESSED_DATA / "basis_for_search.csv", encoding='utf-8')
contraband_evidence_discovered_df.to_csv(PROCESSED_DATA / "contraband_evidence_discovered.csv", encoding='utf-8')
basis_for_property_seizure_df.to_csv(PROCESSED_DATA / "basis_for_property_seizure.csv", encoding='utf-8')
type_of_property_seized_df.to_csv(PROCESSED_DATA / "type_of_property_seized.csv", encoding='utf-8')
result_of_stop_df.to_csv(PROCESSED_DATA / "result_of_stop.csv", encoding='utf-8')