|
# Surgeo accuracy analysis using collected demographic data.
#
# Reads the pre-downloaded survey responses, asks Surgeo's surname model for
# race probabilities for every respondent's last name, and then records, for
# each respondent, the probability Surgeo gave to the race they reported.
import numpy as np
import pandas as pd
import surgeo

# Tree plenish event demographic responses... pre-downloaded
DATA_PATH = "/home/justinmiller/Documents/Data/tpEventDemographic.csv"

# Position of the full-name column in the CSV.
NAME_COLUMN_POSITION = 2

# Position (within the sorted distinct race answers) -> Surgeo probability
# column. Positions 4 and 5 are deliberately unmapped.
# NOTE(review): this mapping depends on the exact answer labels present in the
# data and on their sort order -- presumably positions 4 and 5 are answers
# with no matching Surgeo column; confirm against the survey.
RACE_KEY_POSITIONS = {
    0: "api",
    1: "black",
    2: "hispanic",
    3: "multiple",
    6: "white",
}


def _last_name(name):
    """Return the last whitespace-separated token of *name*.

    Returns "" when *name* is not a string (e.g. a missing CSV cell read as
    NaN) or contains no tokens, instead of raising.
    """
    if not isinstance(name, str):
        return ""
    tokens = name.split()  # no-arg split ignores leading/trailing/repeated whitespace
    return tokens[-1] if tokens else ""


df = pd.read_csv(DATA_PATH)

names = df[df.columns[NAME_COLUMN_POSITION]].values
last_names = [_last_name(name) for name in names]

fsg = surgeo.SurnameModel()

fsg_results = fsg.get_probabilities(pd.Series(last_names))
print(str(len(fsg_results)) + " Total Entries")

# Column-wise concat aligns on index labels.
# NOTE(review): assumes fsg_results comes back in input order with a default
# 0..n-1 index, matching the freshly read CSV -- confirm against Surgeo.
df = pd.concat([df, fsg_results], axis=1)

# Keep only rows that received a 'white' probability from the surname model.
df = df.dropna(subset=['white'])
print(str(len(df)) + " Usable Surnames")

pd.set_option('display.max_columns', None)
print(df)

# Mapping keys for comparison row by row
r = 5  # Column with race question
race_column = df.columns[r]

# Sorted distinct answers. Missing answers are dropped first because
# np.unique cannot sort a mix of strings and NaN.
keys = np.unique(df[race_column].dropna())

required_keys = max(RACE_KEY_POSITIONS) + 1
if len(keys) < required_keys:
    raise ValueError(
        "Expected at least {} distinct race answers to build the positional "
        "mapping, found {}: {}".format(required_keys, len(keys), list(keys))
    )

keyMap = {keys[position]: column for position, column in RACE_KEY_POSITIONS.items()}

# Looping through to find percent estimate of actual race
row_probabilities = []  # plain list: appending is O(1), unlike np.append

for row_label, race in df[race_column].items():
    surgeo_column = keyMap.get(race)
    if surgeo_column is None:
        # Answer has no corresponding Surgeo column: record NaN and report it.
        row_probabilities.append(np.nan)
        print("The user answered: " + str(race))
    else:
        row_probabilities.append(df.at[row_label, surgeo_column])

probList = np.array(row_probabilities, dtype=float)

# Adding problist back onto df. The Series is named up front because a pandas
# Index is immutable, so the column label cannot be assigned afterwards.
df = pd.concat(
    [df, pd.Series(probList, index=df.index, name="probability")],
    axis=1,
)
| 52 | + |
0 commit comments