-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbin_industries.py
103 lines (73 loc) · 4.31 KB
/
bin_industries.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import pandas as pd
import pprint
import json
def writeToCSV(industryDict: dict, industriesFile: str, countMode: bool):
    '''
    purpose:
        writes one CSV row per industry to industriesFile
    parameters:
        industryDict: maps an industry name to a list of per-gender values, indexed as
            [male, female, nonbinary, unspecified, diversity]
        industriesFile: path of the CSV file to write (written without the DataFrame index)
        countMode: if True, also write the 'diversity' column plus two derived columns:
            disparityFactor = male / (male + diversity)
            equalityFactor  = diversity / (male + diversity)
            if False, only the first four values are written
    design choices:
        rows are collected in a list and the DataFrame is built once, instead of
        concatenating a one-row DataFrame per industry (which re-copies the frame every time)
        columns are listed explicitly so an empty industryDict still yields a header row
        a zero (male + diversity) denominator produces NaN (an empty CSV cell) instead of
        raising ZeroDivisionError
    '''
    columns = ['industry', 'male', 'female', 'nonbinary', 'unspecified']
    if countMode:
        columns += ['diversity', 'disparityFactor', 'equalityFactor']

    rows = []
    for industry, values in industryDict.items():
        row = {'industry': industry, 'male': values[0], 'female': values[1], 'nonbinary': values[2], 'unspecified': values[3]}
        if countMode:
            male = values[0]
            diversity = values[4]
            denominator = male + diversity
            row['diversity'] = diversity
            # guard the ratio: an industry with no male and no diversity total has no defined factor
            if denominator != 0:
                row['disparityFactor'] = male / denominator
                row['equalityFactor'] = diversity / denominator
            else:
                row['disparityFactor'] = float('nan')
                row['equalityFactor'] = float('nan')
        rows.append(row)

    finalDF = pd.DataFrame(rows, columns=columns)
    finalDF.to_csv(industriesFile, index=False)
def getCategoryGroups(orgDF: pd.DataFrame, industriesJsonPath: str = 'datasets/cleanedIndustriesDict.json') -> list:
    '''
    purpose:
        takes every row of orgDF and distills its 'categoryGroups' column down to a distinct
        set of industries (47 in the original dataset, per the original author's note),
        summing the gender columns of every row that lists each industry
    parameters:
        orgDF: must contain the columns 'categoryGroups', 'genderMalePercent',
            'genderFemalePercent', 'genderNonBinaryPercent', 'genderNotProvidedPercent'
            and 'genderDiversityPercent'
            NOTE(review): the gender columns are presumably per-company founder fractions
            where the first four add up to 1 -- confirm against the dataset
        industriesJsonPath: where the unscaled dictionary is dumped as JSON
            (defaults to the path that used to be hard-coded)
    returns:
        a two-element list [categoryDict, categoryDictScaled]
        categoryDict: industry -> [male, female, nonbinary, notProvided, diversity] summed totals
        categoryDictScaled: same keys, every value divided by the sum of the first four totals
    how:
        'categoryGroups' is split on ', ' and the row's gender values are added to every
        industry it lists, so a row with several industries contributes to each of them
        rows whose 'categoryGroups' is missing or equal to 'N/A' are counted as blanks and skipped
    design choices:
        a dictionary maps each unique industry (key) to its running totals (value)
        the columns are iterated directly rather than with iterrows(), which avoids building
        a Series per row
        an empty orgDF reports 0.0% missing instead of dividing by zero
        an industry whose first four totals sum to 0 is scaled to NaN instead of dividing by zero
    '''
    genderColumns = ['genderMalePercent', 'genderFemalePercent', 'genderNonBinaryPercent', 'genderNotProvidedPercent', 'genderDiversityPercent']

    categoryDict = dict()
    blanks = 0
    totalObservations = orgDF.shape[0]

    # plain tuples of the five gender values, in genderColumns order, one per row
    genderRows = orgDF[genderColumns].itertuples(index=False, name=None)
    for categoryGroups, genderValues in zip(orgDF['categoryGroups'], genderRows):
        if pd.isna(categoryGroups) or categoryGroups == 'N/A':
            blanks += 1
            continue

        for category in categoryGroups.split(', '):
            runningTotals = categoryDict.get(category)
            if runningTotals is None:
                # first time we see this industry: start its totals from this row
                categoryDict[category] = list(genderValues)
            else:
                # industry already known: add this row on top of the existing totals
                categoryDict[category] = [total + value for total, value in zip(runningTotals, genderValues)]

    # every key was added exactly once, when a new distinct industry was encountered
    uniqueIndustries = len(categoryDict)

    # statistics
    print(f'{blanks=}')
    print(f'{uniqueIndustries=}')
    fractionNull = round(blanks/totalObservations * 100, 2) if totalObservations else 0.0
    print(f'Fraction of industry data missing: {fractionNull}% ({blanks} out of {totalObservations})') # as of 4/21/2022: 1.74% (7471 out of 429935)

    # output cleaned industries to a file
    with open(industriesJsonPath, 'w') as fp: # fp means file pointer
        json.dump(categoryDict, fp, sort_keys=True, indent=4)

    categoryDictScaled = dict()
    for category, totals in categoryDict.items():
        # only the first 4 totals are summed; the 5th (diversity) is scaled by the same divisor
        founderSum = totals[0] + totals[1] + totals[2] + totals[3]
        if founderSum == 0:
            # nothing to normalize against; mark as undefined rather than crash
            categoryDictScaled[category] = [float('nan')] * len(totals)
        else:
            categoryDictScaled[category] = [total/founderSum for total in totals]

    return [categoryDict, categoryDictScaled]
if __name__ == '__main__':
    # output locations: summed totals per industry, and the normalized fractions per industry
    industriesFounderCount = 'binned_output/industries_count.csv'
    industriesFounderFraction = 'binned_output/industries_fraction.csv'

    # input: the pickled organizations table
    # NOTE: read_pickle can execute arbitrary code, so only load pickle files you trust
    orgFile = 'datasets/organizations.csv.pickle'
    orgDF = pd.read_pickle(orgFile)

    # bin the organizations by industry, then write both views out
    categoryDict, categoryDictScaled = getCategoryGroups(orgDF)
    writeToCSV(categoryDict, industriesFounderCount, countMode=True)
    writeToCSV(categoryDictScaled, industriesFounderFraction, countMode=False)