12
12
impute_strategy: What to replace missing values with
13
13
Options:
14
14
Imputer Class
15
- 'most frequent '
16
- 'median'
17
- 'mean'
15
+ 'mode '
16
+ 'median' - numerical
17
+ 'mean' - numerical
18
18
Custom Functions
19
19
'drop column'
20
20
'dummy'
@@ -30,59 +30,63 @@ def impute_missing(data, columns=None, impute_strategy='mode', missing_values='N
30
30
cols_to_impute = _find_cols_with_missing_vals (data , missing_values )
31
31
else :
32
32
cols_to_impute = columns
33
- if cols_to_impute == None :
33
+ if not cols_to_impute :
34
34
return datacopy
35
35
if impute_strategy == 'mode' :
36
+ print (cols_to_impute )
36
37
for col in cols_to_impute :
37
38
modeVal = data [col ].mode ()
38
- if missing_values == 'NaN' :
39
- datacopy [col ] = data [col ].fillna (modeVal [0 ])
40
- else :
41
- datacopy [col ] = data [col ].replace (missing_values , modeVal [0 ], regex = True )
39
+ print (modeVal [0 ])
40
+ datacopy [col ] = _fill_col (data [col ], missing_values , modeVal [0 ])
42
41
return datacopy
43
42
elif impute_strategy == 'mean' :
44
43
for col in cols_to_impute :
45
- meanVal = data [col ].mean ()
46
- if missing_values == 'NaN' :
47
- datacopy [col ] = data [col ]. fillna ( meanVal )
44
+ if data [col ].dtype != 'object' :
45
+ meanVal = data [ col ]. mean ()
46
+ datacopy [col ] = _fill_col ( data [col ], missing_values , meanVal )
48
47
else :
49
- datacopy [col ] = data [col ]. replace ( missing_values , meanVal , regex = True )
50
- return datacopy
48
+ datacopy [col ] = _fill_col ( data [col ], missing_values , dummy_val )
49
+ return datacopy
51
50
elif impute_strategy == 'median' :
52
51
for col in cols_to_impute :
53
- medianVal = data [col ].median ()
54
- if missing_values == 'NaN' :
55
- datacopy [col ] = data [col ]. fillna ( medianVal )
52
+ if data [col ].dtype != 'object' :
53
+ medianVal = data [ col ]. median ()
54
+ datacopy [col ] = _fill_col ( data [col ], missing_values , medianVal )
56
55
else :
57
- datacopy [col ] = data [col ]. replace ( missing_values , medianVal , regex = True )
58
- return datacopy
56
+ datacopy [col ] = _fill_col ( data [col ], missing_values , dummy_val )
57
+ return datacopy
59
58
elif impute_strategy == 'drop column' :
60
59
return _remove_columns (data , cols_to_impute )
61
60
elif impute_strategy == 'maximum' :
62
61
for col in cols_to_impute :
63
- maxVal = max ( data [col ])
64
- if missing_values == 'NaN' :
65
- datacopy [col ] = data [col ]. fillna ( maxVal )
62
+ if data [col ]. dtype != 'object' :
63
+ maxVal = max ( data [ col ])
64
+ datacopy [col ] = _fill_col ( data [col ], missing_values , maxVal )
66
65
else :
67
- datacopy [col ] = data [col ]. replace ( missing_values , maxVal , regex = True )
68
- return data
66
+ datacopy [col ] = _fill_col ( data [col ], missing_values , dummy_val )
67
+ return datacopy
69
68
elif impute_strategy == 'minimum' :
70
69
for col in cols_to_impute :
71
- minVal = min ( data [col ])
72
- if missing_values == 'NaN' :
73
- datacopy [col ] = data [col ]. fillna ( minVal )
70
+ if data [col ]. dtype != 'object' :
71
+ minVal = min ( data [ col ])
72
+ datacopy [col ] = _fill_col ( data [col ], missing_values , minVal )
74
73
else :
75
- datacopy [col ] = data [col ]. replace ( missing_values , minVal , regex = True )
76
- return data
74
+ datacopy [col ] = _fill_col ( data [col ], missing_values , dummy_val )
75
+ return datacopy
77
76
elif impute_strategy == 'dummy' :
78
- return data .replace (missing_values , dummy_val , regex = True )
77
+ for col in cols_to_impute :
78
+ if data [col ].dtype != 'object' :
79
+ datacopy [col ] = _fill_col (data [col ], missing_values , 0 )
80
+ else :
81
+ datacopy [col ] = _fill_col (data [col ], missing_values , dummy_val )
82
+ return datacopy
79
83
# Do some more research on this before implementing
80
84
elif impute_strategy == 'rand_forest_reg' :
81
85
print ("RANDOM FOREST REGRESSOR NOT IMPLEMENTED NO IMPUTATION HAPPENED" )
82
- return None
86
+ return datacopy
83
87
else :
84
- print ("REPLACE COMMAND NOT RECOGNIZED" )
85
- return None
88
+ print ("REPLACE COMMAND NOT RECOGNIZED NO IMPUTATION HAPPENED " )
89
+ return datacopy
86
90
87
91
"""
88
92
remove_columns()
@@ -109,3 +113,11 @@ def _find_cols_with_missing_vals(data=None, missing_values= 'NaN'):
109
113
if data [col ].str .contains (missing_values ).any ():
110
114
cols_to_impute .append (col )
111
115
return cols_to_impute
116
+
117
+ def _fill_col (column , missing_values , replace_val ):
118
+ ret = column
119
+ if missing_values == 'NaN' :
120
+ ret = column .fillna (replace_val )
121
+ else :
122
+ ret = column .replace (missing_values , replace_val , regex = True )
123
+ return ret
0 commit comments