4
4
from sklearn .base import BaseEstimator , TransformerMixin
5
5
from sklearn .feature_extraction import DictVectorizer
6
6
7
- # Encodes Categorical variables to be numerical values
8
- # Usage:
7
+ # Encodes Categorical variables to be numerical values
8
+ # Usage:
9
9
# encoder = EncodeCategorical()
10
10
# encoder.fit_transform(data)
11
11
# For now, do not use since this does not work well with python3
37
37
May need runtime improvements
38
38
Parameters:
39
39
df: Dataframe to encode
40
- cols: Columns to encode. If None, then encode all object columns
40
+ cols: Columns to encode. If None, then encode all object columns
41
41
Returns:
42
42
Dataframe in which each encoded column is a single column of integer codes
43
43
"""
44
44
def encode_categorical(df, cols=None):
    """Replace categorical columns with integer codes, in place.

    Parameters:
        df: Dataframe to encode. It is modified in place.
        cols: Column name or iterable of column names to encode. If None,
            every column whose dtype is ``object`` is encoded.

    Returns:
        The same dataframe object, with each selected column replaced by
        integer codes (one code per distinct value).
    """
    if cols is None:
        categorical = [col for col in df.columns if df[col].dtype == 'object']
    elif isinstance(cols, str):
        # A bare string would otherwise be iterated character by character.
        categorical = [cols]
    else:
        categorical = list(cols)

    for feature in categorical:
        # Categorical codes are deterministic (sorted categories when the
        # values are orderable), unlike the iteration order of a set, which
        # can change between interpreter runs. Missing values get code -1.
        df[feature] = df[feature].astype('category').cat.codes
    return df
62
62
63
63
"""
64
64
encode_onehot()
@@ -67,27 +67,23 @@ def encode_categorical(df, cols=None):
67
67
https://gist.github.com/ramhiser/982ce339d5f8c9a769a0
68
68
Parameters:
69
69
df: Dataframe to encode
70
- cols: Columns to encode. If None, then encode all object columns
70
+ cols: Columns to encode. If None, then encode all object columns
71
71
Returns:
72
72
One-hot encoded dataframe
73
73
"""
74
74
def encode_onehot(df, cols=None):
    """One-hot encode categorical columns of a dataframe.

    Parameters:
        df: Dataframe to encode. It is not modified.
        cols: Column name or iterable of column names to encode. If None,
            every column whose dtype is ``object`` is encoded.

    Returns:
        A new dataframe in which the selected columns have been dropped and
        the columns produced by ``DictVectorizer`` have been joined on the
        original index.
    """
    if cols is None:
        categorical = [col for col in df.columns if df[col].dtype == 'object']
    elif isinstance(cols, str):
        # A bare string would otherwise be treated as a sequence of names.
        categorical = [cols]
    else:
        categorical = list(cols)

    # Nothing to encode: skip the vectoriser, which has no features to fit.
    if not categorical:
        return df.copy()

    vec = DictVectorizer()
    # Use the resolved column list, not ``cols``, so the default
    # ``cols=None`` path works; ``orient`` replaces the removed ``outtype``.
    records = df[categorical].to_dict(orient='records')
    encoded = vec.fit_transform(records).toarray()

    # ``get_feature_names`` was removed from recent scikit-learn releases;
    # fall back to it only when the newer method is unavailable.
    if hasattr(vec, 'get_feature_names_out'):
        names = vec.get_feature_names_out()
    else:
        names = vec.get_feature_names()

    # Reuse the original index so the join lines rows up correctly.
    vec_data = pd.DataFrame(encoded, columns=names, index=df.index)
    return df.drop(categorical, axis=1).join(vec_data)
0 commit comments