132 changes: 84 additions & 48 deletions magicloops.py
@@ -1,9 +1,12 @@
from __future__ import division
import pandas as pd
import numpy as np
from sklearn import preprocessing, cross_validation, svm, metrics, tree, \
    decomposition
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, \
GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, Perceptron, \
SGDClassifier, OrthogonalMatchingPursuit, RandomizedLogisticRegression
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
@@ -21,65 +24,101 @@

# %matplotlib inline  # IPython magic: valid in a notebook, a SyntaxError in a plain script


def define_clfs_params():

clfs = {'RF': RandomForestClassifier(n_estimators=50, n_jobs=-1),
'ET': ExtraTreesClassifier(n_estimators=10, n_jobs=-1,
criterion='entropy'),
'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
algorithm="SAMME", n_estimators=200),
'LR': LogisticRegression(penalty='l1', C=1e5),
'SVM': svm.SVC(kernel='linear', probability=True, random_state=0),
'GB': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5,
max_depth=6, n_estimators=10),
'NB': GaussianNB(),
'DT': DecisionTreeClassifier(),
'SGD': SGDClassifier(loss="hinge", penalty="l2"),
'KNN': KNeighborsClassifier(n_neighbors=3)
}

grid = {
'RF': {'n_estimators': [1, 10, 100, 1000, 10000],
'max_depth': [1, 5, 10, 20, 50, 100],
'max_features': ['sqrt', 'log2'],
'min_samples_split': [2, 5, 10]
},
'LR': {'penalty': ['l1', 'l2'],
'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10]
},
'SGD': {'loss': ['hinge', 'log', 'perceptron'],
'penalty': ['l2', 'l1', 'elasticnet']
},
'ET': {'n_estimators': [1, 10, 100, 1000, 10000],
'criterion': ['gini', 'entropy'],
'max_depth': [1, 5, 10, 20, 50, 100],
'max_features': ['sqrt', 'log2'],
'min_samples_split': [2, 5, 10]
},
'AB': {'algorithm': ['SAMME', 'SAMME.R'],
'n_estimators': [1, 10, 100, 1000, 10000]
},
'GB': {'n_estimators': [1, 10, 100, 1000, 10000],
'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.5],
'subsample': [0.1, 0.5, 1.0],
'max_depth': [1, 3, 5, 10, 20, 50, 100]
},
'NB': {},
'DT': {'criterion': ['gini', 'entropy'],
'max_depth': [1, 5, 10, 20, 50, 100],
'max_features': ['sqrt', 'log2'],
'min_samples_split': [2, 5, 10]
},
'SVM': {'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10],
'kernel': ['linear']
},
'KNN': {'n_neighbors': [1, 5, 10, 25, 50, 100],
'weights': ['uniform', 'distance'],
'algorithm': ['auto', 'ball_tree', 'kd_tree']
}
}
return clfs, grid
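

def _demo_parameter_grid():
    # Minimal demo sketch (not part of the module's pipeline) of how the
    # grid dicts above are consumed. Assumes ParameterGrid is imported in
    # the elided header, since clf_loop below already uses it unqualified.
    # ParameterGrid expands a dict of value lists into the cross-product,
    # yielding one kwargs dict per parameter combination.
    clfs, grid = define_clfs_params()
    for p in ParameterGrid(grid['SVM']):
        print p  # e.g. {'kernel': 'linear', 'C': 1e-05}, then C=0.0001, ...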


def clf_loop(models_to_run, clfs, grid, X, y):
    for n in range(1, 2):  # note: as written, this loop runs exactly once
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=0)
for index, clf in enumerate([clfs[x] for x in models_to_run]):
print models_to_run[index]
parameter_values = grid[models_to_run[index]]
for p in ParameterGrid(parameter_values):
try:
clf.set_params(**p)
print clf
y_pred_probs = clf.fit(
X_train, y_train).predict_proba(X_test)[:, 1]
# threshold = np.sort(y_pred_probs)[::-1][int(.05 *
# len(y_pred_probs))]
# print threshold
print precision_at_k(y_test, y_pred_probs, .05)
# plot_precision_recall_n(y_test,y_pred_probs,clf)
                except IndexError as e:
print 'Error:', e
continue
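

def _demo_clf_loop():
    # Minimal demo sketch: exercises clf_loop on synthetic data so the loop
    # can be run without the credit-scoring data used in main().
    # make_classification and all of its arguments here are demo choices.
    from sklearn.datasets import make_classification
    X, y = make_classification(n_samples=1000, n_features=4, random_state=0)
    clfs, grid = define_clfs_params()
    # 'NB' and 'DT' have small grids, which keeps the demo fast; any key
    # present in both clfs and grid would work.
    clf_loop(['NB', 'DT'], clfs, grid, X, y)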



def plot_precision_recall_n(y_true, y_prob, model_name):
from sklearn.metrics import precision_recall_curve
y_score = y_prob
precision_curve, recall_curve, pr_thresholds = precision_recall_curve(
y_true, y_score)
precision_curve = precision_curve[:-1]
recall_curve = recall_curve[:-1]
pct_above_per_thresh = []
number_scored = len(y_score)
for value in pr_thresholds:
num_above_thresh = len(y_score[y_score >= value])
pct_above_thresh = num_above_thresh / float(number_scored)
pct_above_per_thresh.append(pct_above_thresh)
pct_above_per_thresh = np.array(pct_above_per_thresh)
@@ -91,32 +130,29 @@ def plot_precision_recall_n(y_true, y_prob, model_name):
ax2 = ax1.twinx()
ax2.plot(pct_above_per_thresh, recall_curve, 'r')
ax2.set_ylabel('recall', color='r')

name = model_name
plt.title(name)
# plt.savefig(name)
plt.show()
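

def _demo_plot_precision_recall_n():
    # Minimal demo sketch: feeds random scores to plot_precision_recall_n,
    # which plots precision (left axis) and recall (right axis, red)
    # against the fraction of the population at or above each threshold.
    # With random scores, precision hovers near the ~0.5 base rate while
    # recall falls as fewer cases are flagged.
    np.random.seed(0)
    y_true = np.random.randint(0, 2, size=500)
    y_prob = np.random.rand(500)
    plot_precision_recall_n(y_true, y_prob, 'random baseline')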


def precision_at_k(y_true, y_scores, k):
threshold = np.sort(y_scores)[::-1][int(k * len(y_scores))]
y_pred = np.asarray([1 if i >= threshold else 0 for i in y_scores])
return metrics.precision_score(y_true, y_pred)
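

def _demo_precision_at_k():
    # Minimal worked example for precision_at_k. With five scores sorted
    # descending, k=0.4 puts the threshold at index int(0.4 * 5) = 2, i.e.
    # the third-highest score (0.7), so the top three are flagged positive;
    # two of those three are true positives, giving precision 2/3.
    y_true = np.array([1, 1, 0, 0, 0])
    y_scores = np.array([0.9, 0.8, 0.7, 0.2, 0.1])
    print precision_at_k(y_true, y_scores, 0.4)  # -> 0.666...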


def main():
clfs, grid = define_clfs_params()
models_to_run = ['KNN', 'RF', 'LR', 'ET', 'AB', 'GB', 'NB', 'DT']
# get X and y
features = ['RevolvingUtilizationOfUnsecuredLines',
'DebtRatio', 'age', 'NumberOfTimes90DaysLate']
    # df is assumed to be defined in the elided module setup above
    # (presumably the credit-scoring dataset loaded with pandas).
    X = df[features]
y = df.SeriousDlqin2yrs
clf_loop(models_to_run, clfs, grid, X, y)


if __name__ == '__main__':
main()