diff --git a/magicloops.py b/magicloops.py
index 9e0d897..1dcc4c5 100644
--- a/magicloops.py
+++ b/magicloops.py
@@ -1,9 +1,12 @@
 from __future__ import division
 import pandas as pd
 import numpy as np
-from sklearn import preprocessing, cross_validation, svm, metrics, tree, decomposition, svm
-from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
-from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier, OrthogonalMatchingPursuit, RandomizedLogisticRegression
+from sklearn import preprocessing, cross_validation, svm, metrics, tree, \
+    decomposition
+from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, \
+    GradientBoostingClassifier, AdaBoostClassifier
+from sklearn.linear_model import LogisticRegression, Perceptron, \
+    SGDClassifier, OrthogonalMatchingPursuit, RandomizedLogisticRegression
 from sklearn.neighbors.nearest_centroid import NearestCentroid
 from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
 from sklearn.tree import DecisionTreeClassifier
@@ -21,65 +24,101 @@
 %matplotlib inline
 
+
 def define_clfs_params():
     clfs = {'RF': RandomForestClassifier(n_estimators=50, n_jobs=-1),
-        'ET': ExtraTreesClassifier(n_estimators=10, n_jobs=-1, criterion='entropy'),
-        'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200),
-        'LR': LogisticRegression(penalty='l1', C=1e5),
-        'SVM': svm.SVC(kernel='linear', probability=True, random_state=0),
-        'GB': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10),
-        'NB': GaussianNB(),
-        'DT': DecisionTreeClassifier(),
-        'SGD': SGDClassifier(loss="hinge", penalty="l2"),
-        'KNN': KNeighborsClassifier(n_neighbors=3)
+            'ET': ExtraTreesClassifier(n_estimators=10, n_jobs=-1,
+                                       criterion='entropy'),
+            'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
+                                     algorithm="SAMME", n_estimators=200),
+            'LR': LogisticRegression(penalty='l1', C=1e5),
+            'SVM': svm.SVC(kernel='linear', probability=True, random_state=0),
+            'GB': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5,
+                                             max_depth=6, n_estimators=10),
+            'NB': GaussianNB(),
+            'DT': DecisionTreeClassifier(),
+            'SGD': SGDClassifier(loss="hinge", penalty="l2"),
+            'KNN': KNeighborsClassifier(n_neighbors=3)
             }
-    grid = {
-    'RF':{'n_estimators': [1,10,100,1000,10000], 'max_depth': [1,5,10,20,50,100], 'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10]},
-    'LR': { 'penalty': ['l1','l2'], 'C': [0.00001,0.0001,0.001,0.01,0.1,1,10]},
-    'SGD': { 'loss': ['hinge','log','perceptron'], 'penalty': ['l2','l1','elasticnet']},
-    'ET': { 'n_estimators': [1,10,100,1000,10000], 'criterion' : ['gini', 'entropy'] ,'max_depth': [1,5,10,20,50,100], 'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10]},
-    'AB': { 'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1,10,100,1000,10000]},
-    'GB': {'n_estimators': [1,10,100,1000,10000], 'learning_rate' : [0.001,0.01,0.05,0.1,0.5],'subsample' : [0.1,0.5,1.0], 'max_depth': [1,3,5,10,20,50,100]},
-    'NB' : {},
-    'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1,5,10,20,50,100], 'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10]},
-    'SVM' :{'C' :[0.00001,0.0001,0.001,0.01,0.1,1,10],'kernel':['linear']},
-    'KNN' :{'n_neighbors': [1,5,10,25,50,100],'weights': ['uniform','distance'],'algorithm': ['auto','ball_tree','kd_tree']}
-           }
+    grid = {
+        'RF': {'n_estimators': [1, 10, 100, 1000, 10000],
+               'max_depth': [1, 5, 10, 20, 50, 100],
+               'max_features': ['sqrt', 'log2'],
+               'min_samples_split': [2, 5, 10]
+               },
+        'LR': {'penalty': ['l1', 'l2'],
+               'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10]
+               },
+        'SGD': {'loss': ['hinge', 'log', 'perceptron'],
+                'penalty': ['l2', 'l1', 'elasticnet']
+                },
+        'ET': {'n_estimators': [1, 10, 100, 1000, 10000],
+               'criterion': ['gini', 'entropy'],
+               'max_depth': [1, 5, 10, 20, 50, 100],
+               'max_features': ['sqrt', 'log2'],
+               'min_samples_split': [2, 5, 10]
+               },
+        'AB': {'algorithm': ['SAMME', 'SAMME.R'],
+               'n_estimators': [1, 10, 100, 1000, 10000]
+               },
+        'GB': {'n_estimators': [1, 10, 100, 1000, 10000],
+               'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.5],
+               'subsample': [0.1, 0.5, 1.0],
+               'max_depth': [1, 3, 5, 10, 20, 50, 100]
+               },
+        'NB': {},
+        'DT': {'criterion': ['gini', 'entropy'],
+               'max_depth': [1, 5, 10, 20, 50, 100],
+               'max_features': ['sqrt', 'log2'],
+               'min_samples_split': [2, 5, 10]
+               },
+        'SVM': {'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10],
+                'kernel': ['linear']
+                },
+        'KNN': {'n_neighbors': [1, 5, 10, 25, 50, 100],
+                'weights': ['uniform', 'distance'],
+                'algorithm': ['auto', 'ball_tree', 'kd_tree']
+                }
+        }
     return clfs, grid
 
+
 def clf_loop(models_to_run, clfs, grid, X, y):
     for n in range(1, 2):
-        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
-        for index,clf in enumerate([clfs[x] for x in models_to_run]):
+        X_train, X_test, y_train, y_test = train_test_split(
+            X, y, test_size=0.2, random_state=0)
+        for index, clf in enumerate([clfs[x] for x in models_to_run]):
             print models_to_run[index]
             parameter_values = grid[models_to_run[index]]
             for p in ParameterGrid(parameter_values):
                 try:
                     clf.set_params(**p)
                     print clf
-                    y_pred_probs = clf.fit(X_train, y_train).predict_proba(X_test)[:,1]
-                    #threshold = np.sort(y_pred_probs)[::-1][int(.05*len(y_pred_probs))]
-                    #print threshold
-                    print precision_at_k(y_test,y_pred_probs,.05)
-                    #plot_precision_recall_n(y_test,y_pred_probs,clf)
+                    y_pred_probs = clf.fit(
+                        X_train, y_train).predict_proba(X_test)[:, 1]
+                    # threshold = np.sort(y_pred_probs)[::-1][int(.05 *
+                    # len(y_pred_probs))]
+                    # print threshold
+                    print precision_at_k(y_test, y_pred_probs, .05)
+                    # plot_precision_recall_n(y_test,y_pred_probs,clf)
                 except IndexError, e:
-                    print 'Error:',e
+                    print 'Error:', e
                     continue
-
 
 
 def plot_precision_recall_n(y_true, y_prob, model_name):
     from sklearn.metrics import precision_recall_curve
     y_score = y_prob
-    precision_curve, recall_curve, pr_thresholds = precision_recall_curve(y_true, y_score)
+    precision_curve, recall_curve, pr_thresholds = precision_recall_curve(
+        y_true, y_score)
     precision_curve = precision_curve[:-1]
     recall_curve = recall_curve[:-1]
     pct_above_per_thresh = []
     number_scored = len(y_score)
     for value in pr_thresholds:
-        num_above_thresh = len(y_score[y_score>=value])
+        num_above_thresh = len(y_score[y_score >= value])
         pct_above_thresh = num_above_thresh / float(number_scored)
         pct_above_per_thresh.append(pct_above_thresh)
     pct_above_per_thresh = np.array(pct_above_per_thresh)
@@ -91,32 +130,29 @@ def plot_precision_recall_n(y_true, y_prob, model_name):
     ax2 = ax1.twinx()
     ax2.plot(pct_above_per_thresh, recall_curve, 'r')
     ax2.set_ylabel('recall', color='r')
 
-    
+
     name = model_name
     plt.title(name)
-    #plt.savefig(name)
+    # plt.savefig(name)
     plt.show()
 
+
 def precision_at_k(y_true, y_scores, k):
-    threshold = np.sort(y_scores)[::-1][int(k*len(y_scores))]
+    threshold = np.sort(y_scores)[::-1][int(k * len(y_scores))]
     y_pred = np.asarray([1 if i >= threshold else 0 for i in y_scores])
     return metrics.precision_score(y_true, y_pred)
 
-def main():
+def main():
     clfs, grid = define_clfs_params()
-    models_to_run=['KNN','RF','LR','ET','AB','GB','NB','DT']
-    #get X and y
-    features = ['RevolvingUtilizationOfUnsecuredLines', 'DebtRatio', 'age', 'NumberOfTimes90DaysLate']
+    models_to_run = ['KNN', 'RF', 'LR', 'ET', 'AB', 'GB', 'NB', 'DT']
+    # get X and y
+    features = ['RevolvingUtilizationOfUnsecuredLines',
+                'DebtRatio', 'age', 'NumberOfTimes90DaysLate']
     X = df[features]
     y = df.SeriousDlqin2yrs
-    clf_loop(models_to_run, clfs,grid, X,y)
-
+    clf_loop(models_to_run, clfs, grid, X, y)
 
 
 if __name__ == '__main__':
     main()
-
-
-
-
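Note: main() selects columns from a module-level df that is never defined in the hunks shown, so the script raises a NameError at runtime unless df is created elsewhere in the file. A minimal sketch of the setup the code appears to assume (the feature and target columns match the Kaggle "Give Me Some Credit" dataset; the filename and the dropna step are hypothetical):

    # Hypothetical setup, not part of the diff above: build the module-level
    # df that main() expects. The CSV path is a guess.
    import pandas as pd

    df = pd.read_csv('credit-data.csv')  # hypothetical filename
    # scikit-learn estimators reject NaN inputs, so keep only rows that are
    # complete for the columns main() actually uses.
    df = df.dropna(subset=['RevolvingUtilizationOfUnsecuredLines', 'DebtRatio',
                           'age', 'NumberOfTimes90DaysLate',
                           'SeriousDlqin2yrs'])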