132 changes: 84 additions & 48 deletions magicloops.py
@@ -1,9 +1,12 @@
from __future__ import division
import pandas as pd
import numpy as np
from sklearn import preprocessing, cross_validation, svm, metrics, tree, \
    decomposition
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, \
GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, Perceptron, \
SGDClassifier, OrthogonalMatchingPursuit, RandomizedLogisticRegression
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
@@ -21,65 +24,101 @@

# %matplotlib inline  # IPython magic: valid in a notebook, a SyntaxError in a plain script


def define_clfs_params():

clfs = {'RF': RandomForestClassifier(n_estimators=50, n_jobs=-1),
'ET': ExtraTreesClassifier(n_estimators=10, n_jobs=-1,
criterion='entropy'),
'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
algorithm="SAMME", n_estimators=200),
'LR': LogisticRegression(penalty='l1', C=1e5),
'SVM': svm.SVC(kernel='linear', probability=True, random_state=0),
'GB': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5,
max_depth=6, n_estimators=10),
'NB': GaussianNB(),
'DT': DecisionTreeClassifier(),
'SGD': SGDClassifier(loss="hinge", penalty="l2"),
'KNN': KNeighborsClassifier(n_neighbors=3)
}

grid = {
'RF': {'n_estimators': [1, 10, 100, 1000, 10000],
'max_depth': [1, 5, 10, 20, 50, 100],
'max_features': ['sqrt', 'log2'],
'min_samples_split': [2, 5, 10]
},
'LR': {'penalty': ['l1', 'l2'],
'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10]
},
'SGD': {'loss': ['hinge', 'log', 'perceptron'],
'penalty': ['l2', 'l1', 'elasticnet']
},
'ET': {'n_estimators': [1, 10, 100, 1000, 10000],
'criterion': ['gini', 'entropy'],
'max_depth': [1, 5, 10, 20, 50, 100],
'max_features': ['sqrt', 'log2'],
'min_samples_split': [2, 5, 10]
},
'AB': {'algorithm': ['SAMME', 'SAMME.R'],
'n_estimators': [1, 10, 100, 1000, 10000]
},
'GB': {'n_estimators': [1, 10, 100, 1000, 10000],
'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.5],
'subsample': [0.1, 0.5, 1.0],
'max_depth': [1, 3, 5, 10, 20, 50, 100]
},
'NB': {},
'DT': {'criterion': ['gini', 'entropy'],
'max_depth': [1, 5, 10, 20, 50, 100],
'max_features': ['sqrt', 'log2'],
'min_samples_split': [2, 5, 10]
},
'SVM': {'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10],
'kernel': ['linear']
},
'KNN': {'n_neighbors': [1, 5, 10, 25, 50, 100],
'weights': ['uniform', 'distance'],
'algorithm': ['auto', 'ball_tree', 'kd_tree']
}
}
return clfs, grid
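

def _demo_parameter_grid():
    # Minimal demo sketch (not part of the module's pipeline) of how the
    # grid dicts above are consumed. Assumes ParameterGrid is imported in
    # the elided header, since clf_loop below already uses it unqualified.
    # ParameterGrid expands a dict of value lists into the cross-product,
    # yielding one kwargs dict per parameter combination.
    clfs, grid = define_clfs_params()
    for p in ParameterGrid(grid['SVM']):
        print p  # e.g. {'kernel': 'linear', 'C': 1e-05}, then C=0.0001, ...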


def clf_loop(models_to_run, clfs, grid, X, y):
    for n in range(1, 2):  # note: as written, this loop runs exactly once
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=0)
for index, clf in enumerate([clfs[x] for x in models_to_run]):
print models_to_run[index]
parameter_values = grid[models_to_run[index]]
for p in ParameterGrid(parameter_values):
try:
clf.set_params(**p)
print clf
y_pred_probs = clf.fit(
X_train, y_train).predict_proba(X_test)[:, 1]
# threshold = np.sort(y_pred_probs)[::-1][int(.05 *
# len(y_pred_probs))]
# print threshold
print precision_at_k(y_test, y_pred_probs, .05)
# plot_precision_recall_n(y_test,y_pred_probs,clf)
                except IndexError as e:
print 'Error:', e
continue
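

def _demo_clf_loop():
    # Minimal demo sketch: exercises clf_loop on synthetic data so the loop
    # can be run without the credit-scoring data used in main().
    # make_classification and all of its arguments here are demo choices.
    from sklearn.datasets import make_classification
    X, y = make_classification(n_samples=1000, n_features=4, random_state=0)
    clfs, grid = define_clfs_params()
    # 'NB' and 'DT' have small grids, which keeps the demo fast; any key
    # present in both clfs and grid would work.
    clf_loop(['NB', 'DT'], clfs, grid, X, y)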



def plot_precision_recall_n(y_true, y_prob, model_name):
from sklearn.metrics import precision_recall_curve
y_score = y_prob
precision_curve, recall_curve, pr_thresholds = precision_recall_curve(
y_true, y_score)
precision_curve = precision_curve[:-1]
recall_curve = recall_curve[:-1]
pct_above_per_thresh = []
number_scored = len(y_score)
for value in pr_thresholds:
num_above_thresh = len(y_score[y_score >= value])
pct_above_thresh = num_above_thresh / float(number_scored)
pct_above_per_thresh.append(pct_above_thresh)
pct_above_per_thresh = np.array(pct_above_per_thresh)
@@ -91,32 +130,29 @@ def plot_precision_recall_n(y_true, y_prob, model_name):
ax2 = ax1.twinx()
ax2.plot(pct_above_per_thresh, recall_curve, 'r')
ax2.set_ylabel('recall', color='r')

name = model_name
plt.title(name)
# plt.savefig(name)
plt.show()
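

def _demo_plot_precision_recall_n():
    # Minimal demo sketch: feeds random scores to plot_precision_recall_n,
    # which plots precision (left axis) and recall (right axis, red)
    # against the fraction of the population at or above each threshold.
    # With random scores, precision hovers near the ~0.5 base rate while
    # recall falls as fewer cases are flagged.
    np.random.seed(0)
    y_true = np.random.randint(0, 2, size=500)
    y_prob = np.random.rand(500)
    plot_precision_recall_n(y_true, y_prob, 'random baseline')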


def precision_at_k(y_true, y_scores, k):
threshold = np.sort(y_scores)[::-1][int(k * len(y_scores))]
y_pred = np.asarray([1 if i >= threshold else 0 for i in y_scores])
return metrics.precision_score(y_true, y_pred)
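

def _demo_precision_at_k():
    # Minimal worked example for precision_at_k. With five scores sorted
    # descending, k=0.4 puts the threshold at index int(0.4 * 5) = 2, i.e.
    # the third-highest score (0.7), so the top three are flagged positive;
    # two of those three are true positives, giving precision 2/3.
    y_true = np.array([1, 1, 0, 0, 0])
    y_scores = np.array([0.9, 0.8, 0.7, 0.2, 0.1])
    print precision_at_k(y_true, y_scores, 0.4)  # -> 0.666...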


def main():
clfs, grid = define_clfs_params()
models_to_run = ['KNN', 'RF', 'LR', 'ET', 'AB', 'GB', 'NB', 'DT']
# get X and y
features = ['RevolvingUtilizationOfUnsecuredLines',
'DebtRatio', 'age', 'NumberOfTimes90DaysLate']
    # df is assumed to be defined in the elided module setup above
    # (presumably the credit-scoring dataset loaded with pandas).
    X = df[features]
y = df.SeriousDlqin2yrs
clf_loop(models_to_run, clfs, grid, X, y)


if __name__ == '__main__':
main()