-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathModel.py
158 lines (113 loc) · 5.33 KB
/
Model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 16 21:05:09 2019
@author: Peter Samoaa
"""
#### Importing Libraries ####
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
import time
#### Data Pre-Processing ####
# Load the engineered app-usage data produced by the earlier EDA/feature step.
dataset = pd.read_csv('new_appdata10.csv')

# Separate the target ("enrolled") from the feature matrix.
response = dataset["enrolled"]
dataset = dataset.drop(columns="enrolled")

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    dataset, response, test_size = 0.2, random_state = 0)

# Set the user ids aside so predictions can be mapped back to users at the end;
# the id itself must not be fed to the model as a feature.
train_identity = X_train['user']
X_train = X_train.drop(columns = ['user'])
test_identity = X_test['user']
X_test = X_test.drop(columns = ['user'])
# Feature Scaling
# StandardScaler returns a bare ndarray, which loses both the index (needed
# later to join predictions back to user ids) and the column names (needed to
# read the coefficients). Rebuild labeled DataFrames in one step instead of
# assigning columns/index afterwards.
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = pd.DataFrame(sc_X.fit_transform(X_train),
                       columns = X_train.columns.values,
                       index = X_train.index.values)
X_test = pd.DataFrame(sc_X.transform(X_test),
                      columns = X_test.columns.values,
                      index = X_test.index.values)
# All features are now standardized (scaler fitted on the training set only).
#### Model Building ####
# Fitting Model to the Training Set
from sklearn.linear_model import LogisticRegression
# L1 (Lasso) regularization: the funnels handle correlation between screens,
# but the funnels themselves may still be correlated with each other, so L1 is
# used to shrink redundant/over-correlated coefficients toward zero.
# NOTE: the default 'lbfgs' solver does not support the L1 penalty (scikit-learn
# >= 0.22 raises ValueError), so an L1-capable solver must be set explicitly.
classifier = LogisticRegression(random_state = 0, penalty = 'l1', solver = 'liblinear')
classifier.fit(X_train, y_train)
# Predicting Test Set
y_pred = classifier.predict(X_test)

# Evaluating Results
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
cm = confusion_matrix(y_test, y_pred)
# Compute the standard classification metrics (values shown interactively):
# accuracy  -- overall fraction of correct predictions
# precision -- tp / (tp + fp), sanity check against over-predicting positives
# recall    -- tp / (tp + fn), share of actual enrollers we caught
# f1        -- harmonic mean of precision and recall
for metric in (accuracy_score, precision_score, recall_score, f1_score):
    metric(y_test, y_pred)

# Render the confusion matrix as an annotated heatmap.
df_cm = pd.DataFrame(cm, index = (0, 1), columns = (0, 1))
plt.figure(figsize = (10,7))
sn.set(font_scale=1.4)
sn.heatmap(df_cm, annot=True, fmt='g')
print("Test Data Accuracy: %0.4f" % accuracy_score(y_test, y_pred))
# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
# 10-fold CV on the training set gives a more robust accuracy estimate than the
# single hold-out split above; +/- 2 std approximates a 95% interval.
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
# The estimator is logistic regression, not an SVM -- label the output correctly.
print("Logistic Regression Accuracy: %0.3f (+/- %0.3f)" % (accuracies.mean(), accuracies.std() * 2))
# Analyzing Coefficients
# Pair each feature name with its fitted coefficient (dataset still holds the
# 'user' column here, so it is dropped to match the model's feature order).
feature_names = pd.DataFrame(dataset.drop(columns = 'user').columns, columns = ["features"])
fitted_coefs = pd.DataFrame(np.transpose(classifier.coef_), columns = ["coef"])
pd.concat([feature_names, fitted_coefs], axis = 1)
#### Model Tuning ####
## Grid Search (Round 1)
from sklearn.model_selection import GridSearchCV

# Hyperparameter space: regularization type, and inverse regularization
# strength C on a coarse logarithmic scale.
parameters = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
              'penalty': ['l1', 'l2']}

grid_search = GridSearchCV(estimator = classifier,
                           param_grid = parameters,
                           scoring = "accuracy",
                           cv = 10,
                           n_jobs = -1)

# Time the search so the two tuning rounds can be compared.
started = time.time()
grid_search = grid_search.fit(X_train, y_train)
finished = time.time()
print("Took %0.2f seconds" % (finished - started))

rf_best_accuracy = grid_search.best_score_
rf_best_parameters = grid_search.best_params_
rf_best_accuracy, rf_best_parameters  # echoed for interactive inspection
## Grid Search (Round 2)
# Refine the search around the neighborhood that round 1 identified, using a
# finer grid of C values near 1.
parameters = {'C': [0.1, 0.5, 0.9, 1, 2, 5],
              'penalty': ['l1', 'l2']}

grid_search = GridSearchCV(estimator = classifier,
                           param_grid = parameters,
                           scoring = "accuracy",
                           cv = 10,
                           n_jobs = -1)

started = time.time()
grid_search = grid_search.fit(X_train, y_train)
finished = time.time()
print("Took %0.2f seconds" % (finished - started))

rf_best_accuracy = grid_search.best_score_
rf_best_parameters = grid_search.best_params_
rf_best_accuracy, rf_best_parameters  # echoed for interactive inspection
grid_search.best_score_
#### End of Model ####
# Formatting Final Results
# Re-attach the user ids (set aside before training) to the hold-out labels so
# each prediction can be traced back to a user. y_test and test_identity come
# from the same split and share X_test's index, so the concat aligns 1:1.
final_results = pd.concat([y_test, test_identity], axis = 1)
# y_pred is positionally aligned with X_test (and therefore with y_test).
# Do NOT drop rows before this positional assignment: any dropped row would
# silently shift every following prediction onto the wrong user.
final_results['predicted_reach'] = y_pred
final_results = final_results[['user', 'enrolled', 'predicted_reach']].reset_index(drop=True)  # plain 0..n index