
Commit 05b14aa

parallel grid search in ml_validation
1 parent e4b746c commit 05b14aa

File tree

1 file changed: +55 -14 lines changed

validation/FEIII2016/ml/ml_validation.py (+55 -14)
@@ -4,39 +4,80 @@
 from sklearn.svm import SVC
 from sklearn import cross_validation
 from sklearn import grid_search
+import sklearn.metrics
+
+
+#PARSE ARGUMENTS
 
 parser = optparse.OptionParser()
 parser.add_option('-g','--gold', dest = 'gold_standard_name', help = 'gold_standard_name')
+parser.add_option('-t','--train', dest = 'training_set', help='training set name')
+parser.add_option('-k','--test', dest = 'test_set', help='test set name')
 
 (options, args) = parser.parse_args()
 
 if options.gold_standard_name is None:
     options.gold_standard_name = raw_input('Enter gold standard file name:')
 
+if options.training_set is None:
+    options.training_set = options.gold_standard_name
+
+training_set = options.training_set
+
 gold_standard_name = options.gold_standard_name
 
-data = pd.read_csv(gold_standard_name)
+test_set = options.test_set
+
+#READ DATA
+
+gs_ml_whole = pd.read_csv(gold_standard_name) #WHOLE GS IN ML FORMAT
+
+train_duke = pd.read_csv(training_set) #TRAINING SET IN DUKE FORMAT
+
+test_duke = pd.read_csv(test_set) #TEST SET IN DUKE FORMAT
+
+#define the IDS to use for testing and select test set in ML format
+
+ids_1_test = test_duke.values[:,1]
+
+ids_2_test = test_duke.values[:,2]
 
-X = data.values[:,2:-1] #x variables, the last one is the y
-y = np.array(data['y']) #class variables
+test_ml = gs_ml_whole[(gs_ml_whole.FFIEC_ID.isin(ids_1_test)) & (gs_ml_whole.SEC_ID.isin(ids_2_test))] #those that are in the test data and have the shape of the 1,1,1,0,0,1
 
-#print X,y
+#define the IDS to use for train and select training set in ML format
+
+ids_1_train = train_duke.values[:,1]
+
+ids_2_train = train_duke.values[:,2]
+
+train_ml = gs_ml_whole[(gs_ml_whole.FFIEC_ID.isin(ids_1_train)) & (gs_ml_whole.SEC_ID.isin(ids_2_train))]
+
+X_train = train_ml.values[:,2:-1] #x variables
+
+y_train = np.array(train_ml['y']) #class variables
+
+X_test = test_ml.values[:,2:-1] #x variables
+
+y_test = np.array(test_ml['y']) #class variables
 
 #fit an SVM with rbf kernel
 clf = SVC( kernel = 'rbf',cache_size = 1000)
-#parameters = [{'kernel' : ['rbf'],'gamma' : np.logspace(-9,3,30),'C': np.logspace(-2,10,30)}, {'kernel' : ['linear'], 'C': np.logspace(-2,10,30)}]
+
 parameters = {'gamma' : np.logspace(-9,3,30),'C': np.logspace(-2,10,30)}
 
-gs_rbf = grid_search.GridSearchCV(clf,param_grid=parameters,cv = 4)
-gs_rbf.fit(X,y)
+gs_rbf = grid_search.GridSearchCV(clf,param_grid=parameters,cv = 4, n_jobs = -1)
+gs_rbf.fit(X_train,y_train)
+
+#save the output
+
+gs_rbf = gs_rbf.best_estimator_
+
+output = gs_rbf.predict(X_test)
 
-#print the results
+p = sklearn.metrics.precision_score(y_test,output)
 
-#compute the cross validation score
-clf = gs_rbf.best_estimator_
+r = sklearn.metrics.recall_score(y_test, output)
 
-precision_cross_scores = cross_validation.cross_val_score(clf, X, y, cv = 4, scoring = 'precision')
-recall_cross_scores = cross_validation.cross_val_score(clf, X, y, cv = 4, scoring = 'recall')
-f1_cross_scores = cross_validation.cross_val_score(clf, X, y, cv = 4, scoring = 'f1')
+f = sklearn.metrics.f1_score(y_test,output)
 
-print "%.3f,%.3f,%.3f" %(np.mean(precision_cross_scores),np.mean(recall_cross_scores),np.mean(f1_cross_scores))
+print "%.3f,%.3f,%.3f" %(p,r,f)
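Note: the sklearn.grid_search and sklearn.cross_validation modules imported here were deprecated in scikit-learn 0.18 and removed in 0.20; the parallelism this commit adds via n_jobs = -1 carries over unchanged to sklearn.model_selection.GridSearchCV. Below is a minimal sketch (not part of the commit) of the same Duke-format train/test selection and parallel RBF-SVM grid search against the current API, with placeholder CSV file names standing in for the -g/-t/-k command-line options; the FFIEC_ID, SEC_ID and y column names follow the gold-standard format used above.

    # sketch only: assumes scikit-learn >= 0.20 and placeholder file names
    import numpy as np
    import pandas as pd
    from sklearn.svm import SVC
    from sklearn.model_selection import GridSearchCV
    from sklearn.metrics import precision_score, recall_score, f1_score

    gs_ml_whole = pd.read_csv('gold_standard.csv')   # whole GS in ML format
    train_duke = pd.read_csv('training_set.csv')     # training set in Duke format
    test_duke = pd.read_csv('test_set.csv')          # test set in Duke format

    # select train/test rows of the ML-format gold standard by the ID pairs
    # listed in columns 1 and 2 of the Duke-format files
    train_ml = gs_ml_whole[gs_ml_whole.FFIEC_ID.isin(train_duke.values[:, 1])
                           & gs_ml_whole.SEC_ID.isin(train_duke.values[:, 2])]
    test_ml = gs_ml_whole[gs_ml_whole.FFIEC_ID.isin(test_duke.values[:, 1])
                          & gs_ml_whole.SEC_ID.isin(test_duke.values[:, 2])]

    X_train, y_train = train_ml.values[:, 2:-1], train_ml['y'].values
    X_test, y_test = test_ml.values[:, 2:-1], test_ml['y'].values

    # grid search over gamma and C, parallelised across all cores (n_jobs=-1)
    parameters = {'gamma': np.logspace(-9, 3, 30), 'C': np.logspace(-2, 10, 30)}
    search = GridSearchCV(SVC(kernel='rbf', cache_size=1000),
                          param_grid=parameters, cv=4, n_jobs=-1)
    search.fit(X_train, y_train)

    # evaluate the best estimator on the held-out test set
    pred = search.best_estimator_.predict(X_test)
    print("%.3f,%.3f,%.3f" % (precision_score(y_test, pred),
                              recall_score(y_test, pred),
                              f1_score(y_test, pred)))

With a 30 x 30 grid over gamma and C and 4-fold cross-validation, the search fits 3600 SVMs, which is why spreading the work across cores with n_jobs = -1 is the main gain of this change.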
