holdout_comparison.py
"""
Holdout test set comparison
Compare the final classifier to other network architectures.
"""
import argparse
import logging
import pickle

import numpy as np
import pandas as pd
import sklearn.ensemble
import sklearn.linear_model
import sklearn.metrics
import sklearn.model_selection
import sklearn.neighbors
import sklearn.svm
from tensorflow import keras
# LABEL_MAP is the mapping from class names to class indices.
LABEL_MAP = {'Thermo': 0, 'Psychro': 1}
# RANDOM_SEED is the initial RNG state used to divide the dataset into training-test
# sets and cross-validation folds.
RANDOM_SEED = 1
# HOLDOUT_SET_SIZE is the fraction of points set aside for the final holdout test set.
# In our experiment, we set aside 10% of the points; this script evaluates models on them.
HOLDOUT_SET_SIZE = 0.1
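# Note: reusing the same RANDOM_SEED and HOLDOUT_SET_SIZE as the training run
# (an assumption about the companion training script) reproduces the exact same
# partition, so the holdout rows evaluated below were never seen during model
# selection or cross-validation.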


def model_report(name, model, X_holdout_t, y_holdout):
    # Round predicted probabilities to hard 0/1 labels; ravel() flattens the
    # (n, 1) Keras output and is a no-op for 1-D sklearn predictions.
    holdout_predictions = np.round(model.predict(X_holdout_t)).ravel().astype(int)
    report = sklearn.metrics.classification_report(
        y_holdout, holdout_predictions, output_dict=True)
    report['classifier'] = name
    return pd.json_normalize(report, sep='-')


def main(args):
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    dataset = pd.read_csv(args.datapath, names=(
        "Classification", "Label", "Sequence"))
    X, y = dataset['Sequence'], dataset['Label'].map(LABEL_MAP).to_numpy()
    # Recreate the training/holdout split using the seed and ratio defined above.
    X_cv, X_holdout, y_cv, y_holdout = sklearn.model_selection.train_test_split(
        X, y, shuffle=True, test_size=HOLDOUT_SET_SIZE, random_state=RANDOM_SEED)
    # Apply the fitted preprocessing pipeline to both partitions.
    with open(args.pipeline, 'rb') as pipeline_file:
        pipeline = pickle.load(pipeline_file)
    X_cv_t = pipeline.transform(X_cv).toarray()
    X_holdout_t = pipeline.transform(X_holdout).toarray()
    # Our final classifier was trained elsewhere; here it is only evaluated.
    our_model = keras.models.load_model(args.model)
    our_report = model_report('ours', our_model, X_holdout_t, y_holdout)
    # Fit each baseline on the non-holdout portion, then evaluate all of them
    # on the same holdout set.
    rf_model = sklearn.ensemble.RandomForestClassifier(max_depth=3)
    rf_model.fit(X_cv_t, y_cv)
    rf_report = model_report('rf', rf_model, X_holdout_t, y_holdout)
    lr_model = sklearn.linear_model.LogisticRegression()
    lr_model.fit(X_cv_t, y_cv)
    lr_report = model_report('lr', lr_model, X_holdout_t, y_holdout)
    knn_model = sklearn.neighbors.KNeighborsClassifier()
    knn_model.fit(X_cv_t, y_cv)
    knn_report = model_report('knn', knn_model, X_holdout_t, y_holdout)
    svm_model = sklearn.svm.SVC()
    svm_model.fit(X_cv_t, y_cv)
    svm_report = model_report('svm', svm_model, X_holdout_t, y_holdout)
    df = pd.concat([
        our_report, rf_report, lr_report, svm_report, knn_report
    ])
    logging.info("Saving results to %s", args.results)
    df.to_csv(args.results)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--datapath", default="dataset/database-pdb.csv",
                        help="The path to the .CSV file with the experiment data.")
    parser.add_argument("--results", default="final-results.csv",
                        help="Where to save the results of the classifier comparison.")
    parser.add_argument("--pipeline", default="psychornot-pipeline",
                        help="Where to find the preprocessing pipeline for the model.")
    parser.add_argument("--model", default="psychornot-ensemble",
                        help="Where to find the ensemble of the best-performing models.")
    parser.add_argument("--verbose", action="store_true",
                        help="Log results to the console.")
    args = parser.parse_args()
    main(args)