#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Author: Luca Gioacchini
"""
This script trains several multi-modal autoencoders (MAEs) for traffic
application classification, performing a grid search on the architecture.
Namely, the grid search varies the number of neurons in the adaptation
layers and in the bottleneck.
The script loads the features and the stratified-k-folds order, manages the
training and validation datasets, builds the model, trains it, and retrieves
the embeddings by transforming the whole dataset.
It then trains a deep classifier to validate the embeddings produced by the
MAE. The trained models and the embeddings are saved to disk. The script
takes as input the number of epochs, the batch size, and the fold number to
use for training and validation.
Usage example
-------------
$ python gridsearch.py --epochs 20 --batch_size 128 --fold_number 3
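
Equivalently, from Python (assuming the ../data/task01 layout described
above is in place):

>>> from gridsearch import train_gridsearch
>>> train_gridsearch(epochs=20, batch_size=128, K=3)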
"""
import sys
sys.path.append('../') # Make mltoolbox and utils reachable
import argparse
from mltoolbox.representation import MultimodalAE
from mltoolbox.classification import DeepClassifier
from utils import get_datasets
from keras import layers
import pandas as pd
import numpy as np
import joblib

# Data paths
FEATURES = '../data/task01/features'
MAE = '../data/task01/mae'


def build_model(shapes, l1, l4):
"""
Builds a multimodal autoencoder model.
Parameters
----------
    shapes : tuple
        A tuple containing the sizes of the input modalities.
        The tuple should be of the form (input_shape, P, St, Se, I), where:
        - input_shape is the total number of input features.
        - P is the number of payload embedding features in the input tensor.
        - St is the number of statistical features in the input tensor.
        - Se is the number of sequence features in the input tensor.
        - I is the number of IP address embedding features in the input
          tensor.
    l1 : int
        The number of units in each per-modality adaptation layer.
    l4 : int
        The number of units in the bottleneck layer.

    Returns
    -------
    tuple
        (inputs, outputs, loss, weights): the input and output tensors of
        the autoencoder, together with the per-output loss functions and
        loss weights needed to compile it.
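
    Examples
    --------
    A minimal sketch with illustrative (not the real) feature sizes; the
    one hard architectural constraint is Se == 128, since the sequence
    branch reshapes its slice to (32, 4):

    >>> shapes = (652, 480, 28, 128, 16)  # (input_shape, P, St, Se, I)
    >>> inputs, outputs, loss, weights = build_model(shapes, l1=64, l4=128)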
"""
input_shape, P, St, Se, I = shapes
    # Define the autoencoder input
    inputs = layers.Input((input_shape,))
# Encoder branch of modality 1 - payload embeddings
hidden1 = layers.Lambda(lambda x: x[:, :P])(inputs)
hidden1 = layers.Reshape((-1, 1))(hidden1)
hidden1 = layers.Dense(64, activation='relu')(hidden1)
hidden1 = layers.Conv1D(32, 3, activation='relu')(hidden1)
hidden1 = layers.Dropout(.3)(hidden1)
hidden1 = layers.MaxPooling1D(2)(hidden1)
hidden1 = layers.Dropout(.3)(hidden1)
hidden1 = layers.Flatten()(hidden1)
hidden1 = layers.Dense(l1, activation='relu')(hidden1)
# Encoder branch of modality 2 - statistics
    hidden2 = layers.Lambda(lambda x: x[:, P:P+St])(inputs)
hidden2 = layers.Dense(l1, activation='relu')(hidden2)
# Encoder branch of modality 3 - sequences
hidden3 = layers.Lambda(lambda x: x[:, P+St:P+St+Se])(inputs)
hidden3 = layers.Reshape((32, 4))(hidden3)
hidden3 = layers.Conv1D(32, 3, activation='relu')(hidden3)
hidden3 = layers.Dropout(.3)(hidden3)
hidden3 = layers.MaxPooling1D(2)(hidden3)
hidden3 = layers.Dropout(.3)(hidden3)
hidden3 = layers.Flatten()(hidden3)
hidden3 = layers.Dense(l1, activation='relu')(hidden3)
# Encoder branch of modality 4 - ip address embeddings
hidden4 = layers.Lambda(lambda x: x[:, P+St+Se:P+St+Se+I])(inputs)
hidden4 = layers.Dense(l1, activation='relu')(hidden4)
# Concatenate
hidden = layers.Concatenate()([hidden1, hidden2, hidden3, hidden4])
# Common encoder
hidden = layers.Dense(512, activation='relu')(hidden)
hidden = layers.Dense(256, activation='relu')(hidden)
# Bottleneck
hidden = layers.Dense(l4, activation='relu', name='Coded')(hidden)
# Common decoder
hidden = layers.Dense(256, activation='relu')(hidden)
hidden = layers.Dense(512, activation='relu')(hidden)
    # Expand back to l1*4 units, mirroring the concatenation of the four
    # l1-sized adaptation layers on the encoder side
    hidden = layers.Dense(l1*4, activation='relu')(hidden)
# Decoder branch of modality 1 - payload embeddings
hidden1 = layers.Dense(l1, activation='relu')(hidden)
hidden1 = layers.Dense(480, activation='relu')(hidden1)
hidden1 = layers.Reshape((15, 32))(hidden1)
hidden1 = layers.Conv1D(32, 3, strides=2, activation="relu",
padding="same")(hidden1)
hidden1 = layers.UpSampling1D(2)(hidden1)
hidden1 = layers.Conv1D(4, 3, activation="relu", padding="same")(hidden1)
hidden1 = layers.UpSampling1D(2)(hidden1)
hidden1 = layers.Flatten()(hidden1)
output1 = layers.Dense(P, activation='linear', name='payload')(hidden1)
# Decoder branch of modality 2 - statistics
hidden2 = layers.Dense(l1, activation='relu')(hidden)
output2 = layers.Dense(St, activation='linear', name='statistics')(hidden2)
# Decoder branch of modality 3 - sequences
hidden3 = layers.Dense(l1, activation='relu')(hidden)
hidden3 = layers.Dense(480, activation='relu')(hidden3)
hidden3 = layers.Reshape((15, 32))(hidden3)
hidden3 = layers.Conv1D(32, 3, strides=2, activation="relu",
padding="same")(hidden3)
hidden3 = layers.UpSampling1D(2)(hidden3)
hidden3 = layers.Conv1D(4, 3, activation="relu", padding="same")(hidden3)
hidden3 = layers.UpSampling1D(2)(hidden3)
hidden3 = layers.Flatten()(hidden3)
output3 = layers.Dense(Se, activation='linear', name='sequences')(hidden3)
# Decoder branch of modality 4 - ip address embeddings
hidden4 = layers.Dense(l1, activation='relu')(hidden)
output4 = layers.Dense(I, activation='linear', name='ipaddress')(hidden4)
outputs = [output1, output2, output3, output4]
    # Mean squared error as reconstruction loss for every modality
    loss = {'payload': 'mse',
            'statistics': 'mse',
            'sequences': 'mse',
            'ipaddress': 'mse'}
    # Balance losses: weight each reconstruction term by the inverse of
    # its modality's dimensionality so that no single modality dominates
    weights = {'payload': 1/P,
               'statistics': 1/St,
               'sequences': 1/Se,
               'ipaddress': 1/I}
return inputs, outputs, loss, weights


def train_gridsearch(epochs: int, batch_size: int, K: int):
"""
Train a multi-modal autoencoder and deep classifiers for traffic
application classification. This training is the core of the gridsearch
in which we vary the size of the adaptaition size and the bottleneck
of the MAE. The deep classifiers are trained for validation
Parameters
----------
epochs : int
The number of epochs to use for training.
batch_size : int
The batch size to use for training.
K : int
The fold number to use for training and validation.
    Returns
    -------
    None
        The function saves the trained models and the embeddings to disk.
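
    Notes
    -----
    The grid spans l1, l4 in {32, 64, 128, 256} (see the nested loops
    below), i.e. 16 autoencoder/classifier pairs per fold.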
"""
#=============================================================
# Load features and stratified-k-folds order
#=============================================================
    # Load payload word2vec embeddings - entity
    payload = pd.read_csv(f'{FEATURES}/payload.csv', index_col=[0])
    # Load statistical features - quantity
    statistics = pd.read_csv(f'{FEATURES}/statistics.csv', index_col=[0])
    # Load statistics sequences - quantity
    sequences = pd.read_csv(f'{FEATURES}/sequences.csv', index_col=[0])
    # Load IP address word2vec embeddings - entity
    ipaddress = pd.read_csv(f'{FEATURES}/ipaddress.csv', index_col=[0])
# Merge the features as raw concatenation
concat = payload.reset_index().drop(columns=['label'])\
.merge(statistics.reset_index().drop(columns=['label']),
on='index', how='inner')\
.merge(sequences.reset_index().drop(columns=['label']),
on='index', how='inner')\
.merge(ipaddress.reset_index(), on='index', how='inner')\
.set_index('index')
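    # Resulting column order: [payload | statistics | sequences | ipaddress],
    # plus the single 'label' column kept from `ipaddress`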
# Load stratified k folds
    kfolds = joblib.load('../data/task01/skfolds/folds.save')
# Get the number of classes
n_classes = ipaddress.value_counts('label').shape[0]
#=============================================================
# Manage training and validation dataset
#=============================================================
    feature = concat
    # Get the per-modality feature sizes. The last column of each frame
    # is the label
    P, St, Se, I = (payload.shape[1]-1,
                    statistics.shape[1]-1,
                    sequences.shape[1]-1,
                    ipaddress.shape[1]-1)
#=============================================================
# Train the models varying sizes
#=============================================================
# Adaptation space size
for l1 in [32, 64, 128, 256]:
# Bottleneck size
for l4 in [32, 64, 128, 256]:
# Retrieve the training and validation samples from the k-folds
# order
X_train, X_val, y_train, y_val = get_datasets(kfolds, K, feature)
#=============================================================
# MAEs
#=============================================================
# Define the shapes and build the model
shapes = X_train.shape[1], P, St, Se, I
inputs, outputs, loss, weights = build_model(shapes, l1, l4)
            # Initialize the multi-modal autoencoder
mae = MultimodalAE(model_path=f'{MAE}/gridsearch_{l1}_{l4}_k{K}',
io=(inputs, outputs), losses=loss, weights=weights)
# Fit the multi-modal autoencoder
mae.fit(training_data=(X_train, X_train),
y_sizes=[P, St, Se, I],
batch_size=batch_size,
scale_data=True,
epochs=epochs,
validation_data=(X_val, X_val),
save=True,
verbose=1)
# Retrieve the best model
mae = MultimodalAE(model_path=f'{MAE}/gridsearch_{l1}_{l4}_k{K}',
_load_model=True)
#=============================================================
# Save the embeddings
#=============================================================
            # Transform the dataset, retrieving the embeddings
embeddings = np.vstack([mae.transform(X_train),
mae.transform(X_val)])
# Save the embeddings
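            # kfolds[K] is assumed to hold (train_index, val_index,
            # y_train, y_val), matching the order used by get_datasets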
embeddings = pd.DataFrame(embeddings,
index=np.hstack(kfolds[K][:2]))
embeddings[['label']] = np.hstack(kfolds[K][2:]).reshape(-1, 1)
embeddings.to_csv(
f'../data/task01/embeddings/gridsearch_{l1}_{l4}_k{K}.csv')
# Load the pre-trained multimodal embeddings
embeddings=pd.read_csv(
f'../data/task01/embeddings/gridsearch_{l1}_{l4}_k{K}.csv',
index_col=[0])
#=============================================================
# Classifiers
#=============================================================
# Retrieve the training and validation samples from the k-folds
# order
X_train, X_val, y_train, y_val = get_datasets(kfolds, K, embeddings)
# Define the classifier architecture
            inputs = layers.Input((X_train.shape[1],))
hidden = layers.Dense(512, activation='relu')(inputs)
hidden = layers.Dropout(.3)(hidden)
hidden = layers.Dense(256, activation='relu')(hidden)
hidden = layers.Dropout(.3)(hidden)
outputs = layers.Dense(n_classes, activation='softmax')(hidden)
# Initialize the classifier
mpath = f'../data/task01/classifiers/gridsearch_{l1}_{l4}_k{K}'
classifier = DeepClassifier(io=(inputs, outputs), model_path=mpath)
# Train the classifier. Standardize data before training
classifier.fit(training_data=(X_train, y_train),
validation_data=(X_val, y_val),
scale_data=True,
batch_size=batch_size,
epochs=epochs,
save=True)


if __name__ == "__main__":
# Parse command line arguments
parser = argparse.ArgumentParser()
parser.add_argument("--epochs", type=int, required=True,
help="Number of epochs to train the model")
parser.add_argument("--batch_size", type=int, required=True,
help="Batch size to use during training")
parser.add_argument("--fold_number", type=int, required=True,
help="Fold number to use for training")
args = parser.parse_args()
# Train the multimodal autoencoder
train_gridsearch(args.epochs,
args.batch_size,
args.fold_number)