Skip to content

Commit bbb94a5

Browse files
Intro and conclusions
1 parent 1ab5b10 commit bbb94a5

12 files changed

+832
-252
lines changed

README.md

+3-28
Original file line numberDiff line numberDiff line change
@@ -195,37 +195,12 @@ If you use this code or data in your research, please cite our paper:
195195

196196
## **Todo**
197197
- Notebooks:
198-
- [x] Task01
199-
- [x] Codes
200-
- [x] Comments
201-
- [x] Markdowns
202-
- [x] Task02
203-
- [x] Codes
204-
- [x] Comments
205-
- [x] Intro-Markdowns
206-
- [x] Task03
207-
- [x] Codes
208-
- [x] Comments
209-
- [x] Intro-Markdowns
210-
- [x] GridSearch
211-
- [x] Codes
212-
- [x] Comments
213-
- [x] Intro-Markdowns
214198
- [ ] Finalization and visualization
215-
- [ ] Starting datasets and characterization
216-
- [x] Datasets
217-
- [ ] Dataset balancing
218-
- [ ] Markdowns
219-
- [ ] Comments and functions
220-
- [ ] Stratified k fold
221199
- [ ] t-SNE - task01
222-
- Scripts
223-
- [x] Training task01
224-
- [x] Training task02
225-
- [x] Training task03
226-
- [x] Training GridSearch
227200
- Documentation
228201
- [x] README
229-
- [x] Raw data folder on cluster
202+
- [ ] Notebooks
203+
- [ ] Starting datasets and characterization
204+
- [ ] Finalization and visualization
230205
- [ ] References to the datasets and papers
231206
- [ ] More detailed instructions on how to download and set up the experiments (once agreed)

notebooks/00-characterization.ipynb

+55-129
Original file line numberDiff line numberDiff line change
@@ -10,20 +10,30 @@
1010
},
1111
{
1212
"cell_type": "code",
13-
"execution_count": 392,
13+
"execution_count": 1,
1414
"id": "09238324",
1515
"metadata": {},
16-
"outputs": [],
16+
"outputs": [
17+
{
18+
"name": "stderr",
19+
"output_type": "stream",
20+
"text": [
21+
"2023-01-09 10:08:37.637796: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA\n",
22+
"To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
23+
]
24+
}
25+
],
1726
"source": [
1827
"# Make mltoolbox and utils reachable from this folder\n",
1928
"import sys\n",
2029
"sys.path.append('../')\n",
2130
"\n",
2231
"from mltoolbox.representation import iWord2Vec\n",
32+
"from utils import get_balance\n",
2333
"import pandas as pd\n",
2434
"import joblib\n",
2535
"\n",
26-
"DEMO = True"
36+
"DEMO = False"
2737
]
2838
},
2939
{
@@ -38,25 +48,25 @@
3848
},
3949
{
4050
"cell_type": "code",
41-
"execution_count": 47,
51+
"execution_count": 3,
4252
"id": "c550eae3",
4353
"metadata": {},
4454
"outputs": [
4555
{
46-
"data": {
47-
"text/plain": [
48-
"(44045, 234)"
49-
]
50-
},
51-
"execution_count": 47,
52-
"metadata": {},
53-
"output_type": "execute_result"
56+
"name": "stdout",
57+
"output_type": "stream",
58+
"text": [
59+
"Dataset shape: (44045, 234)\n",
60+
"Dataset balance: 0.94\n"
61+
]
5462
}
5563
],
5664
"source": [
5765
"dataset = pd.read_csv('../data/task01/raw_data/mirage.csv', index_col=[0])\n",
66+
"balance = get_balance(dataset)\n",
5867
"\n",
59-
"dataset.shape"
68+
"print(f'Dataset shape: {dataset.shape}')\n",
69+
"print(f'Dataset balance: {round(balance, 2)}')"
6070
]
6171
},
6272
{
@@ -333,38 +343,6 @@
333343
"embeddings.head(3)"
334344
]
335345
},
336-
{
337-
"cell_type": "markdown",
338-
"id": "833d229e",
339-
"metadata": {},
340-
"source": [
341-
"### Stratified k-fold"
342-
]
343-
},
344-
{
345-
"cell_type": "code",
346-
"execution_count": 50,
347-
"id": "28f62bfa",
348-
"metadata": {},
349-
"outputs": [
350-
{
351-
"data": {
352-
"text/plain": [
353-
"5"
354-
]
355-
},
356-
"execution_count": 50,
357-
"metadata": {},
358-
"output_type": "execute_result"
359-
}
360-
],
361-
"source": [
362-
"# Load stratified k folds\n",
363-
"kfolds = joblib.load(f'../data/task01/skfolds/folds.save')\n",
364-
"\n",
365-
"len(kfolds)"
366-
]
367-
},
368346
{
369347
"cell_type": "markdown",
370348
"id": "580a730b-e6ea-4be1-9d2f-d54475fd3157",
@@ -375,24 +353,25 @@
375353
},
376354
{
377355
"cell_type": "code",
378-
"execution_count": 393,
356+
"execution_count": 5,
379357
"id": "96a7deaf-41a4-4313-a812-5b03d38046c3",
380358
"metadata": {},
381359
"outputs": [
382360
{
383-
"data": {
384-
"text/plain": [
385-
"(10460, 46)"
386-
]
387-
},
388-
"execution_count": 393,
389-
"metadata": {},
390-
"output_type": "execute_result"
361+
"name": "stdout",
362+
"output_type": "stream",
363+
"text": [
364+
"Dataset shape: (10460, 46)\n",
365+
"Dataset balance: 0.38\n"
366+
]
391367
}
392368
],
393369
"source": [
394370
"statistics = pd.read_csv('../data/task02/features/statistics.csv', index_col=[0])\n",
395-
"statistics.shape"
371+
"balance = get_balance(statistics)\n",
372+
"\n",
373+
"print(f'Dataset shape: {statistics.shape}')\n",
374+
"print(f'Dataset balance: {round(balance, 2)}')"
396375
]
397376
},
398377
{
@@ -437,18 +416,25 @@
437416
"word2vec.train(corpus)\n",
438417
"# Update the progress bar object and set the postfix message\n",
439418
"pbar.update(1)\n",
419+
"\n",
440420
"for key in keys[1:]:\n",
441421
" corpus = [x.split(',') for x in _corpus[key].split('\\n')]\n",
442422
" # Update the pre-trained model on the current day\n",
443423
" word2vec.update(corpus)\n",
444424
" # Update the progress bar object and set the postfix message\n",
445425
" pbar.update(1)\n",
426+
"\n",
446427
"# Close the progressbar\n",
447428
"pbar.close()\n",
429+
"\n",
448430
"# Retrieve the final updated embeddings\n",
449431
"embeddings = word2vec.get_embeddings()\n",
450432
"embeddings = embeddings.reindex(statistics.index)\n",
451433
"embeddings['label'] = statistics.label\n",
434+
"\n",
435+
"print(embeddings.shape) # Get the vocabulary size and the embeddings size\n",
436+
"embeddings.head(3)\n",
437+
"\n",
452438
"if not DEMO:\n",
453439
" embeddings.to_csv('../data/task02/features/ipaddress.csv')"
454440
]
@@ -495,12 +481,14 @@
495481
"word2vec.train(corpus)\n",
496482
"# Update the progress bar object and set the postfix message\n",
497483
"pbar.update(1)\n",
484+
"\n",
498485
"for key in keys[1:]:\n",
499486
" corpus = [x.split(',') for x in _corpus[key].split('\\n')]\n",
500487
" # Update the pre-trained model on the current day\n",
501488
" word2vec.update(corpus)\n",
502489
" # Update the progress bar object and set the postfix message\n",
503490
" pbar.update(1)\n",
491+
"\n",
504492
"# Close the progressbar\n",
505493
"pbar.close()\n",
506494
"# Retrieve the final updated embeddings\n",
@@ -534,42 +522,12 @@
534522
"ports_embeddings = pd.DataFrame(ports_embeddings).rename(columns={0:'index'}).set_index('index').reindex(statistics.index)\n",
535523
"ports_embeddings['label'] = statistics.label\n",
536524
"\n",
525+
"print(ports_embeddings.shape) # Get the vocabulary size and the embeddings size\n",
526+
"ports_embeddings.head(3)\n",
537527
"if not DEMO:\n",
538528
" ports_embeddings.to_csv('../data/task02/features/ports.csv')"
539529
]
540530
},
541-
{
542-
"cell_type": "markdown",
543-
"id": "302a5af4-ef36-43e0-add1-cc89211401e7",
544-
"metadata": {},
545-
"source": [
546-
"### Stratified k-fold **REDO**"
547-
]
548-
},
549-
{
550-
"cell_type": "code",
551-
"execution_count": 399,
552-
"id": "f798d6ae-cb5f-4a0b-a858-a619f380161d",
553-
"metadata": {},
554-
"outputs": [
555-
{
556-
"data": {
557-
"text/plain": [
558-
"5"
559-
]
560-
},
561-
"execution_count": 399,
562-
"metadata": {},
563-
"output_type": "execute_result"
564-
}
565-
],
566-
"source": [
567-
"# Load stratified k folds\n",
568-
"kfolds = joblib.load(f'../data/task02/skfolds/folds.save')\n",
569-
"\n",
570-
"len(kfolds)"
571-
]
572-
},
573531
{
574532
"cell_type": "markdown",
575533
"id": "2ce46437-ea67-4b1d-829b-70b41b957028",
@@ -582,25 +540,25 @@
582540
},
583541
{
584542
"cell_type": "code",
585-
"execution_count": 51,
543+
"execution_count": 6,
586544
"id": "d56a19c7-3951-404a-919c-dec7df2a57fd",
587545
"metadata": {},
588546
"outputs": [
589547
{
590-
"data": {
591-
"text/plain": [
592-
"(609, 234)"
593-
]
594-
},
595-
"execution_count": 51,
596-
"metadata": {},
597-
"output_type": "execute_result"
548+
"name": "stdout",
549+
"output_type": "stream",
550+
"text": [
551+
"Dataset shape: (609, 234)\n",
552+
"Dataset balance: 0.82\n"
553+
]
598554
}
599555
],
600556
"source": [
601557
"dataset = pd.read_csv('../data/task03/raw_data/iscxvpn2016.csv', index_col=[0])\n",
558+
"balance = get_balance(dataset)\n",
602559
"\n",
603-
"dataset.shape"
560+
"print(f'Dataset shape: {dataset.shape}')\n",
561+
"print(f'Dataset balance: {round(balance, 2)}')"
604562
]
605563
},
606564
{
@@ -864,38 +822,6 @@
864822
"print(embeddings.shape) # Get the vocabulary size and the embeddings size\n",
865823
"embeddings.head(3)"
866824
]
867-
},
868-
{
869-
"cell_type": "markdown",
870-
"id": "55c3e8a3-811b-4727-9eb4-447fe5487fe5",
871-
"metadata": {},
872-
"source": [
873-
"### Stratified k-fold"
874-
]
875-
},
876-
{
877-
"cell_type": "code",
878-
"execution_count": 55,
879-
"id": "3b4dac46-3e21-451a-ba1f-c310f5712979",
880-
"metadata": {},
881-
"outputs": [
882-
{
883-
"data": {
884-
"text/plain": [
885-
"5"
886-
]
887-
},
888-
"execution_count": 55,
889-
"metadata": {},
890-
"output_type": "execute_result"
891-
}
892-
],
893-
"source": [
894-
"# Load stratified k folds\n",
895-
"kfolds = joblib.load(f'../data/task03/skfolds/folds.save')\n",
896-
"\n",
897-
"len(kfolds)"
898-
]
899825
}
900826
],
901827
"metadata": {

0 commit comments

Comments
 (0)