|
10 | 10 | },
|
11 | 11 | {
|
12 | 12 | "cell_type": "code",
|
13 |
| - "execution_count": 392, |
| 13 | + "execution_count": 1, |
14 | 14 | "id": "09238324",
|
15 | 15 | "metadata": {},
|
16 |
| - "outputs": [], |
| 16 | + "outputs": [ |
| 17 | + { |
| 18 | + "name": "stderr", |
| 19 | + "output_type": "stream", |
| 20 | + "text": [ |
| 21 | + "2023-01-09 10:08:37.637796: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA\n", |
| 22 | + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" |
| 23 | + ] |
| 24 | + } |
| 25 | + ], |
17 | 26 | "source": [
|
18 | 27 | "# Make mltoolbox and utils reachable from this folder\n",
|
19 | 28 | "import sys\n",
|
20 | 29 | "sys.path.append('../')\n",
|
21 | 30 | "\n",
|
22 | 31 | "from mltoolbox.representation import iWord2Vec\n",
|
| 32 | + "from utils import get_balance\n", |
23 | 33 | "import pandas as pd\n",
|
24 | 34 | "import joblib\n",
|
25 | 35 | "\n",
|
26 |
| - "DEMO = True" |
| 36 | + "DEMO = False" |
27 | 37 | ]
|
28 | 38 | },
|
29 | 39 | {
|
|
38 | 48 | },
|
39 | 49 | {
|
40 | 50 | "cell_type": "code",
|
41 |
| - "execution_count": 47, |
| 51 | + "execution_count": 3, |
42 | 52 | "id": "c550eae3",
|
43 | 53 | "metadata": {},
|
44 | 54 | "outputs": [
|
45 | 55 | {
|
46 |
| - "data": { |
47 |
| - "text/plain": [ |
48 |
| - "(44045, 234)" |
49 |
| - ] |
50 |
| - }, |
51 |
| - "execution_count": 47, |
52 |
| - "metadata": {}, |
53 |
| - "output_type": "execute_result" |
| 56 | + "name": "stdout", |
| 57 | + "output_type": "stream", |
| 58 | + "text": [ |
| 59 | + "Dataset shape: (44045, 234)\n", |
| 60 | + "Dataset balance: 0.94\n" |
| 61 | + ] |
54 | 62 | }
|
55 | 63 | ],
|
56 | 64 | "source": [
|
57 | 65 | "dataset = pd.read_csv('../data/task01/raw_data/mirage.csv', index_col=[0])\n",
|
| 66 | + "balance = get_balance(dataset)\n", |
58 | 67 | "\n",
|
59 |
| - "dataset.shape" |
| 68 | + "print(f'Dataset shape: {dataset.shape}')\n", |
| 69 | + "print(f'Dataset balance: {round(balance, 2)}')" |
60 | 70 | ]
|
61 | 71 | },
|
62 | 72 | {
|
|
333 | 343 | "embeddings.head(3)"
|
334 | 344 | ]
|
335 | 345 | },
|
336 |
| - { |
337 |
| - "cell_type": "markdown", |
338 |
| - "id": "833d229e", |
339 |
| - "metadata": {}, |
340 |
| - "source": [ |
341 |
| - "### Stratified k-fold" |
342 |
| - ] |
343 |
| - }, |
344 |
| - { |
345 |
| - "cell_type": "code", |
346 |
| - "execution_count": 50, |
347 |
| - "id": "28f62bfa", |
348 |
| - "metadata": {}, |
349 |
| - "outputs": [ |
350 |
| - { |
351 |
| - "data": { |
352 |
| - "text/plain": [ |
353 |
| - "5" |
354 |
| - ] |
355 |
| - }, |
356 |
| - "execution_count": 50, |
357 |
| - "metadata": {}, |
358 |
| - "output_type": "execute_result" |
359 |
| - } |
360 |
| - ], |
361 |
| - "source": [ |
362 |
| - "# Load stratified k folds\n", |
363 |
| - "kfolds = joblib.load(f'../data/task01/skfolds/folds.save')\n", |
364 |
| - "\n", |
365 |
| - "len(kfolds)" |
366 |
| - ] |
367 |
| - }, |
368 | 346 | {
|
369 | 347 | "cell_type": "markdown",
|
370 | 348 | "id": "580a730b-e6ea-4be1-9d2f-d54475fd3157",
|
|
375 | 353 | },
|
376 | 354 | {
|
377 | 355 | "cell_type": "code",
|
378 |
| - "execution_count": 393, |
| 356 | + "execution_count": 5, |
379 | 357 | "id": "96a7deaf-41a4-4313-a812-5b03d38046c3",
|
380 | 358 | "metadata": {},
|
381 | 359 | "outputs": [
|
382 | 360 | {
|
383 |
| - "data": { |
384 |
| - "text/plain": [ |
385 |
| - "(10460, 46)" |
386 |
| - ] |
387 |
| - }, |
388 |
| - "execution_count": 393, |
389 |
| - "metadata": {}, |
390 |
| - "output_type": "execute_result" |
| 361 | + "name": "stdout", |
| 362 | + "output_type": "stream", |
| 363 | + "text": [ |
| 364 | + "Dataset shape: (10460, 46)\n", |
| 365 | + "Dataset balance: 0.38\n" |
| 366 | + ] |
391 | 367 | }
|
392 | 368 | ],
|
393 | 369 | "source": [
|
394 | 370 | "statistics = pd.read_csv('../data/task02/features/statistics.csv', index_col=[0])\n",
|
395 |
| - "statistics.shape" |
| 371 | + "balance = get_balance(statistics)\n", |
| 372 | + "\n", |
| 373 | + "print(f'Dataset shape: {statistics.shape}')\n", |
| 374 | + "print(f'Dataset balance: {round(balance, 2)}')" |
396 | 375 | ]
|
397 | 376 | },
|
398 | 377 | {
|
|
437 | 416 | "word2vec.train(corpus)\n",
|
438 | 417 | "# Update the progress bar object and set the postfix message\n",
|
439 | 418 | "pbar.update(1)\n",
|
| 419 | + "\n", |
440 | 420 | "for key in keys[1:]:\n",
|
441 | 421 | " corpus = [x.split(',') for x in _corpus[key].split('\\n')]\n",
|
442 | 422 | " # Update the pre-trained model on the current day\n",
|
443 | 423 | " word2vec.update(corpus)\n",
|
444 | 424 | " # Update the progress bar object and set the postfix message\n",
|
445 | 425 | " pbar.update(1)\n",
|
| 426 | + "\n", |
446 | 427 | "# Close the progressbar\n",
|
447 | 428 | "pbar.close()\n",
|
| 429 | + "\n", |
448 | 430 | "# Retrieve the final updated embeddings\n",
|
449 | 431 | "embeddings = word2vec.get_embeddings()\n",
|
450 | 432 | "embeddings = embeddings.reindex(statistics.index)\n",
|
451 | 433 | "embeddings['label'] = statistics.label\n",
|
| 434 | + "\n", |
| 435 | + "print(embeddings.shape) # Get the vocabulary size and the embeddings size\n", |
| 436 | + "embeddings.head(3)\n", |
| 437 | + "\n", |
452 | 438 | "if not DEMO:\n",
|
453 | 439 | " embeddings.to_csv('../data/task02/features/ipaddress.csv')"
|
454 | 440 | ]
|
|
495 | 481 | "word2vec.train(corpus)\n",
|
496 | 482 | "# Update the progress bar object and set the postfix message\n",
|
497 | 483 | "pbar.update(1)\n",
|
| 484 | + "\n", |
498 | 485 | "for key in keys[1:]:\n",
|
499 | 486 | " corpus = [x.split(',') for x in _corpus[key].split('\\n')]\n",
|
500 | 487 | " # Update the pre-trained model on the current day\n",
|
501 | 488 | " word2vec.update(corpus)\n",
|
502 | 489 | " # Update the progress bar object and set the postfix message\n",
|
503 | 490 | " pbar.update(1)\n",
|
| 491 | + "\n", |
504 | 492 | "# Close the progressbar\n",
|
505 | 493 | "pbar.close()\n",
|
506 | 494 | "# Retrieve the final updated embeddings\n",
|
|
534 | 522 | "ports_embeddings = pd.DataFrame(ports_embeddings).rename(columns={0:'index'}).set_index('index').reindex(statistics.index)\n",
|
535 | 523 | "ports_embeddings['label'] = statistics.label\n",
|
536 | 524 | "\n",
|
| 525 | + "print(ports_embeddings.shape) # Get the vocabulary size and the embeddings size\n", |
| 526 | + "ports_embeddings.head(3)\n", |
537 | 527 | "if not DEMO:\n",
|
538 | 528 | " ports_embeddings.to_csv('../data/task02/features/ports.csv')"
|
539 | 529 | ]
|
540 | 530 | },
|
541 |
| - { |
542 |
| - "cell_type": "markdown", |
543 |
| - "id": "302a5af4-ef36-43e0-add1-cc89211401e7", |
544 |
| - "metadata": {}, |
545 |
| - "source": [ |
546 |
| - "### Stratified k-fold **REDO**" |
547 |
| - ] |
548 |
| - }, |
549 |
| - { |
550 |
| - "cell_type": "code", |
551 |
| - "execution_count": 399, |
552 |
| - "id": "f798d6ae-cb5f-4a0b-a858-a619f380161d", |
553 |
| - "metadata": {}, |
554 |
| - "outputs": [ |
555 |
| - { |
556 |
| - "data": { |
557 |
| - "text/plain": [ |
558 |
| - "5" |
559 |
| - ] |
560 |
| - }, |
561 |
| - "execution_count": 399, |
562 |
| - "metadata": {}, |
563 |
| - "output_type": "execute_result" |
564 |
| - } |
565 |
| - ], |
566 |
| - "source": [ |
567 |
| - "# Load stratified k folds\n", |
568 |
| - "kfolds = joblib.load(f'../data/task02/skfolds/folds.save')\n", |
569 |
| - "\n", |
570 |
| - "len(kfolds)" |
571 |
| - ] |
572 |
| - }, |
573 | 531 | {
|
574 | 532 | "cell_type": "markdown",
|
575 | 533 | "id": "2ce46437-ea67-4b1d-829b-70b41b957028",
|
|
582 | 540 | },
|
583 | 541 | {
|
584 | 542 | "cell_type": "code",
|
585 |
| - "execution_count": 51, |
| 543 | + "execution_count": 6, |
586 | 544 | "id": "d56a19c7-3951-404a-919c-dec7df2a57fd",
|
587 | 545 | "metadata": {},
|
588 | 546 | "outputs": [
|
589 | 547 | {
|
590 |
| - "data": { |
591 |
| - "text/plain": [ |
592 |
| - "(609, 234)" |
593 |
| - ] |
594 |
| - }, |
595 |
| - "execution_count": 51, |
596 |
| - "metadata": {}, |
597 |
| - "output_type": "execute_result" |
| 548 | + "name": "stdout", |
| 549 | + "output_type": "stream", |
| 550 | + "text": [ |
| 551 | + "Dataset shape: (609, 234)\n", |
| 552 | + "Dataset balance: 0.82\n" |
| 553 | + ] |
598 | 554 | }
|
599 | 555 | ],
|
600 | 556 | "source": [
|
601 | 557 | "dataset = pd.read_csv('../data/task03/raw_data/iscxvpn2016.csv', index_col=[0])\n",
|
| 558 | + "balance = get_balance(dataset)\n", |
602 | 559 | "\n",
|
603 |
| - "dataset.shape" |
| 560 | + "print(f'Dataset shape: {dataset.shape}')\n", |
| 561 | + "print(f'Dataset balance: {round(balance, 2)}')" |
604 | 562 | ]
|
605 | 563 | },
|
606 | 564 | {
|
|
864 | 822 | "print(embeddings.shape) # Get the vocabulary size and the embeddings size\n",
|
865 | 823 | "embeddings.head(3)"
|
866 | 824 | ]
|
867 |
| - }, |
868 |
| - { |
869 |
| - "cell_type": "markdown", |
870 |
| - "id": "55c3e8a3-811b-4727-9eb4-447fe5487fe5", |
871 |
| - "metadata": {}, |
872 |
| - "source": [ |
873 |
| - "### Stratified k-fold" |
874 |
| - ] |
875 |
| - }, |
876 |
| - { |
877 |
| - "cell_type": "code", |
878 |
| - "execution_count": 55, |
879 |
| - "id": "3b4dac46-3e21-451a-ba1f-c310f5712979", |
880 |
| - "metadata": {}, |
881 |
| - "outputs": [ |
882 |
| - { |
883 |
| - "data": { |
884 |
| - "text/plain": [ |
885 |
| - "5" |
886 |
| - ] |
887 |
| - }, |
888 |
| - "execution_count": 55, |
889 |
| - "metadata": {}, |
890 |
| - "output_type": "execute_result" |
891 |
| - } |
892 |
| - ], |
893 |
| - "source": [ |
894 |
| - "# Load stratified k folds\n", |
895 |
| - "kfolds = joblib.load(f'../data/task03/skfolds/folds.save')\n", |
896 |
| - "\n", |
897 |
| - "len(kfolds)" |
898 |
| - ] |
899 | 825 | }
|
900 | 826 | ],
|
901 | 827 | "metadata": {
|
|
0 commit comments