SmartData-Polito
diff --git a/‎README.md
+3-28 b/‎README.md
+3-28
diff --git a/‎notebooks/00-characterization.ipynb
+55-129 b/‎notebooks/00-characterization.ipynb
+55-129
@@ -195,37 +195,12 @@ If you use this code or data in your research, please cite our paper:
 
 ## **Todo**
 - Notebooks:
-    - [x] Task01
-        - [x] Codes
-        - [x] Comments
-        - [x] Markdowns
-    - [x] Task02
-        - [x] Codes
-        - [x] Comments
-        - [x] Intro-Markdowns
-    - [x] Task03
-        - [x] Codes
-        - [x] Comments
-        - [x] Intro-Markdowns
-    - [x] GridSearch
-        - [x] Codes
-        - [x] Comments
-        - [x] Intro-Markdowns
     - [ ] Finalization and visualization
-    - [ ] Starting datasets and characterization
-        - [x] Datasets
-        - [ ] Dataset balancing
-        - [ ] Markdowns
-        - [ ] Comments and functions
-        - [ ] Stratified k fold
         - [ ] t-SNE - task01
-- Scripts
-    - [x] Training task01
-    - [x] Training task02
-    - [x] Training task03
-    - [x] Training GridSearch
 - Documentation
     - [x] README
-    - [x] Raw data folder on cluster
+    - [ ] Notebooks
+        - [ ] Starting datasets and characterization
+        - [ ] Finalization and visualization
     - [ ] References to the datasets and papers
     - [ ] More detailed instructions on how to download and set up experiments (once agreed)
@@ -10,20 +10,30 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 392,
+   "execution_count": 1,
    "id": "09238324",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2023-01-09 10:08:37.637796: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA\n",
+      "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
+     ]
+    }
+   ],
    "source": [
     "# Make mltoolbox and utils reachable from this folder\n",
     "import sys\n",
     "sys.path.append('../')\n",
     "\n",
     "from mltoolbox.representation import iWord2Vec\n",
+    "from utils import get_balance\n",
     "import pandas as pd\n",
     "import joblib\n",
     "\n",
-    "DEMO = True"
+    "DEMO = False"
    ]
   },
   {
@@ -38,25 +48,25 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 47,
+   "execution_count": 3,
    "id": "c550eae3",
    "metadata": {},
    "outputs": [
     {
-     "data": {
-      "text/plain": [
-       "(44045, 234)"
-      ]
-     },
-     "execution_count": 47,
-     "metadata": {},
-     "output_type": "execute_result"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dataset shape: (44045, 234)\n",
+      "Dataset balance: 0.94\n"
+     ]
     }
    ],
    "source": [
     "dataset = pd.read_csv('../data/task01/raw_data/mirage.csv', index_col=[0])\n",
+    "balance = get_balance(dataset)\n",
     "\n",
-    "dataset.shape"
+    "print(f'Dataset shape: {dataset.shape}')\n",
+    "print(f'Dataset balance: {round(balance, 2)}')"
    ]
   },
   {
@@ -333,38 +343,6 @@
     "embeddings.head(3)"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "id": "833d229e",
-   "metadata": {},
-   "source": [
-    "### Stratified k-fold"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 50,
-   "id": "28f62bfa",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "5"
-      ]
-     },
-     "execution_count": 50,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Load stratified k folds\n",
-    "kfolds = joblib.load(f'../data/task01/skfolds/folds.save')\n",
-    "\n",
-    "len(kfolds)"
-   ]
-  },
   {
    "cell_type": "markdown",
    "id": "580a730b-e6ea-4be1-9d2f-d54475fd3157",
@@ -375,24 +353,25 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 393,
+   "execution_count": 5,
    "id": "96a7deaf-41a4-4313-a812-5b03d38046c3",
    "metadata": {},
    "outputs": [
     {
-     "data": {
-      "text/plain": [
-       "(10460, 46)"
-      ]
-     },
-     "execution_count": 393,
-     "metadata": {},
-     "output_type": "execute_result"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dataset shape: (10460, 46)\n",
+      "Dataset balance: 0.38\n"
+     ]
     }
    ],
    "source": [
     "statistics = pd.read_csv('../data/task02/features/statistics.csv', index_col=[0])\n",
-    "statistics.shape"
+    "balance = get_balance(statistics)\n",
+    "\n",
+    "print(f'Dataset shape: {statistics.shape}')\n",
+    "print(f'Dataset balance: {round(balance, 2)}')"
    ]
   },
   {
@@ -437,18 +416,25 @@
     "word2vec.train(corpus)\n",
     "# Update the progress bar object and set the postfix message\n",
     "pbar.update(1)\n",
+    "\n",
     "for key in keys[1:]:\n",
     "    corpus = [x.split(',') for x in _corpus[key].split('\\n')]\n",
     "    # Update the pre-trained model on the current day\n",
     "    word2vec.update(corpus)\n",
     "    # Update the progress bar object and set the postfix message\n",
     "    pbar.update(1)\n",
+    "\n",
     "# Close the progressbar\n",
     "pbar.close()\n",
+    "\n",
     "# Retrieve the final updated embeddings\n",
     "embeddings = word2vec.get_embeddings()\n",
     "embeddings = embeddings.reindex(statistics.index)\n",
     "embeddings['label'] = statistics.label\n",
+    "\n",
+    "print(embeddings.shape) # Get the vocabulary size and the embeddings size\n",
+    "embeddings.head(3)\n",
+    "\n",
     "if not DEMO:\n",
     "    embeddings.to_csv('../data/task02/features/ipaddress.csv')"
    ]
@@ -495,12 +481,14 @@
     "word2vec.train(corpus)\n",
     "# Update the progress bar object and set the postfix message\n",
     "pbar.update(1)\n",
+    "\n",
     "for key in keys[1:]:\n",
     "    corpus = [x.split(',') for x in _corpus[key].split('\\n')]\n",
     "    # Update the pre-trained model on the current day\n",
     "    word2vec.update(corpus)\n",
     "    # Update the progress bar object and set the postfix message\n",
     "    pbar.update(1)\n",
+    "\n",
     "# Close the progressbar\n",
     "pbar.close()\n",
     "# Retrieve the final updated embeddings\n",
@@ -534,42 +522,12 @@
     "ports_embeddings = pd.DataFrame(ports_embeddings).rename(columns={0:'index'}).set_index('index').reindex(statistics.index)\n",
     "ports_embeddings['label'] = statistics.label\n",
     "\n",
+    "print(ports_embeddings.shape) # Get the vocabulary size and the embeddings size\n",
+    "ports_embeddings.head(3)\n",
     "if not DEMO:\n",
     "    ports_embeddings.to_csv('../data/task02/features/ports.csv')"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "id": "302a5af4-ef36-43e0-add1-cc89211401e7",
-   "metadata": {},
-   "source": [
-    "### Stratified k-fold **REDO**"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 399,
-   "id": "f798d6ae-cb5f-4a0b-a858-a619f380161d",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "5"
-      ]
-     },
-     "execution_count": 399,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Load stratified k folds\n",
-    "kfolds = joblib.load(f'../data/task02/skfolds/folds.save')\n",
-    "\n",
-    "len(kfolds)"
-   ]
-  },
   {
    "cell_type": "markdown",
    "id": "2ce46437-ea67-4b1d-829b-70b41b957028",
@@ -582,25 +540,25 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 51,
+   "execution_count": 6,
    "id": "d56a19c7-3951-404a-919c-dec7df2a57fd",
    "metadata": {},
    "outputs": [
     {
-     "data": {
-      "text/plain": [
-       "(609, 234)"
-      ]
-     },
-     "execution_count": 51,
-     "metadata": {},
-     "output_type": "execute_result"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dataset shape: (609, 234)\n",
+      "Dataset balance: 0.82\n"
+     ]
     }
    ],
    "source": [
     "dataset = pd.read_csv('../data/task03/raw_data/iscxvpn2016.csv', index_col=[0])\n",
+    "balance = get_balance(dataset)\n",
     "\n",
-    "dataset.shape"
+    "print(f'Dataset shape: {dataset.shape}')\n",
+    "print(f'Dataset balance: {round(balance, 2)}')"
    ]
   },
   {
@@ -864,38 +822,6 @@
     "print(embeddings.shape) # Get the vocabulary size and the embeddings size\n",
     "embeddings.head(3)"
    ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "55c3e8a3-811b-4727-9eb4-447fe5487fe5",
-   "metadata": {},
-   "source": [
-    "### Stratified k-fold"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 55,
-   "id": "3b4dac46-3e21-451a-ba1f-c310f5712979",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "5"
-      ]
-     },
-     "execution_count": 55,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Load stratified k folds\n",
-    "kfolds = joblib.load(f'../data/task03/skfolds/folds.save')\n",
-    "\n",
-    "len(kfolds)"
-   ]
   }
  ],
  "metadata": {