From 3070c023f49abc6fce93af36a6740cd8034a09ca Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 001/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 155 ++++++++++++++++++++-
 1 file changed, 152 insertions(+), 3 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index b3e98c5150..124ec61f91 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -5,6 +5,7 @@
 import json
 import datetime
 import traceback
+import sys
 
 from slips_files.common.imports import *
 from slips_files.core.evidence_structure.evidence import (
@@ -112,6 +113,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """
@@ -124,14 +260,18 @@ def process_features(self, dataset):
             for proto in to_discard:
                 dataset = dataset[dataset.proto != proto]
 
-            # For now, discard the ports
+            # For now, discard these
             to_drop = [
                 "appproto",
                 "daddr",
                 "saddr",
                 "ts",
                 "origstate",
-                "flow_type",
+                "type_",
+                "dir_",
+                "history",
+                "dbytes",
+                "dpkts",
                 "smac",
                 "dmac",
             ]
@@ -141,13 +281,22 @@ def process_features(self, dataset):
                 except ValueError:
                     pass
 
+            # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
+            # So transform here
+            #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
+            dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
+
+            #dataset.state = new_state_column
+
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(
-                r"(^.*NotEstablished.*$)", "0", regex=True
+                r"(^.*Not Established.*$)", "0", regex=True
             )
             dataset.state = dataset.state.str.replace(
                 r"(^.*Established.*$)", "1", regex=True
             )
+
+            # Convert categories to floats
             dataset.state = dataset.state.astype("float64")
 
             # Convert proto to categorical. For now we only have few states, so we can hardcode...

From df6e9196532d0ba050f4922a36ed1d2b1a2638b5 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:35:21 +0300
Subject: [PATCH 002/455] state_handler: split get_final_state_from_flags()
 into smaller functions

---
 slips_files/common/state_handler.py | 179 ++++++++++++++++++++++++++++
 1 file changed, 179 insertions(+)
 create mode 100644 slips_files/common/state_handler.py

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
new file mode 100644
index 0000000000..b671a09a28
--- /dev/null
+++ b/slips_files/common/state_handler.py
@@ -0,0 +1,179 @@
+from typing import Optional
+import sys
+import traceback
+
+
+def check_suricata_states(state) -> Optional[str]:
+    """
+    There are different states in which a flow can be.
+    Suricata distinguishes three flow-states for TCP and two for
+    UDP. For TCP,
+    these are: New, Established and Closed; for UDP, only New and
+    Established.
+    For each of these states Suricata can employ different timeouts.
+    """
+    if "new" in state or "established" in state:
+        return "Established"
+    elif "closed" in state:
+        return "Not Established"
+
+
+def check_zeek_states(state) -> Optional[str]:
+    # We have varius type of states depending on the type of flow.
+    # For Zeek
+    if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+        return "Not Established"
+    elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+        return "Established"
+
+
+def check_argus_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    suf = state.split("_")[1]
+    if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+        """
+        Examples:
+        SA_SA
+        SR_SA
+        FSRA_SA
+        SPA_SPA
+        SRA_SPA
+        FSA_FSA
+        FSA_FSPA
+        SAEC_SPA
+        SRPA_SPA
+        FSPA_SPA
+        FSRPA_SPA
+        FSPA_FSPA
+        FSRA_FSPA
+        SRAEC_SPA
+        FSPA_FSRPA
+        FSAEC_FSPA
+        FSRPA_FSPA
+        SRPAEC_SPA
+        FSPAEC_FSPA
+        SRPAEC_FSRPA
+        """
+        return "Established"
+    elif "PA" in pre and "PA" in suf:
+        # Typical flow that was reported in the middle
+        """
+        Examples:
+        PA_PA
+        FPA_FPA
+        """
+        return "Established"
+    elif "ECO" in pre:
+        return "ICMP Echo"
+    elif "ECR" in pre:
+        return "ICMP Reply"
+    elif "URH" in pre:
+        return "ICMP Host Unreachable"
+    elif "URP" in pre:
+        return "ICMP Port Unreachable"
+    else:
+        """
+        Examples:
+        S_RA
+        S_R
+        A_R
+        S_SA
+        SR_SA
+        FA_FA
+        SR_RA
+        SEC_RA
+        """
+        return "Not Established"
+
+
+def check_tcp_states(state, pkts) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "EST" in pre:
+        # TCP
+        return "Established"
+    elif "RST" in pre:
+        # TCP. When -z B is not used in argus, states are single words.
+        # Most connections are reset when finished and therefore are
+        # established.
+        # A connection can also be reset while not established, but we
+        # can't tell without -z b.
+        # So we use the number of packets as a heuristic. If <=3, it is
+        # not established because the OS retries 3 times.
+        return "Not Established" if int(pkts) <= 3 else "Established"
+    elif "FIN" in pre:
+        # TCP. When -z B is not used in argus, states are single words.
+        # Most connections are closed with FIN when finished and
+        # therefore are established.
+        # A connection can also be finished while not established, but we
+        # can't tell without -z b.
+        # So we use the number of packets as a heuristic. If <=3, it is
+        # not established because the OS retries 3 times.
+        return "Not Established" if int(pkts) <= 3 else "Established"
+    else:
+        """
+        Examples:
+        S_
+        FA_
+        PA_
+        FSA_
+        SEC_
+        SRPA_
+        """
+        return "Not Established"
+
+
+def check_udp_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "CON" in pre:
+        # UDP
+        return "Established"
+    elif "INT" in pre:
+        # UDP trying to connect: not precisely 'Not Established', but also
+        # not 'Established'. We consider it not established because there
+        # is no confirmation of what happened.
+        return "Not Established"
+
+
+def check_icmp_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "ECO" in pre:
+        # ICMP
+        return "Established"
+    elif "UNK" in pre:
+        # ICMP6 unknown upper layer
+        return "Established"
+
+
+def get_final_state_from_flags(self, state, pkts) -> str:
+    """
+    Analyze the flags given and return a summary of the state.
+    Should work with Argus and Bro flags
+    We receive the pakets to distinguish some Reset connections
+    """
+    try:
+        if state := check_suricata_states(state):
+            return state
+        if state := check_zeek_states(state):
+            return state
+        if state := check_argus_states(state):
+            return state
+    except IndexError:
+        # suf does not exist, which means that this is some ICMP or
+        # no response was sent for UDP or TCP
+        if state := check_icmp_states(state):
+            return state
+        if state := check_udp_states(state):
+            return state
+        if state := check_tcp_states(state, pkts):
+            return state
+
+        return "Not Established"
+
+    except Exception:
+        exception_line = sys.exc_info()[2].tb_lineno
+        self.print(
+            f"Error in get_final_state_from_flags() " f"line {exception_line}",
+            0,
+            1,
+        )
+        self.print(traceback.format_exc(), 0, 1)

From 1497681f7a9b2f7d20a1c1e570646ca3b2c2bdbc Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:48:16 +0300
Subject: [PATCH 003/455] state_handler: refactor get_final_state_from_flags()

---
 slips_files/common/state_handler.py | 67 +++++++++++++----------------
 1 file changed, 29 insertions(+), 38 deletions(-)

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
index b671a09a28..d0a05115bd 100644
--- a/slips_files/common/state_handler.py
+++ b/slips_files/common/state_handler.py
@@ -1,9 +1,7 @@
 from typing import Optional
-import sys
-import traceback
 
 
-def check_suricata_states(state) -> Optional[str]:
+def interpret_suricata_states(state) -> Optional[str]:
     """
     There are different states in which a flow can be.
     Suricata distinguishes three flow-states for TCP and two for
@@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_zeek_states(state) -> Optional[str]:
+def interpret_zeek_states(state) -> Optional[str]:
     # We have varius type of states depending on the type of flow.
     # For Zeek
     if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
@@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]:
         return "Established"
 
 
-def check_argus_states(state) -> Optional[str]:
+def interpret_argus_states(state) -> Optional[str]:
     pre = state.split("_")[0]
-    suf = state.split("_")[1]
+    try:
+        suf = state.split("_")[1]
+    except IndexError:
+        return
+
     if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
         """
         Examples:
@@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_tcp_states(state, pkts) -> Optional[str]:
+def interpret_tcp_states(state, pkts) -> Optional[str]:
     pre = state.split("_")[0]
     if "EST" in pre:
         # TCP
@@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]:
         return "Not Established"
 
 
-def check_udp_states(state) -> Optional[str]:
+def interpret_udp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "CON" in pre:
         # UDP
@@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_icmp_states(state) -> Optional[str]:
+def interpret_icmp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "ECO" in pre:
         # ICMP
@@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]:
         return "Established"
 
 
-def get_final_state_from_flags(self, state, pkts) -> str:
+def get_final_state_from_flags(state, pkts) -> str:
     """
-    Analyze the flags given and return a summary of the state.
-    Should work with Argus and Bro flags
-    We receive the pakets to distinguish some Reset connections
+    Converts the original flags from the flow, to a state that slips
+    understands
+    Works with Argus, suricata, and Bro flags
+    We receive the packets to distinguish some Reset connections
     """
-    try:
-        if state := check_suricata_states(state):
-            return state
-        if state := check_zeek_states(state):
-            return state
-        if state := check_argus_states(state):
-            return state
-    except IndexError:
-        # suf does not exist, which means that this is some ICMP or
-        # no response was sent for UDP or TCP
-        if state := check_icmp_states(state):
-            return state
-        if state := check_udp_states(state):
-            return state
-        if state := check_tcp_states(state, pkts):
-            return state
 
-        return "Not Established"
+    for interpreter in (
+        interpret_suricata_states,
+        interpret_zeek_states,
+        interpret_argus_states,
+        interpret_icmp_states,
+        interpret_udp_states,
+    ):
+        if interpreted_state := interpreter(state):
+            return interpreted_state
+
+    if interpreted_state := interpret_tcp_states(state, pkts):
+        return interpreted_state
 
-    except Exception:
-        exception_line = sys.exc_info()[2].tb_lineno
-        self.print(
-            f"Error in get_final_state_from_flags() " f"line {exception_line}",
-            0,
-            1,
-        )
-        self.print(traceback.format_exc(), 0, 1)
+    return "Not Established"

From 2eeb3ceb889625d179e07beb5e01e589d553ccf2 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 004/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py    | 169 ++----------------
 slips_files/core/database/database_manager.py |   3 -
 .../core/database/redis_db/profile_handler.py | 169 ++----------------
 .../core/database/sqlite_db/database.py       |   6 +-
 4 files changed, 41 insertions(+), 306 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 124ec61f91..c57a7a3581 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -5,9 +5,13 @@
 import json
 import datetime
 import traceback
-import sys
+import warnings
+
 
-from slips_files.common.imports import *
+from slips_files.common.state_handler import get_final_state_from_flags
+from slips_files.common.parsers.config_parser import ConfigParser
+from slips_files.common.slips_utils import utils
+from slips_files.common.abstracts.module import IModule
 from slips_files.core.evidence_structure.evidence import (
     Evidence,
     ProfileID,
@@ -29,8 +33,6 @@ def warn(*args, **kwargs):
     pass
 
 
-import warnings
-
 warnings.warn = warn
 
 
@@ -113,141 +115,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """
@@ -281,12 +148,17 @@ def process_features(self, dataset):
                 except ValueError:
                     pass
 
-            # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
+            # When flows are read from Slips sqlite,
+            # the state is not transformed to 'Established' or
+            # 'Not Established', it is still 'S0' and others
             # So transform here
-            #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
-            dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
-
-            #dataset.state = new_state_column
+            dataset["state"] = dataset.apply(
+                lambda row: get_final_state_from_flags(
+                    row["state"], row["pkts"]
+                ),
+                axis=1,
+            )
+            # dataset.state = new_state_column
 
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(
@@ -370,7 +242,7 @@ def process_flows(self):
         try:
             # We get all the flows so far
             # because this retraining happens in batches
-            flows = self.db.get_all_flows()
+            flows: list = self.db.get_all_flows()
 
             # Check how many different labels are in the DB
             # We need both normal and malware
@@ -464,7 +336,7 @@ def detect(self):
         """
         try:
             # Store the real label if there is one
-            y_flow = self.flow["label"]
+            # y_flow = self.flow["label"]
             # remove the real label column
             self.flow = self.flow.drop("label", axis=1)
             # remove the label predictions column of the other modules
@@ -568,13 +440,10 @@ def pre_main(self):
     def main(self):
         if msg := self.get_msg("new_flow"):
             data = msg["data"]
-            # Convert from json to dict
             data = json.loads(data)
-            profileid = data["profileid"]
+            # profileid = data["profileid"]
             twid = data["twid"]
-            # Get flow that is now in json format
             flow = data["flow"]
-            # Convert flow to a dict
             flow = json.loads(flow)
             # Convert the common fields to something that can
             # be interpreted
diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py
index fe7b02d046..f1ef1290c7 100644
--- a/slips_files/core/database/database_manager.py
+++ b/slips_files/core/database/database_manager.py
@@ -569,9 +569,6 @@ def add_out_dns(self, *args, **kwargs):
     def add_port(self, *args, **kwargs):
         return self.rdb.add_port(*args, **kwargs)
 
-    def get_final_state_from_flags(self, *args, **kwargs):
-        return self.rdb.get_final_state_from_flags(*args, **kwargs)
-
     def add_ips(self, *args, **kwargs):
         return self.rdb.add_ips(*args, **kwargs)
 
diff --git a/slips_files/core/database/redis_db/profile_handler.py b/slips_files/core/database/redis_db/profile_handler.py
index d785b51c98..23c23d3d42 100644
--- a/slips_files/core/database/redis_db/profile_handler.py
+++ b/slips_files/core/database/redis_db/profile_handler.py
@@ -14,7 +14,7 @@
 
 import redis
 import validators
-
+from slips_files.common.state_handler import get_final_state_from_flags
 from slips_files.common.abstracts.observer import IObservable
 from slips_files.core.output import Output
 
@@ -324,14 +324,15 @@ def add_port(
         state_hist = flow.state_hist if hasattr(flow, "state_hist") else ""
 
         if "^" in state_hist:
-            # The majority of the FP with horizontal port scan detection happen because a
-            # benign computer changes wifi, and many not established conns are redone,
-            # which look like a port scan to 10 webpages. To avoid this, we IGNORE all
-            # the flows that have in the history of flags (field history in zeek), the ^,
+            # The majority of the FP with horizontal port scan detection
+            # happen because a benign computer changes wifi, and many not
+            # established conns are redone, which look like a port scan to
+            # 10 webpages. To avoid this, we IGNORE all the flows that have
+            # in the history of flags (field history in zeek), the ^,
             # that means that the flow was swapped/flipped.
-            # The below key_name is only used by the portscan module to check for horizontal
-            # portscan, which means we can safely ignore it here and it won't affect the rest
-            # of slips
+            # The below key_name is only used by the portscan module to
+            # check for horizontal portscan, which means we can safely
+            # ignore it here and it won't affect the rest of slips
             return False
 
         # Choose which port to use based if we were asked Dst or Src
@@ -342,10 +343,10 @@ def add_port(
         ip_key = "srcips" if role == "Server" else "dstips"
 
         # Get the state. Established, NotEstablished
-        summaryState = self.get_final_state_from_flags(state, pkts)
+        state = get_final_state_from_flags(state, pkts)
 
         old_profileid_twid_data = self.get_data_from_profile_tw(
-            profileid, twid, port_type, summaryState, proto, role, "Ports"
+            profileid, twid, port_type, state, proto, role, "Ports"
         )
 
         try:
@@ -355,7 +356,8 @@ def add_port(
             port_data["totalpkt"] += pkts
             port_data["totalbytes"] += totbytes
 
-            # if there's a conn from this ip on this port, update the pkts of this conn
+            # if there's a conn from this ip on this port, update the pkts
+            # of this conn
             if ip in port_data[ip_key]:
                 port_data[ip_key][ip]["pkts"] += pkts
                 port_data[ip_key][ip]["spkts"] += spkts
@@ -386,145 +388,10 @@ def add_port(
         old_profileid_twid_data[port] = port_data
         data = json.dumps(old_profileid_twid_data)
         hash_key = f"{profileid}{self.separator}{twid}"
-        key_name = f"{port_type}Ports{role}{proto}{summaryState}"
+        key_name = f"{port_type}Ports{role}{proto}{state}"
         self.r.hset(hash_key, key_name, str(data))
         self.mark_profile_tw_as_modified(profileid, twid, starttime)
 
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in getFinalStateFromFlags() in database.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
-
     def get_data_from_profile_tw(
         self,
         profileid: str,
@@ -722,14 +589,14 @@ def add_ips(self, profileid, twid, flow, role):
         self.update_times_contacted(ip, direction, profileid, twid)
 
         # Get the state. Established, NotEstablished
-        summaryState = self.get_final_state_from_flags(flow.state, flow.pkts)
-        key_name = f"{direction}IPs{role}{flow.proto.upper()}{summaryState}"
+        state = get_final_state_from_flags(flow.state, flow.pkts)
+        key_name = f"{direction}IPs{role}{flow.proto.upper()}{state}"
         # Get the previous data about this key
         old_profileid_twid_data = self.get_data_from_profile_tw(
             profileid,
             twid,
             direction,
-            summaryState,
+            state,
             flow.proto,
             role,
             "IPs",
@@ -806,7 +673,7 @@ def add_flow(
         The profileid is the main profile that this flow is related too.
         : param new_profile_added : is set to True for everytime we see a new srcaddr
         """
-        summary_state = self.get_final_state_from_flags(flow.state, flow.pkts)
+        summary_state = get_final_state_from_flags(flow.state, flow.pkts)
         flow_dict = {
             "ts": flow.starttime,
             "dur": flow.dur,
diff --git a/slips_files/core/database/sqlite_db/database.py b/slips_files/core/database/sqlite_db/database.py
index 4792ea67c9..4dd52dbfc1 100644
--- a/slips_files/core/database/sqlite_db/database.py
+++ b/slips_files/core/database/sqlite_db/database.py
@@ -31,11 +31,13 @@ def connect(self):
         """
         db_newly_created = False
         if not os.path.exists(self._flows_db):
-            # db not created, mark it as first time accessing it so we can init tables once we connect
+            # db not created, mark it as first time accessing it so we can
+            # init tables once we connect
             db_newly_created = True
             self._init_db()
 
-        # you can get multithreaded access on a single pysqlite connection by passing "check_same_thread=False"
+        # you can get multithreaded access on a single pysqlite connection
+        # by passing "check_same_thread=False"
         self.conn = sqlite3.connect(
             self._flows_db, check_same_thread=False, timeout=20
         )

From f0eb12f0053b15d98e426a5459374d82f2919807 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:36:55 +0200
Subject: [PATCH 005/455] mlflow. Ignore UID column

---
 modules/flowmldetection/flowmldetection.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index c57a7a3581..e2aa1e0ee3 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -141,6 +141,7 @@ def process_features(self, dataset):
                 "dpkts",
                 "smac",
                 "dmac",
+                "uid",
             ]
             for field in to_drop:
                 try:

From 6bc8351cf891c12bf16f4d298a8f3f50c0506850 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 006/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 154 ++++++++++++++++++++-
 1 file changed, 150 insertions(+), 4 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 9e0aa772cd..c8226368c7 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -11,6 +11,7 @@
 import datetime
 import traceback
 import warnings
+import sys
 
 from slips_files.common.parsers.config_parser import ConfigParser
 from slips_files.common.slips_utils import utils
@@ -121,6 +122,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """
@@ -133,7 +269,7 @@ def process_features(self, dataset):
             for proto in to_discard:
                 dataset = dataset[dataset.proto != proto]
 
-            # For now, discard the ports
+            # For now, discard these
             to_drop = [
                 "appproto",
                 "daddr",
@@ -156,15 +292,25 @@ def process_features(self, dataset):
                 except (ValueError, KeyError):
                     pass
 
+            # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
+            # So transform here
+            #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
+            dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
+
+            #dataset.state = new_state_column
+
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(
-                r"(^.*NotEstablished.*$)", "0", regex=True
+                r"(^.*Not Established.*$)", "0", regex=True
             )
             dataset.state = dataset.state.str.replace(
                 r"(^.*Established.*$)", "1", regex=True
             )
-            # Convert proto to categorical. For now we only have few states,
-            # so we can hardcode...
+
+            # Convert categories to floats
+            dataset.state = dataset.state.astype("float64")
+
+            # Convert proto to categorical. For now we only have few states, so we can hardcode...
             # We dont use the data to create categories because in testing mode
             # we dont see all the protocols
             # Also we dont store the Categorizer because the user can retrain

From 5489ab209a6c96f03f8afd73c7ce7f31a78382f2 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:35:21 +0300
Subject: [PATCH 007/455] state_handler: split get_final_state_from_flags()
 into smaller functions

---
 slips_files/common/state_handler.py | 179 ++++++++++++++++++++++++++++
 1 file changed, 179 insertions(+)
 create mode 100644 slips_files/common/state_handler.py

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
new file mode 100644
index 0000000000..b671a09a28
--- /dev/null
+++ b/slips_files/common/state_handler.py
@@ -0,0 +1,179 @@
+from typing import Optional
+import sys
+import traceback
+
+
+def check_suricata_states(state) -> Optional[str]:
+    """
+    There are different states in which a flow can be.
+    Suricata distinguishes three flow-states for TCP and two for
+     UDP. For TCP,
+    these are: New, Established and Closed,for UDP only new and
+    established.
+    For each of these states Suricata can employ different timeouts.
+    """
+    if "new" in state or "established" in state:
+        return "Established"
+    elif "closed" in state:
+        return "Not Established"
+
+
+def check_zeek_states(state) -> Optional[str]:
+    # We have varius type of states depending on the type of flow.
+    # For Zeek
+    if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+        return "Not Established"
+    elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+        return "Established"
+
+
+def check_argus_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    suf = state.split("_")[1]
+    if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+        """
+        Examples:
+        SA_SA
+        SR_SA
+        FSRA_SA
+        SPA_SPA
+        SRA_SPA
+        FSA_FSA
+        FSA_FSPA
+        SAEC_SPA
+        SRPA_SPA
+        FSPA_SPA
+        FSRPA_SPA
+        FSPA_FSPA
+        FSRA_FSPA
+        SRAEC_SPA
+        FSPA_FSRPA
+        FSAEC_FSPA
+        FSRPA_FSPA
+        SRPAEC_SPA
+        FSPAEC_FSPA
+        SRPAEC_FSRPA
+        """
+        return "Established"
+    elif "PA" in pre and "PA" in suf:
+        # Tipical flow that was reported in the middle
+        """
+        Examples:
+        PA_PA
+        FPA_FPA
+        """
+        return "Established"
+    elif "ECO" in pre:
+        return "ICMP Echo"
+    elif "ECR" in pre:
+        return "ICMP Reply"
+    elif "URH" in pre:
+        return "ICMP Host Unreachable"
+    elif "URP" in pre:
+        return "ICMP Port Unreachable"
+    else:
+        """
+        Examples:
+        S_RA
+        S_R
+        A_R
+        S_SA
+        SR_SA
+        FA_FA
+        SR_RA
+        SEC_RA
+        """
+        return "Not Established"
+
+
+def check_tcp_states(state, pkts) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "EST" in pre:
+        # TCP
+        return "Established"
+    elif "RST" in pre:
+        # TCP. When -z B is not used in argus, states are single words.
+        # Most connections are reseted when finished and therefore are
+        # established
+        # It can happen that is reseted being not established, but we
+        # can't tell without -z b.
+        # So we use as heuristic the amount of packets. If <=3, then is
+        # not established because the OS retries 3 times.
+        return "Not Established" if int(pkts) <= 3 else "Established"
+    elif "FIN" in pre:
+        # TCP. When -z B is not used in argus, states are single words.
+        # Most connections are finished with FIN when finished and
+        # therefore are established
+        # It can happen that is finished being not established, but we
+        # can't tell without -z b.
+        # So we use as heuristic the amount of packets. If <=3, then is
+        # not established because the OS retries 3 times.
+        return "Not Established" if int(pkts) <= 3 else "Established"
+    else:
+        """
+        Examples:
+        S_
+        FA_
+        PA_
+        FSA_
+        SEC_
+        SRPA_
+        """
+        return "Not Established"
+
+
+def check_udp_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "CON" in pre:
+        # UDP
+        return "Established"
+    elif "INT" in pre:
+        # UDP trying to connect, NOT preciselly not established but also
+        # NOT 'Established'. So we considered not established because there
+        # is no confirmation of what happened.
+        return "Not Established"
+
+
+def check_icmp_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "ECO" in pre:
+        # ICMP
+        return "Established"
+    elif "UNK" in pre:
+        # ICMP6 unknown upper layer
+        return "Established"
+
+
+def get_final_state_from_flags(self, state, pkts) -> str:
+    """
+    Analyze the flags given and return a summary of the state.
+    Should work with Argus and Bro flags
+    We receive the pakets to distinguish some Reset connections
+    """
+    try:
+        if state := check_suricata_states(state):
+            return state
+        if state := check_zeek_states(state):
+            return state
+        if state := check_argus_states(state):
+            return state
+    except IndexError:
+        # suf does not exist, which means that this is some ICMP or
+        # no response was sent for UDP or TCP
+        if state := check_icmp_states(state):
+            return state
+        if state := check_udp_states(state):
+            return state
+        if state := check_tcp_states(state, pkts):
+            return state
+
+        return "Not Established"
+
+    except Exception:
+        exception_line = sys.exc_info()[2].tb_lineno
+        self.print(
+            f"Error in get_final_state_from_flags() " f"line {exception_line}",
+            0,
+            1,
+        )
+        self.print(traceback.format_exc(), 0, 1)

From 3776649a496c4b2b40962752b19f961e047f21bd Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:48:16 +0300
Subject: [PATCH 008/455] state_handler: refactor get_final_state_from_flags()

---
 slips_files/common/state_handler.py | 67 +++++++++++++----------------
 1 file changed, 29 insertions(+), 38 deletions(-)

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
index b671a09a28..d0a05115bd 100644
--- a/slips_files/common/state_handler.py
+++ b/slips_files/common/state_handler.py
@@ -1,9 +1,7 @@
 from typing import Optional
-import sys
-import traceback
 
 
-def check_suricata_states(state) -> Optional[str]:
+def interpret_suricata_states(state) -> Optional[str]:
     """
     There are different states in which a flow can be.
     Suricata distinguishes three flow-states for TCP and two for
@@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_zeek_states(state) -> Optional[str]:
+def interpret_zeek_states(state) -> Optional[str]:
     # We have varius type of states depending on the type of flow.
     # For Zeek
     if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
@@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]:
         return "Established"
 
 
-def check_argus_states(state) -> Optional[str]:
+def interpret_argus_states(state) -> Optional[str]:
     pre = state.split("_")[0]
-    suf = state.split("_")[1]
+    try:
+        suf = state.split("_")[1]
+    except IndexError:
+        return
+
     if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
         """
         Examples:
@@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_tcp_states(state, pkts) -> Optional[str]:
+def interpret_tcp_states(state, pkts) -> Optional[str]:
     pre = state.split("_")[0]
     if "EST" in pre:
         # TCP
@@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]:
         return "Not Established"
 
 
-def check_udp_states(state) -> Optional[str]:
+def interpret_udp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "CON" in pre:
         # UDP
@@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_icmp_states(state) -> Optional[str]:
+def interpret_icmp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "ECO" in pre:
         # ICMP
@@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]:
         return "Established"
 
 
-def get_final_state_from_flags(self, state, pkts) -> str:
+def get_final_state_from_flags(state, pkts) -> str:
     """
-    Analyze the flags given and return a summary of the state.
-    Should work with Argus and Bro flags
-    We receive the pakets to distinguish some Reset connections
+    Converts the original flags from the flow, to a state that slips
+    understands
+    Works with Argus, suricata, and Bro flags
+    We receive the packets to distinguish some Reset connections
     """
-    try:
-        if state := check_suricata_states(state):
-            return state
-        if state := check_zeek_states(state):
-            return state
-        if state := check_argus_states(state):
-            return state
-    except IndexError:
-        # suf does not exist, which means that this is some ICMP or
-        # no response was sent for UDP or TCP
-        if state := check_icmp_states(state):
-            return state
-        if state := check_udp_states(state):
-            return state
-        if state := check_tcp_states(state, pkts):
-            return state
 
-        return "Not Established"
+    for interpreter in (
+        interpret_suricata_states,
+        interpret_zeek_states,
+        interpret_argus_states,
+        interpret_icmp_states,
+        interpret_udp_states,
+    ):
+        if interpreted_state := interpreter(state):
+            return interpreted_state
+
+    if interpreted_state := interpret_tcp_states(state, pkts):
+        return interpreted_state
 
-    except Exception:
-        exception_line = sys.exc_info()[2].tb_lineno
-        self.print(
-            f"Error in get_final_state_from_flags() " f"line {exception_line}",
-            0,
-            1,
-        )
-        self.print(traceback.format_exc(), 0, 1)
+    return "Not Established"

From f75e88b9a312b22ca6b14af438bd43a0a428a36c Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 009/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py    | 150 ++----------------
 slips_files/core/database/database_manager.py |   3 -
 2 files changed, 10 insertions(+), 143 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index c8226368c7..9af514a709 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -122,141 +122,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """
@@ -292,12 +157,17 @@ def process_features(self, dataset):
                 except (ValueError, KeyError):
                     pass
 
-            # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
+            # When flows are read from Slips sqlite,
+            # the state is not transformed to 'Established' or
+            # 'Not Established', it is still 'S0' and others
             # So transform here
-            #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
-            dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
-
-            #dataset.state = new_state_column
+            dataset["state"] = dataset.apply(
+                lambda row: get_final_state_from_flags(
+                    row["state"], row["pkts"]
+                ),
+                axis=1,
+            )
+            # dataset.state = new_state_column
 
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(
diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py
index 98651fd08c..2f81ecd8ef 100644
--- a/slips_files/core/database/database_manager.py
+++ b/slips_files/core/database/database_manager.py
@@ -607,9 +607,6 @@ def add_out_dns(self, *args, **kwargs):
     def add_port(self, *args, **kwargs):
         return self.rdb.add_port(*args, **kwargs)
 
-    def get_final_state_from_flags(self, *args, **kwargs):
-        return self.rdb.get_final_state_from_flags(*args, **kwargs)
-
     def add_ips(self, *args, **kwargs):
         return self.rdb.add_ips(*args, **kwargs)
 

From 47193d79912875918ab9e5612b617b3c4ec42886 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 010/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 154 ++++++++++++++++++++-
 1 file changed, 150 insertions(+), 4 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 9e0aa772cd..c8226368c7 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -11,6 +11,7 @@
 import datetime
 import traceback
 import warnings
+import sys
 
 from slips_files.common.parsers.config_parser import ConfigParser
 from slips_files.common.slips_utils import utils
@@ -121,6 +122,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """
@@ -133,7 +269,7 @@ def process_features(self, dataset):
             for proto in to_discard:
                 dataset = dataset[dataset.proto != proto]
 
-            # For now, discard the ports
+            # For now, discard these
             to_drop = [
                 "appproto",
                 "daddr",
@@ -156,15 +292,25 @@ def process_features(self, dataset):
                 except (ValueError, KeyError):
                     pass
 
+            # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
+            # So transform here
+            #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
+            dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
+
+            #dataset.state = new_state_column
+
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(
-                r"(^.*NotEstablished.*$)", "0", regex=True
+                r"(^.*Not Established.*$)", "0", regex=True
             )
             dataset.state = dataset.state.str.replace(
                 r"(^.*Established.*$)", "1", regex=True
             )
-            # Convert proto to categorical. For now we only have few states,
-            # so we can hardcode...
+
+            # Convert categories to floats
+            dataset.state = dataset.state.astype("float64")
+
+            # Convert proto to categorical. For now we only have few states, so we can hardcode...
             # We dont use the data to create categories because in testing mode
             # we dont see all the protocols
             # Also we dont store the Categorizer because the user can retrain

From 0de55cb022a1c84cc642febf6383e9d314510a23 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:35:21 +0300
Subject: [PATCH 011/455] state_handler: split get_final_state_from_flags()
 into smaller functions

---
 slips_files/common/state_handler.py | 179 ++++++++++++++++++++++++++++
 1 file changed, 179 insertions(+)
 create mode 100644 slips_files/common/state_handler.py

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
new file mode 100644
index 0000000000..b671a09a28
--- /dev/null
+++ b/slips_files/common/state_handler.py
@@ -0,0 +1,179 @@
+from typing import Optional
+import sys
+import traceback
+
+
+def check_suricata_states(state) -> Optional[str]:
+    """
+    There are different states in which a flow can be.
+    Suricata distinguishes three flow-states for TCP and two for
+     UDP. For TCP,
+    these are: New, Established and Closed,for UDP only new and
+    established.
+    For each of these states Suricata can employ different timeouts.
+    """
+    if "new" in state or "established" in state:
+        return "Established"
+    elif "closed" in state:
+        return "Not Established"
+
+
+def check_zeek_states(state) -> Optional[str]:
+    # We have varius type of states depending on the type of flow.
+    # For Zeek
+    if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+        return "Not Established"
+    elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+        return "Established"
+
+
+def check_argus_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    suf = state.split("_")[1]
+    if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+        """
+        Examples:
+        SA_SA
+        SR_SA
+        FSRA_SA
+        SPA_SPA
+        SRA_SPA
+        FSA_FSA
+        FSA_FSPA
+        SAEC_SPA
+        SRPA_SPA
+        FSPA_SPA
+        FSRPA_SPA
+        FSPA_FSPA
+        FSRA_FSPA
+        SRAEC_SPA
+        FSPA_FSRPA
+        FSAEC_FSPA
+        FSRPA_FSPA
+        SRPAEC_SPA
+        FSPAEC_FSPA
+        SRPAEC_FSRPA
+        """
+        return "Established"
+    elif "PA" in pre and "PA" in suf:
+        # Tipical flow that was reported in the middle
+        """
+        Examples:
+        PA_PA
+        FPA_FPA
+        """
+        return "Established"
+    elif "ECO" in pre:
+        return "ICMP Echo"
+    elif "ECR" in pre:
+        return "ICMP Reply"
+    elif "URH" in pre:
+        return "ICMP Host Unreachable"
+    elif "URP" in pre:
+        return "ICMP Port Unreachable"
+    else:
+        """
+        Examples:
+        S_RA
+        S_R
+        A_R
+        S_SA
+        SR_SA
+        FA_FA
+        SR_RA
+        SEC_RA
+        """
+        return "Not Established"
+
+
+def check_tcp_states(state, pkts) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "EST" in pre:
+        # TCP
+        return "Established"
+    elif "RST" in pre:
+        # TCP. When -z B is not used in argus, states are single words.
+        # Most connections are reseted when finished and therefore are
+        # established
+        # It can happen that is reseted being not established, but we
+        # can't tell without -z b.
+        # So we use as heuristic the amount of packets. If <=3, then is
+        # not established because the OS retries 3 times.
+        return "Not Established" if int(pkts) <= 3 else "Established"
+    elif "FIN" in pre:
+        # TCP. When -z B is not used in argus, states are single words.
+        # Most connections are finished with FIN when finished and
+        # therefore are established
+        # It can happen that is finished being not established, but we
+        # can't tell without -z b.
+        # So we use as heuristic the amount of packets. If <=3, then is
+        # not established because the OS retries 3 times.
+        return "Not Established" if int(pkts) <= 3 else "Established"
+    else:
+        """
+        Examples:
+        S_
+        FA_
+        PA_
+        FSA_
+        SEC_
+        SRPA_
+        """
+        return "Not Established"
+
+
+def check_udp_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "CON" in pre:
+        # UDP
+        return "Established"
+    elif "INT" in pre:
+        # UDP trying to connect, NOT preciselly not established but also
+        # NOT 'Established'. So we considered not established because there
+        # is no confirmation of what happened.
+        return "Not Established"
+
+
+def check_icmp_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "ECO" in pre:
+        # ICMP
+        return "Established"
+    elif "UNK" in pre:
+        # ICMP6 unknown upper layer
+        return "Established"
+
+
+def get_final_state_from_flags(self, state, pkts) -> str:
+    """
+    Analyze the flags given and return a summary of the state.
+    Should work with Argus and Bro flags
+    We receive the pakets to distinguish some Reset connections
+    """
+    try:
+        if state := check_suricata_states(state):
+            return state
+        if state := check_zeek_states(state):
+            return state
+        if state := check_argus_states(state):
+            return state
+    except IndexError:
+        # suf does not exist, which means that this is some ICMP or
+        # no response was sent for UDP or TCP
+        if state := check_icmp_states(state):
+            return state
+        if state := check_udp_states(state):
+            return state
+        if state := check_tcp_states(state, pkts):
+            return state
+
+        return "Not Established"
+
+    except Exception:
+        exception_line = sys.exc_info()[2].tb_lineno
+        self.print(
+            f"Error in get_final_state_from_flags() " f"line {exception_line}",
+            0,
+            1,
+        )
+        self.print(traceback.format_exc(), 0, 1)

From cfa52224d7aee90e9ce0cf5e68625360564b3181 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:48:16 +0300
Subject: [PATCH 012/455] state_handler: refactor get_final_state_from_flags()

---
 slips_files/common/state_handler.py | 67 +++++++++++++----------------
 1 file changed, 29 insertions(+), 38 deletions(-)

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
index b671a09a28..d0a05115bd 100644
--- a/slips_files/common/state_handler.py
+++ b/slips_files/common/state_handler.py
@@ -1,9 +1,7 @@
 from typing import Optional
-import sys
-import traceback
 
 
-def check_suricata_states(state) -> Optional[str]:
+def interpret_suricata_states(state) -> Optional[str]:
     """
     There are different states in which a flow can be.
     Suricata distinguishes three flow-states for TCP and two for
@@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_zeek_states(state) -> Optional[str]:
+def interpret_zeek_states(state) -> Optional[str]:
     # We have varius type of states depending on the type of flow.
     # For Zeek
     if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
@@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]:
         return "Established"
 
 
-def check_argus_states(state) -> Optional[str]:
+def interpret_argus_states(state) -> Optional[str]:
     pre = state.split("_")[0]
-    suf = state.split("_")[1]
+    try:
+        suf = state.split("_")[1]
+    except IndexError:
+        return
+
     if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
         """
         Examples:
@@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_tcp_states(state, pkts) -> Optional[str]:
+def interpret_tcp_states(state, pkts) -> Optional[str]:
     pre = state.split("_")[0]
     if "EST" in pre:
         # TCP
@@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]:
         return "Not Established"
 
 
-def check_udp_states(state) -> Optional[str]:
+def interpret_udp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "CON" in pre:
         # UDP
@@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_icmp_states(state) -> Optional[str]:
+def interpret_icmp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "ECO" in pre:
         # ICMP
@@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]:
         return "Established"
 
 
-def get_final_state_from_flags(self, state, pkts) -> str:
+def get_final_state_from_flags(state, pkts) -> str:
     """
-    Analyze the flags given and return a summary of the state.
-    Should work with Argus and Bro flags
-    We receive the pakets to distinguish some Reset connections
+    Converts the original flags from the flow, to a state that slips
+    understands
+    Works with Argus, suricata, and Bro flags
+    We receive the packets to distinguish some Reset connections
     """
-    try:
-        if state := check_suricata_states(state):
-            return state
-        if state := check_zeek_states(state):
-            return state
-        if state := check_argus_states(state):
-            return state
-    except IndexError:
-        # suf does not exist, which means that this is some ICMP or
-        # no response was sent for UDP or TCP
-        if state := check_icmp_states(state):
-            return state
-        if state := check_udp_states(state):
-            return state
-        if state := check_tcp_states(state, pkts):
-            return state
 
-        return "Not Established"
+    for interpreter in (
+        interpret_suricata_states,
+        interpret_zeek_states,
+        interpret_argus_states,
+        interpret_icmp_states,
+        interpret_udp_states,
+    ):
+        if interpreted_state := interpreter(state):
+            return interpreted_state
+
+    if interpreted_state := interpret_tcp_states(state, pkts):
+        return interpreted_state
 
-    except Exception:
-        exception_line = sys.exc_info()[2].tb_lineno
-        self.print(
-            f"Error in get_final_state_from_flags() " f"line {exception_line}",
-            0,
-            1,
-        )
-        self.print(traceback.format_exc(), 0, 1)
+    return "Not Established"

From 8d14ef8a2803807e785f1bc4222ea2f391dd46e1 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 013/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py    | 150 ++----------------
 slips_files/core/database/database_manager.py |   3 -
 2 files changed, 10 insertions(+), 143 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index c8226368c7..9af514a709 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -122,141 +122,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """
@@ -292,12 +157,17 @@ def process_features(self, dataset):
                 except (ValueError, KeyError):
                     pass
 
-            # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
+            # When flows are read from Slips sqlite,
+            # the state is not transformed to 'Established' or
+            # 'Not Established', it is still 'S0' and others
             # So transform here
-            #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
-            dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
-
-            #dataset.state = new_state_column
+            dataset["state"] = dataset.apply(
+                lambda row: get_final_state_from_flags(
+                    row["state"], row["pkts"]
+                ),
+                axis=1,
+            )
+            # dataset.state = new_state_column
 
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(
diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py
index 9ecc16635f..4de72c756f 100644
--- a/slips_files/core/database/database_manager.py
+++ b/slips_files/core/database/database_manager.py
@@ -607,9 +607,6 @@ def add_out_dns(self, *args, **kwargs):
     def add_port(self, *args, **kwargs):
         return self.rdb.add_port(*args, **kwargs)
 
-    def get_final_state_from_flags(self, *args, **kwargs):
-        return self.rdb.get_final_state_from_flags(*args, **kwargs)
-
     def add_ips(self, *args, **kwargs):
         return self.rdb.add_ips(*args, **kwargs)
 

From fc735e7374e409de67c16e6a4b6e392efbc5d603 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 014/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 9af514a709..94eb27afdf 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -122,6 +122,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 45ea08b585d956c0fb483cf789ec974111f0d6b5 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 015/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 135 ---------------------
 1 file changed, 135 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 94eb27afdf..9af514a709 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -122,141 +122,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From a3ff70d540ab30bd35686eb5c4b338bbff17aa25 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 016/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 154 ++++++++++++++++++++-
 1 file changed, 150 insertions(+), 4 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 3a3b0a72fe..8917fef6a5 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -10,6 +10,7 @@
 import json
 import traceback
 import warnings
+import sys
 
 from slips_files.common.parsers.config_parser import ConfigParser
 from slips_files.common.slips_utils import utils
@@ -120,6 +121,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """
@@ -132,7 +268,7 @@ def process_features(self, dataset):
             for proto in to_discard:
                 dataset = dataset[dataset.proto != proto]
 
-            # For now, discard the ports
+            # For now, discard these
             to_drop = [
                 "appproto",
                 "daddr",
@@ -155,15 +291,25 @@ def process_features(self, dataset):
                 except (ValueError, KeyError):
                     pass
 
+            # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
+            # So transform here
+            #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
+            dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
+
+            #dataset.state = new_state_column
+
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(
-                r"(^.*NotEstablished.*$)", "0", regex=True
+                r"(^.*Not Established.*$)", "0", regex=True
             )
             dataset.state = dataset.state.str.replace(
                 r"(^.*Established.*$)", "1", regex=True
             )
-            # Convert proto to categorical. For now we only have few states,
-            # so we can hardcode...
+
+            # Convert categories to floats
+            dataset.state = dataset.state.astype("float64")
+
+            # Convert proto to categorical. For now we only have few states, so we can hardcode...
             # We dont use the data to create categories because in testing mode
             # we dont see all the protocols
             # Also we dont store the Categorizer because the user can retrain

From 385d1e2cf142e677602dc80c94d9ecd5c6c0896b Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:35:21 +0300
Subject: [PATCH 017/455] state_handler: split get_final_state_from_flags()
 into smaller functions

---
 slips_files/common/state_handler.py | 179 ++++++++++++++++++++++++++++
 1 file changed, 179 insertions(+)
 create mode 100644 slips_files/common/state_handler.py

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
new file mode 100644
index 0000000000..b671a09a28
--- /dev/null
+++ b/slips_files/common/state_handler.py
@@ -0,0 +1,179 @@
+from typing import Optional
+import sys
+import traceback
+
+
+def check_suricata_states(state) -> Optional[str]:
+    """
+    There are different states in which a flow can be.
+    Suricata distinguishes three flow-states for TCP and two for
+     UDP. For TCP,
+    these are: New, Established and Closed,for UDP only new and
+    established.
+    For each of these states Suricata can employ different timeouts.
+    """
+    if "new" in state or "established" in state:
+        return "Established"
+    elif "closed" in state:
+        return "Not Established"
+
+
+def check_zeek_states(state) -> Optional[str]:
+    # We have varius type of states depending on the type of flow.
+    # For Zeek
+    if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+        return "Not Established"
+    elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+        return "Established"
+
+
+def check_argus_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    suf = state.split("_")[1]
+    if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+        """
+        Examples:
+        SA_SA
+        SR_SA
+        FSRA_SA
+        SPA_SPA
+        SRA_SPA
+        FSA_FSA
+        FSA_FSPA
+        SAEC_SPA
+        SRPA_SPA
+        FSPA_SPA
+        FSRPA_SPA
+        FSPA_FSPA
+        FSRA_FSPA
+        SRAEC_SPA
+        FSPA_FSRPA
+        FSAEC_FSPA
+        FSRPA_FSPA
+        SRPAEC_SPA
+        FSPAEC_FSPA
+        SRPAEC_FSRPA
+        """
+        return "Established"
+    elif "PA" in pre and "PA" in suf:
+        # Typical flow that was reported in the middle
+        """
+        Examples:
+        PA_PA
+        FPA_FPA
+        """
+        return "Established"
+    elif "ECO" in pre:
+        return "ICMP Echo"
+    elif "ECR" in pre:
+        return "ICMP Reply"
+    elif "URH" in pre:
+        return "ICMP Host Unreachable"
+    elif "URP" in pre:
+        return "ICMP Port Unreachable"
+    else:
+        """
+        Examples:
+        S_RA
+        S_R
+        A_R
+        S_SA
+        SR_SA
+        FA_FA
+        SR_RA
+        SEC_RA
+        """
+        return "Not Established"
+
+
+def check_tcp_states(state, pkts) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "EST" in pre:
+        # TCP
+        return "Established"
+    elif "RST" in pre:
+        # TCP. When -z B is not used in argus, states are single words.
+        # Most connections are reset when finished and therefore are
+        # established.
+        # A connection can also be reset without ever being established,
+        # but we can't tell without -z b.
+        # So we use the number of packets as a heuristic: if <=3, it is
+        # not established because the OS retries 3 times.
+        return "Not Established" if int(pkts) <= 3 else "Established"
+    elif "FIN" in pre:
+        # TCP. When -z B is not used in argus, states are single words.
+        # Most connections are closed with FIN when finished and
+        # therefore are established.
+        # A connection can also be finished without ever being
+        # established, but we can't tell without -z b.
+        # So we use the number of packets as a heuristic: if <=3, it is
+        # not established because the OS retries 3 times.
+        return "Not Established" if int(pkts) <= 3 else "Established"
+    else:
+        """
+        Examples:
+        S_
+        FA_
+        PA_
+        FSA_
+        SEC_
+        SRPA_
+        """
+        return "Not Established"
+
+
+def check_udp_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "CON" in pre:
+        # UDP
+        return "Established"
+    elif "INT" in pre:
+        # UDP trying to connect: not precisely 'Not Established', but also
+        # not 'Established'. We consider it not established because there
+        # is no confirmation of what happened.
+        return "Not Established"
+
+
+def check_icmp_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "ECO" in pre:
+        # ICMP
+        return "Established"
+    elif "UNK" in pre:
+        # ICMP6 unknown upper layer
+        return "Established"
+
+
+def get_final_state_from_flags(self, state, pkts) -> str:
+    """
+    Analyze the flags given and return a summary of the state.
+    Should work with Argus and Bro flags
+    We receive the pakets to distinguish some Reset connections
+    """
+    try:
+        if state := check_suricata_states(state):
+            return state
+        if state := check_zeek_states(state):
+            return state
+        if state := check_argus_states(state):
+            return state
+    except IndexError:
+        # suf does not exist, which means that this is some ICMP or
+        # no response was sent for UDP or TCP
+        if state := check_icmp_states(state):
+            return state
+        if state := check_udp_states(state):
+            return state
+        if state := check_tcp_states(state, pkts):
+            return state
+
+        return "Not Established"
+
+    except Exception:
+        exception_line = sys.exc_info()[2].tb_lineno
+        self.print(
+            f"Error in get_final_state_from_flags() " f"line {exception_line}",
+            0,
+            1,
+        )
+        self.print(traceback.format_exc(), 0, 1)

From d8783fd1d1b85e1e39aa4d2b05520a95463e52da Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:48:16 +0300
Subject: [PATCH 018/455] state_handler: refactor get_final_state_from_flags()

---
 slips_files/common/state_handler.py | 67 +++++++++++++----------------
 1 file changed, 29 insertions(+), 38 deletions(-)

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
index b671a09a28..d0a05115bd 100644
--- a/slips_files/common/state_handler.py
+++ b/slips_files/common/state_handler.py
@@ -1,9 +1,7 @@
 from typing import Optional
-import sys
-import traceback
 
 
-def check_suricata_states(state) -> Optional[str]:
+def interpret_suricata_states(state) -> Optional[str]:
     """
     There are different states in which a flow can be.
     Suricata distinguishes three flow-states for TCP and two for
@@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_zeek_states(state) -> Optional[str]:
+def interpret_zeek_states(state) -> Optional[str]:
     # We have varius type of states depending on the type of flow.
     # For Zeek
     if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
@@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]:
         return "Established"
 
 
-def check_argus_states(state) -> Optional[str]:
+def interpret_argus_states(state) -> Optional[str]:
     pre = state.split("_")[0]
-    suf = state.split("_")[1]
+    try:
+        suf = state.split("_")[1]
+    except IndexError:
+        return
+
     if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
         """
         Examples:
@@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_tcp_states(state, pkts) -> Optional[str]:
+def interpret_tcp_states(state, pkts) -> Optional[str]:
     pre = state.split("_")[0]
     if "EST" in pre:
         # TCP
@@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]:
         return "Not Established"
 
 
-def check_udp_states(state) -> Optional[str]:
+def interpret_udp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "CON" in pre:
         # UDP
@@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_icmp_states(state) -> Optional[str]:
+def interpret_icmp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "ECO" in pre:
         # ICMP
@@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]:
         return "Established"
 
 
-def get_final_state_from_flags(self, state, pkts) -> str:
+def get_final_state_from_flags(state, pkts) -> str:
     """
-    Analyze the flags given and return a summary of the state.
-    Should work with Argus and Bro flags
-    We receive the pakets to distinguish some Reset connections
+    Converts the original flags from the flow, to a state that slips
+    understands
+    Works with Argus, suricata, and Bro flags
+    We receive the packets to distinguish some Reset connections
     """
-    try:
-        if state := check_suricata_states(state):
-            return state
-        if state := check_zeek_states(state):
-            return state
-        if state := check_argus_states(state):
-            return state
-    except IndexError:
-        # suf does not exist, which means that this is some ICMP or
-        # no response was sent for UDP or TCP
-        if state := check_icmp_states(state):
-            return state
-        if state := check_udp_states(state):
-            return state
-        if state := check_tcp_states(state, pkts):
-            return state
 
-        return "Not Established"
+    for interpreter in (
+        interpret_suricata_states,
+        interpret_zeek_states,
+        interpret_argus_states,
+        interpret_icmp_states,
+        interpret_udp_states,
+    ):
+        if interpreted_state := interpreter(state):
+            return interpreted_state
+
+    if interpreted_state := interpret_tcp_states(state, pkts):
+        return interpreted_state
 
-    except Exception:
-        exception_line = sys.exc_info()[2].tb_lineno
-        self.print(
-            f"Error in get_final_state_from_flags() " f"line {exception_line}",
-            0,
-            1,
-        )
-        self.print(traceback.format_exc(), 0, 1)
+    return "Not Established"

From 242ab4633538e6632d8418cf5df33469d8dfc585 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 019/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py    | 150 ++----------------
 slips_files/core/database/database_manager.py |   3 -
 2 files changed, 10 insertions(+), 143 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 8917fef6a5..fb17b57f23 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """
@@ -291,12 +156,17 @@ def process_features(self, dataset):
                 except (ValueError, KeyError):
                     pass
 
-            # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
+            # When flows are read from Slips sqlite,
+            # the state is not transformed to 'Established' or
+            # 'Not Established', it is still 'S0' and others
             # So transform here
-            #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
-            dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
-
-            #dataset.state = new_state_column
+            dataset["state"] = dataset.apply(
+                lambda row: get_final_state_from_flags(
+                    row["state"], row["pkts"]
+                ),
+                axis=1,
+            )
+            # dataset.state = new_state_column
 
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(
diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py
index d0d586c4c0..e0028e813b 100644
--- a/slips_files/core/database/database_manager.py
+++ b/slips_files/core/database/database_manager.py
@@ -610,9 +610,6 @@ def add_out_dns(self, *args, **kwargs):
     def add_port(self, *args, **kwargs):
         return self.rdb.add_port(*args, **kwargs)
 
-    def get_final_state_from_flags(self, *args, **kwargs):
-        return self.rdb.get_final_state_from_flags(*args, **kwargs)
-
     def add_ips(self, *args, **kwargs):
         return self.rdb.add_ips(*args, **kwargs)
 

From 06d18ac0a03710092ed0be96eeec10cf89cb2ecf Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 020/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index fb17b57f23..12c3589edc 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,6 +121,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 22731c9987bee24e4848658b095918ebd40ffdc0 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 021/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 135 ---------------------
 1 file changed, 135 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 12c3589edc..fb17b57f23 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 88dd4e6a6a527f021269be0c022f403b1ba23961 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 022/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index fb17b57f23..12c3589edc 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,6 +121,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 5b81164e3e7be0145975fbe7016021614bdeafd5 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 023/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 135 ---------------------
 1 file changed, 135 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 12c3589edc..fb17b57f23 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From c0c5e537c723578ceae0ec4002b25d882d37ec36 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 024/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index fb17b57f23..12c3589edc 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,6 +121,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 48cf9d05e63b9d09e44536dc77da6553118561ed Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 025/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 135 ---------------------
 1 file changed, 135 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 12c3589edc..fb17b57f23 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From e2aaf16170aefd3350c32122988f29633454c260 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Sat, 15 Mar 2025 19:23:29 +0100
Subject: [PATCH 026/455] Re-add get_final_state_from_flags() wrapper that alya added

---
 slips_files/core/database/database_manager.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py
index e0028e813b..d0d586c4c0 100644
--- a/slips_files/core/database/database_manager.py
+++ b/slips_files/core/database/database_manager.py
@@ -610,6 +610,9 @@ def add_out_dns(self, *args, **kwargs):
     def add_port(self, *args, **kwargs):
         return self.rdb.add_port(*args, **kwargs)
 
+    def get_final_state_from_flags(self, *args, **kwargs):
+        return self.rdb.get_final_state_from_flags(*args, **kwargs)
+
     def add_ips(self, *args, **kwargs):
         return self.rdb.add_ips(*args, **kwargs)
 

From 5b87d35ad971e343d73daa846350d6277682e3ba Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Sat, 15 Mar 2025 19:25:03 +0100
Subject: [PATCH 027/455] Delete sys import

---
 modules/flowmldetection/flowmldetection.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index fb17b57f23..286a397eff 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -10,7 +10,6 @@
 import json
 import traceback
 import warnings
-import sys
 
 from slips_files.common.parsers.config_parser import ConfigParser
 from slips_files.common.slips_utils import utils

From 08dec989d4d1bb54ecf4922f294bcbee5c264ab3 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Sat, 15 Mar 2025 19:27:23 +0100
Subject: [PATCH 028/455] Delete file that was deleted from develop

---
 slips_files/common/state_handler.py | 170 ----------------------------
 1 file changed, 170 deletions(-)
 delete mode 100644 slips_files/common/state_handler.py

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
deleted file mode 100644
index d0a05115bd..0000000000
--- a/slips_files/common/state_handler.py
+++ /dev/null
@@ -1,170 +0,0 @@
-from typing import Optional
-
-
-def interpret_suricata_states(state) -> Optional[str]:
-    """
-    There are different states in which a flow can be.
-    Suricata distinguishes three flow-states for TCP and two for
-     UDP. For TCP,
-    these are: New, Established and Closed,for UDP only new and
-    established.
-    For each of these states Suricata can employ different timeouts.
-    """
-    if "new" in state or "established" in state:
-        return "Established"
-    elif "closed" in state:
-        return "Not Established"
-
-
-def interpret_zeek_states(state) -> Optional[str]:
-    # We have varius type of states depending on the type of flow.
-    # For Zeek
-    if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-        return "Not Established"
-    elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-        return "Established"
-
-
-def interpret_argus_states(state) -> Optional[str]:
-    pre = state.split("_")[0]
-    try:
-        suf = state.split("_")[1]
-    except IndexError:
-        return
-
-    if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-        """
-        Examples:
-        SA_SA
-        SR_SA
-        FSRA_SA
-        SPA_SPA
-        SRA_SPA
-        FSA_FSA
-        FSA_FSPA
-        SAEC_SPA
-        SRPA_SPA
-        FSPA_SPA
-        FSRPA_SPA
-        FSPA_FSPA
-        FSRA_FSPA
-        SRAEC_SPA
-        FSPA_FSRPA
-        FSAEC_FSPA
-        FSRPA_FSPA
-        SRPAEC_SPA
-        FSPAEC_FSPA
-        SRPAEC_FSRPA
-        """
-        return "Established"
-    elif "PA" in pre and "PA" in suf:
-        # Tipical flow that was reported in the middle
-        """
-        Examples:
-        PA_PA
-        FPA_FPA
-        """
-        return "Established"
-    elif "ECO" in pre:
-        return "ICMP Echo"
-    elif "ECR" in pre:
-        return "ICMP Reply"
-    elif "URH" in pre:
-        return "ICMP Host Unreachable"
-    elif "URP" in pre:
-        return "ICMP Port Unreachable"
-    else:
-        """
-        Examples:
-        S_RA
-        S_R
-        A_R
-        S_SA
-        SR_SA
-        FA_FA
-        SR_RA
-        SEC_RA
-        """
-        return "Not Established"
-
-
-def interpret_tcp_states(state, pkts) -> Optional[str]:
-    pre = state.split("_")[0]
-    if "EST" in pre:
-        # TCP
-        return "Established"
-    elif "RST" in pre:
-        # TCP. When -z B is not used in argus, states are single words.
-        # Most connections are reseted when finished and therefore are
-        # established
-        # It can happen that is reseted being not established, but we
-        # can't tell without -z b.
-        # So we use as heuristic the amount of packets. If <=3, then is
-        # not established because the OS retries 3 times.
-        return "Not Established" if int(pkts) <= 3 else "Established"
-    elif "FIN" in pre:
-        # TCP. When -z B is not used in argus, states are single words.
-        # Most connections are finished with FIN when finished and
-        # therefore are established
-        # It can happen that is finished being not established, but we
-        # can't tell without -z b.
-        # So we use as heuristic the amount of packets. If <=3, then is
-        # not established because the OS retries 3 times.
-        return "Not Established" if int(pkts) <= 3 else "Established"
-    else:
-        """
-        Examples:
-        S_
-        FA_
-        PA_
-        FSA_
-        SEC_
-        SRPA_
-        """
-        return "Not Established"
-
-
-def interpret_udp_states(state) -> Optional[str]:
-    pre = state.split("_")[0]
-    if "CON" in pre:
-        # UDP
-        return "Established"
-    elif "INT" in pre:
-        # UDP trying to connect, NOT preciselly not established but also
-        # NOT 'Established'. So we considered not established because there
-        # is no confirmation of what happened.
-        return "Not Established"
-
-
-def interpret_icmp_states(state) -> Optional[str]:
-    pre = state.split("_")[0]
-    if "ECO" in pre:
-        # ICMP
-        return "Established"
-    elif "UNK" in pre:
-        # ICMP6 unknown upper layer
-        return "Established"
-
-
-def get_final_state_from_flags(state, pkts) -> str:
-    """
-    Converts the original flags from the flow, to a state that slips
-    understands
-    Works with Argus, suricata, and Bro flags
-    We receive the packets to distinguish some Reset connections
-    """
-
-    for interpreter in (
-        interpret_suricata_states,
-        interpret_zeek_states,
-        interpret_argus_states,
-        interpret_icmp_states,
-        interpret_udp_states,
-    ):
-        if interpreted_state := interpreter(state):
-            return interpreted_state
-
-    if interpreted_state := interpret_tcp_states(state, pkts):
-        return interpreted_state
-
-    return "Not Established"

From 0c4455c108509246993c8aa081310f9c0ce5a240 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Sat, 15 Mar 2025 19:32:01 +0100
Subject: [PATCH 029/455] Flowmldetection. Fix missing db reference

---
 modules/flowmldetection/flowmldetection.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 286a397eff..fac5e674f9 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -160,7 +160,7 @@ def process_features(self, dataset):
             # 'Not Established', it is still 'S0' and others
             # So transform here
             dataset["state"] = dataset.apply(
-                lambda row: get_final_state_from_flags(
+                lambda row: self.db.get_final_state_from_flags(
                     row["state"], row["pkts"]
                 ),
                 axis=1,

From 3429549c6326c9c7d7b9bc299fef48d6b754fb48 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Tue, 18 Mar 2025 12:08:08 +0100
Subject: [PATCH 030/455] Fix the training of flows with ML in new version

---
 modules/flowmldetection/flowmldetection.py | 144 +++++++++++----------
 1 file changed, 77 insertions(+), 67 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index fac5e674f9..e6ea0b5171 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -55,8 +55,12 @@ def init(self):
         # Set the output queue of our database instance
         # Read the configuration
         self.read_configuration()
-        # Minum amount of new lables needed to trigger the train
-        self.minimum_lables_to_retrain = 50
+        # Minum amount of new labels needed to start the train
+        self.minimum_labels_to_start_train = 50
+        # Minum amount of new labels needed to retrain
+        self.minimum_labels_to_retrain = 50
+        # The number of flows when last trained
+        self.last_number_of_flows_when_trained = 0
         # To plot the scores of training
         # self.scores = []
         # The scaler trained during training and to use during testing
@@ -67,26 +71,25 @@ def init(self):
     def read_configuration(self):
         conf = ConfigParser()
         self.mode = conf.get_ml_mode()
+        self.label = conf.label()
 
     def train(self):
         """
         Train a model based on the flows we receive and the labels
         """
         try:
-            # Process the labels to have only Normal and Malware
-            self.flows.label = self.flows.label.str.replace(
-                r"(^.*ormal.*$)", "Normal", regex=True
-            )
-            self.flows.label = self.flows.label.str.replace(
-                r"(^.*alware.*$)", "Malware", regex=True
-            )
-            self.flows.label = self.flows.label.str.replace(
-                r"(^.*alicious.*$)", "Malware", regex=True
-            )
+            # Get the flows from the DB
+            # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid)
+            # Convert to pandas df
+            # self.flows = pd.DataFrame(self.flows)
+            # Process the features
+            # X_flow = self.process_features(self.flows)
 
-            # Separate
-            y_flow = self.flows["label"]
+            # Create X_flow with the current flows minus the label
             X_flow = self.flows.drop("label", axis=1)
+            # Create y_flow with the label
+            y_flow = numpy.full(X_flow.shape[0], self.label)
+            # Drop the module_labels
             X_flow = X_flow.drop("module_labels", axis=1)
 
             # Normalize this batch of data so far. This can get progressivle slow
@@ -95,7 +98,7 @@ def train(self):
             # Train
             try:
                 self.clf.partial_fit(
-                    X_flow, y_flow, classes=["Malware", "Normal"]
+                    X_flow, y_flow, classes=["Malicious", "Benign"]
                 )
             except Exception:
                 self.print("Error while calling clf.train()")
@@ -118,7 +121,7 @@ def train(self):
             self.store_model()
 
         except Exception:
-            self.print("Error in train()", 0, 1)
+            self.print("Error in train().", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
@@ -144,9 +147,7 @@ def process_features(self, dataset):
                 "history",
                 "uid",
                 "dir_",
-                "dbytes",
                 "endtime",
-                "bytes",
                 "flow_source",
             ]
             for field in to_drop:
@@ -161,11 +162,10 @@ def process_features(self, dataset):
             # So transform here
             dataset["state"] = dataset.apply(
                 lambda row: self.db.get_final_state_from_flags(
-                    row["state"], row["pkts"]
+                    row["state"], (row["spkts"] + row["dpkts"])
                 ),
                 axis=1,
             )
-            # dataset.state = new_state_column
 
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(
@@ -199,7 +199,11 @@ def process_features(self, dataset):
             dataset.proto = dataset.proto.str.replace(
                 r"(^.*arp.*$)", "4", regex=True
             )
-            fields_to_convert_to_flow = [
+
+            dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"]
+            dataset["pkts"] = dataset["spkts"] + dataset["dpkts"]
+
+            fields_to_convert_to_float = [
                 dataset.proto,
                 dataset.dport,
                 dataset.sport,
@@ -210,10 +214,10 @@ def process_features(self, dataset):
                 dataset.sbytes,
                 dataset.state,
             ]
-            for field in fields_to_convert_to_flow:
+            for field in fields_to_convert_to_float:
                 try:
                     field = field.astype("float64")
-                except ValueError:
+                except (ValueError, AttributeError):
                     pass
 
             return dataset
@@ -222,9 +226,9 @@ def process_features(self, dataset):
             self.print("Error in process_features()")
             self.print(traceback.format_exc(), 0, 1)
 
-    def process_flows(self):
+    def process_training_flows(self):
         """
-        Process all the flwos in the DB
+        Process all the flows in the DB
         Store the pandas df in self.flows
         """
         try:
@@ -240,44 +244,48 @@ def process_flows(self):
                 # that are fake but representative of a normal and malware flow
                 # they are only for the training process
                 # At least 1 flow of each label is required
-                # self.print(f'Amount of labeled flows: {labels}', 0, 1)
+
+                # These flows should be in the same format as the ones in the DB. 
+                # Which means the satate is still SF, S0, etc.
                 flows.append(
                     {
-                        "ts": 1594417039.029793,
+                        "starttime": 1594417039.029793,
                         "dur": "1.9424750804901123",
                         "saddr": "10.7.10.101",
                         "sport": "49733",
                         "daddr": "40.70.224.145",
                         "dport": "443",
                         "proto": "tcp",
-                        "state": "Established",
-                        "allbytes": 42764,
-                        "spkts": 37,
+                        "state": "SF",
+                        "spkts": 17,
+                        "dpkts": 27,
                         "sbytes": 25517,
+                        "dbytes": 17247,
                         "appproto": "ssl",
-                        "label": "Malware",
+                        "label": "Malicious",
                         "module_labels": {
-                            "flowalerts-long-connection": "Malware"
+                            "flowalerts-long-connection": "Malicious"
                         },
                     }
                 )
                 flows.append(
                     {
-                        "ts": 1382355032.706468,
+                        "starttime": 1382355032.706468,
                         "dur": "10.896695",
                         "saddr": "147.32.83.52",
                         "sport": "47956",
                         "daddr": "80.242.138.72",
                         "dport": "80",
                         "proto": "tcp",
-                        "state": "Established",
-                        "allbytes": 67696,
+                        "state": "SF",
                         "spkts": 1,
+                        "dpkts": 0,
                         "sbytes": 100,
+                        "dbytes": 67596,
                         "appproto": "http",
-                        "label": "Normal",
+                        "label": "Benign",
                         "module_labels": {
-                            "flowalerts-long-connection": "Normal"
+                            "flowalerts-long-connection": "Benign"
                         },
                     }
                 )
@@ -318,7 +326,6 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
         and returns the predection array
         """
         try:
-            given_x_flow = x_flow
             # clean the flow
             fields_to_drop = [
                 "label",
@@ -326,10 +333,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
                 "uid",
                 "history",
                 "dir_",
-                "dbytes",
-                "dpkts",
                 "endtime",
-                "bytes",
                 "flow_source",
             ]
             for field in fields_to_drop:
@@ -343,7 +347,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
             return pred
         except Exception as e:
             self.print(
-                f"Error in detect() while processing " f"\n{given_x_flow}\n{e}"
+                f"Error in detect() while processing " f"\n{x_flow}\n{e}"
             )
             self.print(traceback.format_exc(), 0, 1)
 
@@ -435,18 +439,16 @@ def pre_main(self):
 
     def main(self):
         if msg := self.get_msg("new_flow"):
+            # When a new flow arrives
             msg = json.loads(msg["data"])
-            twid = msg["twid"]
+            self.twid = msg["twid"]
+            self.profileid = msg["profileid"]
             self.flow = msg["flow"]
-            # these fields are expected in testing. update the original
+            # These following extra fields are expected in testing. update the original
             # flow dict to have them
             self.flow.update(
                 {
-                    "allbytes": (self.flow["sbytes"] + self.flow["dbytes"]),
-                    # the flow["state"] is the origstate, we dont need that here
-                    # we need the interpreted state
                     "state": msg["interpreted_state"],
-                    "pkts": self.flow["spkts"] + self.flow["dpkts"],
                     "label": msg["label"],
                     "module_labels": msg["module_labels"],
                 }
@@ -459,23 +461,31 @@ def main(self):
                 # Use labeled flows
                 labels = self.db.get_labels()
                 sum_labeled_flows = sum(i[1] for i in labels)
+
+                # The min labels to retrain is the min number of flows 
+                # we should have seen so far in this capture to start training
+                # This is so we dont _start_ training with only 1 flow
+
+                # Once we are over the start minimum, the second condition is 
+                # to force to retrain every a minimum_labels_to_retrain number
+                # of flows. So we dont retrain every 1 flow.
                 if (
-                    sum_labeled_flows >= self.minimum_lables_to_retrain
-                    and sum_labeled_flows % self.minimum_lables_to_retrain == 1
+                    sum_labeled_flows >= self.minimum_labels_to_start_train
                 ):
-                    # We get here every 'self.minimum_lables_to_retrain'
-                    # amount of labels
-                    # So for example we retrain every 100 labels and only when
-                    # we have at least 100 labels
-                    self.print(
-                        f"Training the model with the last group of "
-                        f"flows and labels. Total flows: {sum_labeled_flows}."
-                    )
-                    # Process all flows in the DB and make them ready
-                    # for pandas
-                    self.process_flows()
-                    # Train an algorithm
-                    self.train()
+                    if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain):
+                        # So for example we retrain every 50 labels and only when
+                        # we have at least 50 labels
+                        self.print(
+                            f"Training the model with the last group of "
+                            f"flows and labels. Total flows: {sum_labeled_flows}."
+                        )
+                        # Process all flows in the DB and make them ready
+                        # for pandas
+                        self.process_training_flows()
+                        # Train an algorithm
+                        self.train()
+                        self.last_number_of_flows_when_trained = sum_labeled_flows
+
             elif self.mode == "test":
                 # We are testing, which means using the model to detect
                 processed_flow = self.process_flow(self.flow)
@@ -495,8 +505,8 @@ def main(self):
                         # and the label is diff from the prediction,
                         # print in debug mode
                         self.print(
-                            f"Report Prediction {pred[0]} for label"
-                            f' {label} flow {self.flow["saddr"]}:'
+                            f"Predicted {pred[0]} for ground-truth label"
+                            f' {label}. Flow {self.flow["saddr"]}:'
                             f'{self.flow["sport"]} ->'
                             f' {self.flow["daddr"]}:'
                             f'{self.flow["dport"]}/'
@@ -504,9 +514,9 @@ def main(self):
                             0,
                             3,
                         )
-                    if pred[0] == "Malware":
+                    if pred[0] == "Malicious":
                         # Generate an alert
-                        self.set_evidence_malicious_flow(self.flow, twid)
+                        self.set_evidence_malicious_flow(self.flow, self.twid)
                         self.print(
                             f"Prediction {pred[0]} for label {label}"
                             f' flow {self.flow["saddr"]}:'

From a779358bb3a6f8d72446c45c8b3feaf1406c87f4 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Tue, 18 Mar 2025 12:08:29 +0100
Subject: [PATCH 031/455] Fix the profiler handler for cases of nan in state

---
 .../core/database/redis_db/profile_handler.py     | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/slips_files/core/database/redis_db/profile_handler.py b/slips_files/core/database/redis_db/profile_handler.py
index 0489372cdc..1ea7644648 100644
--- a/slips_files/core/database/redis_db/profile_handler.py
+++ b/slips_files/core/database/redis_db/profile_handler.py
@@ -379,7 +379,12 @@ def get_final_state_from_flags(self, state, pkts):
         We receive the pakets to distinguish some Reset connections
         """
         try:
-            pre = state.split("_")[0]
+            # In some flows the state is a nan
+            try:
+                pre = state.split("_")[0]
+            except AttributeError:
+                pre = ''
+
             try:
                 # Try suricata states
                 """
@@ -401,7 +406,11 @@ def get_final_state_from_flags(self, state, pkts):
                     return "Established"
 
                 # For Argus
-                suf = state.split("_")[1]
+            # In some flows the state is a nan
+                try:
+                    suf = state.split("_")[1]
+                except AttributeError:
+                    suf = ''
                 if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
                     """
                     Examples:
@@ -502,7 +511,7 @@ def get_final_state_from_flags(self, state, pkts):
         except Exception:
             exception_line = sys.exc_info()[2].tb_lineno
             self.print(
-                f"Error in getFinalStateFromFlags() in database.py line {exception_line}",
+                f"Error in get_final_state_from_flags() in profile_handler.py line {exception_line}",
                 0,
                 1,
             )

From 223d72d0948098bb30f3a0992ac978f2249a9c35 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 19 Mar 2025 14:20:42 +0100
Subject: [PATCH 032/455] slips.yaml. Update to have correct labels. By default
 test. Default training label is benign

---
 config/slips.yaml | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/config/slips.yaml b/config/slips.yaml
index 34f41e7109..31847a6df4 100644
--- a/config/slips.yaml
+++ b/config/slips.yaml
@@ -106,13 +106,12 @@ parameters:
   deletePrevdb: true
 
   # Set the label for all the flows that are being read.
-  # For now only normal and malware directly. No option for setting labels
-  # with a filter
+  # For now only Benign and Malicious (Capitalized)
   # The purpose is to be used in the training of ML models and to output
   # flows with labels for other tools.
-  # label: malicious
-  # label: unknown
-  label: normal
+  # label: Malicious
+  # label: Benign
+  label: Benign
   # If Zeek files are rotated or not to avoid running out of disk.
   # Zeek rotation is enabled by default when using an interface,
   # which means Slips will delete all Zeek log files after 1 day

From 18b9a9559b08a4675248e2437eae7b271ab9ec94 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 19 Mar 2025 14:21:21 +0100
Subject: [PATCH 033/455] First ipython to test ML flow related models

---
 modules/flowmldetection/flowmlanalysis.ipynb | 76 ++++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100644 modules/flowmldetection/flowmlanalysis.ipynb

diff --git a/modules/flowmldetection/flowmlanalysis.ipynb b/modules/flowmldetection/flowmlanalysis.ipynb
new file mode 100644
index 0000000000..d726cd2805
--- /dev/null
+++ b/modules/flowmldetection/flowmlanalysis.ipynb
@@ -0,0 +1,76 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Analysis of Flows with Machine Learning for Slips"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Analysis of a fixed list of flows to try techniques and find parameters"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy\n",
+    "from sklearn.linear_model import SGDClassifier\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "import pickle\n",
+    "import pandas as pd\n",
+    "import json\n",
+    "import traceback\n",
+    "import warnings"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "slips-new",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

From 5c89e4db5a40fda5b1cce21996c684d36c93d667 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 19 Mar 2025 14:22:38 +0100
Subject: [PATCH 034/455] flowml. If the dataset has one flow and that is
 deleted, then return empty fast.

---
 modules/flowmldetection/flowmldetection.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index e6ea0b5171..0fa1e4d767 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -135,6 +135,11 @@ def process_features(self, dataset):
             for proto in to_discard:
                 dataset = dataset[dataset.proto != proto]
 
+            # If te proto is in the list to delete and there is only one flow, then the dataset will be empty
+            if dataset.empty:
+                # DataFrame is empty now, so return empty
+                return dataset
+
             # For now, discard these
             to_drop = [
                 "appproto",

From ebbfd953cb028cf9ef0b75cd17168fc70f6921b0 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 19 Mar 2025 14:23:05 +0100
Subject: [PATCH 035/455] flowml. If the dataset is empty. Return none

---
 modules/flowmldetection/flowmldetection.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 0fa1e4d767..5c5f9943f1 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -318,6 +318,8 @@ def process_flow(self, flow_to_process: dict):
             # Convert the flow to a pandas dataframe
             raw_flow = pd.DataFrame(flow_to_process, index=[0])
             dflow = self.process_features(raw_flow)
+            if dflow.empty:
+                return None
             # Update the flow to the processed version
             return dflow
         except Exception:

From 13287d134eb09ac30dcb0e056d5465544d545591 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 19 Mar 2025 14:26:42 +0100
Subject: [PATCH 036/455] profile_handler. Small bug in how we handled the
 profiles, we were using 'in' instead of == for established. Some not
 established MAY not have been correctly captured

---
 slips_files/core/database/redis_db/profile_handler.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/slips_files/core/database/redis_db/profile_handler.py b/slips_files/core/database/redis_db/profile_handler.py
index 1ea7644648..85fdec5a63 100644
--- a/slips_files/core/database/redis_db/profile_handler.py
+++ b/slips_files/core/database/redis_db/profile_handler.py
@@ -393,9 +393,10 @@ def get_final_state_from_flags(self, state, pkts):
                 these are: New, Established and Closed,for UDP only new and established.
                 For each of these states Suricata can employ different timeouts.
                 """
-                if "new" in state or "established" in state:
+                # This is controversial, but if we dont have a good state, we consider it not established for now
+                if "new" in state or state.lower() == "established":
                     return "Established"
-                elif "closed" in state:
+                elif "closed" in state or state.lower() == 'not established':
                     return "Not Established"
 
                 # We have varius type of states depending on the type of flow.
@@ -406,7 +407,6 @@ def get_final_state_from_flags(self, state, pkts):
                     return "Established"
 
                 # For Argus
-            # In some flows the state is a nan
                 try:
                     suf = state.split("_")[1]
                 except AttributeError:

From 9588762aa88da736012b2b6f5844f3ca0c39f15c Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 19 Mar 2025 14:27:16 +0100
Subject: [PATCH 037/455] First new version of the model and scaler. Not good
 yet, but working.

---
 modules/flowmldetection/model.bin  | Bin 1073 -> 1090 bytes
 modules/flowmldetection/scaler.bin | Bin 666 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index aef4cba35b7b18287b2be11df2c45e9187d053e0..7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f 100644
GIT binary patch
delta 411
zcmdnUafpMZfn}=sMwVTSLVS7gX{m`NrA4X5@tJw?Q@pt+?`4dsSMFg=DXA<-oice!
z4^yb+l+KR!DM3>-yqS9#ZKnA7`FX2&vv_lqOiAi=_HbfgU`PexJRr^l;&dn*NWp<G
zkd+9;IY10jlMJQvfpjSl7X$GWpm9B7@u?LBiIY2+gw>QXRJ|Emr(~!@tY-9P0&;YE
zn9?i|&dD&Ce2~eg9z!XkH+S2V4C_wkh?WyPJF_Dc9WKfLlU4d@b|Cws{>t}{)DLw1
z+&}AzmCAv&TP(zH?>}O{hvWaISvjr_FKvFL2{5=iNW7SLz3Rc`{Vb{$-4P-p4qe>8
zsxB_naJaLsc#$rfl>^wdT$y<#sYS_psRbqRQ!+4J0dz3X4IB|u^h2FOBOrSC<0o%q
b_GNYGp25H{nVChIi`7uiOwVviX_6iQocV_5

delta 380
zcmX@av5|wNfn}=NMwVTSlP@wxCo1)@rj%3`q)wSUrH3ihd`f3W`;?$58s5x3j5bsJ
z{QSISyji?CN~R=rI$QWMFfb$naSjldL)k?@Iu(fhfH)tBLHt}GhN)wiJeNsPO)*2o
zo1t||h8o0hMsFq{N4tk9%>rRzhW_LyOh)w>N*TR5+ps7Gy3Ol95P%792Cxe~W+qIF
za^V+n06G8#1cs%Z;j-Fjug&$3=gY2%`}>OzXw^50I4Ib2ygQ=8?eISD+{&-(1rNNN
zW*@<QMemXQ6rg!MT$y<#sYS`D1tsxQGBDi*^nz8VGl$HwGgBKKcpSi5_$D7__GNWU
Y$oltSvM7r(H=BW;k%^wsl+q+U0L_+r^#A|>

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
index 9292bda6a6f307edcbe83de0ccfac2437950fa6f..bfba4d107224e5e6e5a1e8c8f4d463b48131d111 100644
GIT binary patch
literal 890
zcmZo*nVQec00uqM#o0NjiA8yO1x2X^Mfu68#l@L<>3Z=gi6x0sdia7%67y0Li&BD<
z6LV6Frc9oq**K+kibfA-d1gsQd~RxD-jt3W4lt*<Bn8N3NzN~*1k!?Osfi_}MXB+5
ziMgr8@tJw?Q+h=6N^=V;^^)_8QuT66b4oH3i;5B}r}Xf|7o{fW=M|R}l_r+}?dV|z
zE1c594ze4hWbzbmhSn+0j7d}4rvy#W@MiSpZk>|B)5DrlQdy7+(!%6#F{QHuBFo&v
zSTn`X&(G^W5P%79`;sY1ov}cNGpCdm0WAP}wIm-%7Z>Cgl>q4!C|z8TT~Z9@gK4(n
zq{@<1pa5G6l)+kDl30>DrH3OiCkHCT0@jlXw4sMDFCOY|h?l*&d-(I>ixYDTa)5zQ
zoSF&@h78Ugwqjs#CW3-9L$rq}6UF5=Xf9^}n(NGw#R!S7DQ!Ui_ppKj9%vGG1|KL~
zG6axA1!!U#l8G7ms38F|uhTi=d{gv8|4E7tk+E%iFU@~!UvXb5a&PSo2flmT-L8E;
z<KX(aCF;No0f(>8bB?zwe6@e~Ltx1rmTHK2M45x=S>Y_k<U{t7-@NM2t?O`b(oOJr
z^gz^M3Md#@$`XrkInNqs`NcqG+rqF4hjd*#=9SUs?C-J63^i@q?kIRi^+W#GZH^`;
z_s<+FuX0#b@Z7FWCEbCm^j(f1Uy0+U$5+2R=gD#Gag=$mqjtZ2f~w@S(sxacYyTMt
zX(&vBSPl-{RD9NRUiR6)W4EzGNqG>5fcAZRq4P``?^7;27=%1`Z_~Q$U{zGVUTcbp
z!_f@A=ox0(4vo(kWe&dF=y3Qof57>#YaOosUa>d6=$d`i{)GMC=j?EBVaVAqgWVBg
dy+AxPN5z+=78Pga=K-V3P|r-ya7t;C9st`jc!2-_

delta 525
zcmeyxHj9<Dfn{nx(?r(zNWQ%IwA93s(xTMj_{_ZcDc+nt{CV-kiMa(iK#}6qRG?Uo
zXkKY<L8V@Deo?AkZfQ<QW@1rMV&#+`w&LW(oW!Cjlc)5sf|UU2l#<GVRG`S@UPd*o
zK3xVdn9?>S2q>4Eng}#5gR6%pz9=<0Kd-o?s5H3*s5pan;tH91&ekaz{5?!*7E?Mq
z+JPE0yqPnE{rvpA{sRG+@Mb8PlGN$!ajPlrTH5m<hihBT2gkmB>hSAF{H=RwvmLf>
z0f8s>T))|-e^~LtKJ)mxuX}VF9GJI$RyS%Ec8u45-?98hw!_K^*XJ>X{BW28G_HrG
zEU^gaDyjMm8E*!l6XYOvGkSyEqXM*T;RVCs$??LDiHthAxexX@M(tU$UF%Yf<K@j$
z%OkDN+J7s4Zlk~8nEkR&OOX||2OO>Q^>gH&y>a4SnPazubB5!cqc1YnCGT<q+XfD^
zRFHk>))m9-o7^A!Lv41W!_hYO<*VziI$XJV#W|*OtHau9f5TQzy=Sjs!`l%1;+FkJ
z-k|Geeq3|-Iw{uqQ%0WS6}i^>);WhA{C@WaM)rg_g6$KCFV4<MO)ScbFH0>d&dkpP
QMw@}2k%^wsl+q+U0P2|NZU6uP


From 531946f0f0d880cc68dd95991a08b387b6a78c39 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Thu, 20 Mar 2025 13:16:06 +0100
Subject: [PATCH 038/455] model and scaler with 1 malicious and 1 benign

---
 modules/flowmldetection/model.bin  | Bin 1090 -> 1124 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index 7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f..0fac693b39f8e2f0e826471e72a52010709a2a4a 100644
GIT binary patch
delta 132
zcmX@a@q~k=fn{psMix!x$(NZ_BO`RCb>5jSY<wU!LGn&}gUNx$3tLYnGl@A&e4S}J
zBSY*!uI-OaTr&&~Tw#uuJJhD?&}O=|yn5R|`<q--?Rc;A9e69=F=gK7d-lipbZtG?
j>K$N!g2|oC+8oRs(l<p`Iv<$4hdGq_%F%|&A}o3UHQY43

delta 131
zcmV-}0DS-C2*L;ifCQB{u>>gtTsw(FScJ$7xocP<K++%n9U}QQz-#d@sqf+^z>4|5
zn$$WZz^$}67u&zYKfDP4vYKo~K<PaAW)J{HKp5zr*LvX7zXT;Xi&zvCK#B|cdeNaM
lK-{i#qAdnGK$EKjD+pMWFI7TSSU{8B162cym;jSD1TGrII8guq

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
index bfba4d107224e5e6e5a1e8c8f4d463b48131d111..758909b289238ff282b2e056a9b3e83768b8472a 100644
GIT binary patch
delta 290
zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG
zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa
zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r!tJEWTtnm`Ac
zf^|*Gc0elJHxDoXo<X#jLG{%4&p`{bHVfI|sz2Ot$hvzr(m{gu&Jr}1sgu|OC|8h9
z&D_B~i9nx5TWFn3t3OFIt}1{vt3Z{LgvGX?u|Rx7|2|A?L_o(e@1J}aGeCBvuU^0C
o#Xz54;5X{~-$0+hirZ#X%|CDEfPfMb;Xu)C9icy+Op{jwBi%8H3jhEB

delta 290
zcmV+-0p0%k2KolDvjGBX0h6@>Pgl=^Tj5WVAwXGOhP~3C<3D)c8d<%4*gy{6w?@|U
z%s@r+gj&Fu5J2_kY{!Qn^*`?T5Tx7$dq73=gj&E@c0d))6KVlt!ao`IN`KC-h(JOu
zU`*oR6_Z;6CRfo>BRz3ectB?@J_D&+&OhD+nN>A{w?PryCGc<cwm~#B-^|8$dO)gh
z=RSTUXFv*d?raebbV0J?)%52KY(b1c9N@Tpzdv9l8J2bKf<dkRFcK&rl9Sj1C|3#7
zOux9hGeC5AQV0+$-#-%10%-4K(?Bp(=SPMr(?B|Mf3GT(G(g2@E?by3D?owf0UW{U
ou|UJ?4^Yqbtw7cLsJ&lu)<1f`V88dAxIjbzY_OOIL6cVlBeWBRwg3PC


From e254123c7fc393eb4f9d6a05b2a7c3561b4d5d29 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Thu, 20 Mar 2025 13:16:27 +0100
Subject: [PATCH 039/455] cleaner jupyter

---
 modules/flowmldetection/flowmldetection.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 5c5f9943f1..fe950ed4bb 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -343,6 +343,23 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
                 "endtime",
                 "flow_source",
             ]
+            # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes.
+            # Error
+            ''' [Flow ML Detection] Error in detect() while processing                                                                                                                                                                                                                         
+            dur proto  sport dport  state  pkts  spkts  dpkts  bytes  sbytes  dbytes  allbytes                                                                                                                                                                                    
+            0  63.822830     0  56119   981    0.0    15     15      0   8764    1887       0      1887                                                                                                                                                                                    
+            The feature names should match those that were passed during fit.                                                                                                                                                                                                              
+            Feature names unseen at fit time:                                                                                                                                                                                                                                              
+            - bytes     
+            '''
+
+            # IF we delete here the filed bytes the error is
+            # [Flow ML Detection] Error in detect() while processing 
+            # dur proto sport dport  state  pkts  spkts  dpkts  sbytes  dbytes allbytes                                                                                                                                                             
+            # 0  63.822830     0  56120   980    0.0    15     15      0    1887       0      1887                                                                                                                                                             
+            # The feature names should match those that were passed during fit.                                                                                                                                                                                
+            # Feature names must be in the same order as they were in fit.                                                                                                                                                                                     
+                                                                                                                                                                                                                                                 
             for field in fields_to_drop:
                 try:
                     x_flow = x_flow.drop(field, axis=1)

From 473b0958153803624f757ac7b3bb85ffb9d68930 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Thu, 20 Mar 2025 22:26:27 +0100
Subject: [PATCH 040/455] New models after 3rd train

---
 modules/flowmldetection/model.bin  | Bin 1124 -> 1124 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index 0fac693b39f8e2f0e826471e72a52010709a2a4a..5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8 100644
GIT binary patch
delta 99
zcmaFD@q}YTFtfkevYaFSxkd-_3;rCozh!)2lYmpEbEdGvA?<Hh%FhcQcr$C8Y3pPC
z18<ZS-x*q|IJlZU`}#BRi@kOC+S)TqI1eyD!9BgtJUbI~Cof{2$owSaz+^KPJplbX
BD~|vG

delta 99
zcmV-p0G$8i2;>N`Qv**dmWkY#6EnbNU>V$pfHc5?(6z~90v14#>uEKZXcoY2J@~Q;
zm@vT916v-#h9y9THMMtpw*Eia3Y9($*ABq#7l@Rev)(_(4lO-NtuB+I1CayN#ekDG
F1TN2UERg^J

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
index 758909b289238ff282b2e056a9b3e83768b8472a..821344a0c69d116622b02e2a0daa1554cb5d308e 100644
GIT binary patch
delta 43
zcmV+`0M!5b2KolDfdU!c4zU>a=p#W80?-ZtN@qVmt(3Z<B4<G|lbQl%0g;md19^W6
B5HkP(

delta 43
zcmV+`0M!5b2KolDfdU!a4zU>a=p#W80?-ZtN@qVmt(3Z<B4<G}lbQl%0g#gc19^VR
B5HSD%


From d3fd37a4eb223ebc6eca5a085b1b6e51d6e720a7 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 26 Mar 2025 00:08:50 +0100
Subject: [PATCH 041/455] Models after 4th train

---
 modules/flowmldetection/model.bin  | Bin 1124 -> 1124 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index 5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8..3ab5a240bb45f88d026d1d9d1959cfa384e2473b 100644
GIT binary patch
delta 120
zcmV-;0EhqN2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv*
zM<l=vT(CiV7%o61L>A;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P
as{<<t5IC<!D(NIZlimYX0{noJHUuueR5UOE

delta 120
zcmaFD@q}YTFmr_2vYaFSxkd-_3;rCozh!)2lYmpEbEdGvA?<Hh%FhcQcr$C8Y3pPC
z18<ZS-x*q|IJlZU`}#BRi@kOC+S)TqI1eyD!9BgtJUbI~;rz*~nYB5XJEU)ltaLsw
P`5to^Q^<kIW-NLDlQb|V

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
index 821344a0c69d116622b02e2a0daa1554cb5d308e..29df65342047c5a499ee3f8e602d1f47cb7e9fca 100644
GIT binary patch
delta 290
zcmV+-0p0%k2KolDvjGB90h6@>PghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB
z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m
z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A
za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o
zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9
ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}Dxj<tAT~->ML6cVlBO2|D9RL6T

delta 290
zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG
zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa
zjm~^LK$BYnCRg4Lu^9H~BS8=X&<+7gXFor!l)9lJXF)RTPqWf%b3r!tJEWTtnm`Ac
zf^|*Gc0elJHxDoXo<X#jLG{%4&p`{bHVfI|sz2Ot$hvzr(m{gu&Jr}1sgu|OC|8kA
z&D_B~i9nx5TWFn3t3OFIt}1{vt3Z{LgvGX?u|Rx7|2|A?L_o(e@1J}aGeCBvuU^0C
o#Xz54;5X{~-$0+hirZ#X%|CDEfPfMb;Xu)C9icy+Op{jwBi>ty4FCWD


From 9efd09bf3ca9fc3de4899135fd286db07b8df3d8 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 26 Mar 2025 08:28:59 +0100
Subject: [PATCH 042/455] Models of ml flow with the first good performance in
 small tests

---
 modules/flowmldetection/model.bin  | Bin 1124 -> 1124 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index 3ab5a240bb45f88d026d1d9d1959cfa384e2473b..a6648cf72179520975b0e9ad1164f7d574e87140 100644
GIT binary patch
delta 121
zcmV-<0EYkM2;>N`Qv+C~&P*9hb`!w*mV~|wLl?l5mFM4w$NRqlOAcXmpyfaAG1();
zYA3+ulRv$a;~zkIU#E>ocI-ba#L|>%Hv~ZN`Y<G(@KnDch(0mn<s!fm-ctwS<Lp0^
bs{<<tAstXx>?b6^limYW1McwvlQsk{8#y@u

delta 121
zcmV-<0EYkM2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv*
zM<l=vT(CiV7%o61L>A;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P
bs{<<t5IC<!D(NIZlimYW1LXXGlQsk{_eeDq

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
index 29df65342047c5a499ee3f8e602d1f47cb7e9fca..17115724b9536f6093f9d72f3b58a5c22c562a9a 100644
GIT binary patch
delta 290
zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG
zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa
zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r-wJEWTtnm`Ac
zf^|*Gc0elJHxDoXo<X&kLG{%4&p`{bHVfI|sz2Uv$hvzr(m{gu&Jr}1sgu|OC|8h9
z&D_B~i9nx5TWFn3t3OFIt}1{vt3Z{LgvGX?u|R!8|2|A?L_o(e@1J}aGeCBvuU^0C
o#Xz85;5X{~-$0+hirZ#X%|CGFfPfMb;Xu)C9icy+Op{jwBjCh|6aWAK

delta 290
zcmV+-0p0%k2KolDvjGB90h6@>PghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB
z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m
z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A
za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o
zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9
ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}Dxj<tAT~->ML6cVlBO2|D9RL6T


From 978f87cb89f5ec6dfdea380afb76aa952b77bb38 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 043/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 154 ++++++++++++++++++++-
 1 file changed, 150 insertions(+), 4 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index e44ac83f4d..16b67e9038 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -10,6 +10,7 @@
 import json
 import traceback
 import warnings
+import sys
 
 from slips_files.common.parsers.config_parser import ConfigParser
 from slips_files.common.slips_utils import utils
@@ -120,6 +121,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """
@@ -132,7 +268,7 @@ def process_features(self, dataset):
             for proto in to_discard:
                 dataset = dataset[dataset.proto != proto]
 
-            # For now, discard the ports
+            # For now, discard these
             to_drop = [
                 "appproto",
                 "daddr",
@@ -155,15 +291,25 @@ def process_features(self, dataset):
                 except (ValueError, KeyError):
                     pass
 
+            # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
+            # So transform here
+            #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
+            dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
+
+            #dataset.state = new_state_column
+
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(
-                r"(^.*NotEstablished.*$)", "0", regex=True
+                r"(^.*Not Established.*$)", "0", regex=True
             )
             dataset.state = dataset.state.str.replace(
                 r"(^.*Established.*$)", "1", regex=True
             )
-            # Convert proto to categorical. For now we only have few states,
-            # so we can hardcode...
+
+            # Convert categories to floats
+            dataset.state = dataset.state.astype("float64")
+
+            # Convert proto to categorical. For now we only have few states, so we can hardcode...
             # We dont use the data to create categories because in testing mode
             # we dont see all the protocols
             # Also we dont store the Categorizer because the user can retrain

From a5dd40500fc88636982bcbb9dd8bf05803dbb3cc Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:35:21 +0300
Subject: [PATCH 044/455] state_handler: split get_final_state_from_flags()
 into smaller functions

---
 slips_files/common/state_handler.py | 179 ++++++++++++++++++++++++++++
 1 file changed, 179 insertions(+)
 create mode 100644 slips_files/common/state_handler.py

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
new file mode 100644
index 0000000000..b671a09a28
--- /dev/null
+++ b/slips_files/common/state_handler.py
@@ -0,0 +1,179 @@
+from typing import Optional
+import sys
+import traceback
+
+
+def check_suricata_states(state) -> Optional[str]:
+    """
+    There are different states in which a flow can be.
+    Suricata distinguishes three flow-states for TCP and two for
+     UDP. For TCP,
+    these are: New, Established and Closed,for UDP only new and
+    established.
+    For each of these states Suricata can employ different timeouts.
+    """
+    if "new" in state or "established" in state:
+        return "Established"
+    elif "closed" in state:
+        return "Not Established"
+
+
+def check_zeek_states(state) -> Optional[str]:
+    # We have varius type of states depending on the type of flow.
+    # For Zeek
+    if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+        return "Not Established"
+    elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+        return "Established"
+
+
+def check_argus_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    suf = state.split("_")[1]
+    if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+        """
+        Examples:
+        SA_SA
+        SR_SA
+        FSRA_SA
+        SPA_SPA
+        SRA_SPA
+        FSA_FSA
+        FSA_FSPA
+        SAEC_SPA
+        SRPA_SPA
+        FSPA_SPA
+        FSRPA_SPA
+        FSPA_FSPA
+        FSRA_FSPA
+        SRAEC_SPA
+        FSPA_FSRPA
+        FSAEC_FSPA
+        FSRPA_FSPA
+        SRPAEC_SPA
+        FSPAEC_FSPA
+        SRPAEC_FSRPA
+        """
+        return "Established"
+    elif "PA" in pre and "PA" in suf:
+        # Tipical flow that was reported in the middle
+        """
+        Examples:
+        PA_PA
+        FPA_FPA
+        """
+        return "Established"
+    elif "ECO" in pre:
+        return "ICMP Echo"
+    elif "ECR" in pre:
+        return "ICMP Reply"
+    elif "URH" in pre:
+        return "ICMP Host Unreachable"
+    elif "URP" in pre:
+        return "ICMP Port Unreachable"
+    else:
+        """
+        Examples:
+        S_RA
+        S_R
+        A_R
+        S_SA
+        SR_SA
+        FA_FA
+        SR_RA
+        SEC_RA
+        """
+        return "Not Established"
+
+
+def check_tcp_states(state, pkts) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "EST" in pre:
+        # TCP
+        return "Established"
+    elif "RST" in pre:
+        # TCP. When -z B is not used in argus, states are single words.
+        # Most connections are reseted when finished and therefore are
+        # established
+        # It can happen that is reseted being not established, but we
+        # can't tell without -z b.
+        # So we use as heuristic the amount of packets. If <=3, then is
+        # not established because the OS retries 3 times.
+        return "Not Established" if int(pkts) <= 3 else "Established"
+    elif "FIN" in pre:
+        # TCP. When -z B is not used in argus, states are single words.
+        # Most connections are finished with FIN when finished and
+        # therefore are established
+        # It can happen that is finished being not established, but we
+        # can't tell without -z b.
+        # So we use as heuristic the amount of packets. If <=3, then is
+        # not established because the OS retries 3 times.
+        return "Not Established" if int(pkts) <= 3 else "Established"
+    else:
+        """
+        Examples:
+        S_
+        FA_
+        PA_
+        FSA_
+        SEC_
+        SRPA_
+        """
+        return "Not Established"
+
+
+def check_udp_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "CON" in pre:
+        # UDP
+        return "Established"
+    elif "INT" in pre:
+        # UDP trying to connect, NOT preciselly not established but also
+        # NOT 'Established'. So we considered not established because there
+        # is no confirmation of what happened.
+        return "Not Established"
+
+
+def check_icmp_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "ECO" in pre:
+        # ICMP
+        return "Established"
+    elif "UNK" in pre:
+        # ICMP6 unknown upper layer
+        return "Established"
+
+
+def get_final_state_from_flags(self, state, pkts) -> str:
+    """
+    Analyze the flags given and return a summary of the state.
+    Should work with Argus and Bro flags
+    We receive the pakets to distinguish some Reset connections
+    """
+    try:
+        if state := check_suricata_states(state):
+            return state
+        if state := check_zeek_states(state):
+            return state
+        if state := check_argus_states(state):
+            return state
+    except IndexError:
+        # suf does not exist, which means that this is some ICMP or
+        # no response was sent for UDP or TCP
+        if state := check_icmp_states(state):
+            return state
+        if state := check_udp_states(state):
+            return state
+        if state := check_tcp_states(state, pkts):
+            return state
+
+        return "Not Established"
+
+    except Exception:
+        exception_line = sys.exc_info()[2].tb_lineno
+        self.print(
+            f"Error in get_final_state_from_flags() " f"line {exception_line}",
+            0,
+            1,
+        )
+        self.print(traceback.format_exc(), 0, 1)

From 3579edc92ec3832c3116a3180af419029cb89b66 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:48:16 +0300
Subject: [PATCH 045/455] state_handler: refactor get_final_state_from_flags()

---
 slips_files/common/state_handler.py | 67 +++++++++++++----------------
 1 file changed, 29 insertions(+), 38 deletions(-)

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
index b671a09a28..d0a05115bd 100644
--- a/slips_files/common/state_handler.py
+++ b/slips_files/common/state_handler.py
@@ -1,9 +1,7 @@
 from typing import Optional
-import sys
-import traceback
 
 
-def check_suricata_states(state) -> Optional[str]:
+def interpret_suricata_states(state) -> Optional[str]:
     """
     There are different states in which a flow can be.
     Suricata distinguishes three flow-states for TCP and two for
@@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_zeek_states(state) -> Optional[str]:
+def interpret_zeek_states(state) -> Optional[str]:
     # We have varius type of states depending on the type of flow.
     # For Zeek
     if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
@@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]:
         return "Established"
 
 
-def check_argus_states(state) -> Optional[str]:
+def interpret_argus_states(state) -> Optional[str]:
     pre = state.split("_")[0]
-    suf = state.split("_")[1]
+    try:
+        suf = state.split("_")[1]
+    except IndexError:
+        return
+
     if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
         """
         Examples:
@@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_tcp_states(state, pkts) -> Optional[str]:
+def interpret_tcp_states(state, pkts) -> Optional[str]:
     pre = state.split("_")[0]
     if "EST" in pre:
         # TCP
@@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]:
         return "Not Established"
 
 
-def check_udp_states(state) -> Optional[str]:
+def interpret_udp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "CON" in pre:
         # UDP
@@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_icmp_states(state) -> Optional[str]:
+def interpret_icmp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "ECO" in pre:
         # ICMP
@@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]:
         return "Established"
 
 
-def get_final_state_from_flags(self, state, pkts) -> str:
+def get_final_state_from_flags(state, pkts) -> str:
     """
-    Analyze the flags given and return a summary of the state.
-    Should work with Argus and Bro flags
-    We receive the pakets to distinguish some Reset connections
+    Converts the original flags from the flow, to a state that slips
+    understands
+    Works with Argus, suricata, and Bro flags
+    We receive the packets to distinguish some Reset connections
     """
-    try:
-        if state := check_suricata_states(state):
-            return state
-        if state := check_zeek_states(state):
-            return state
-        if state := check_argus_states(state):
-            return state
-    except IndexError:
-        # suf does not exist, which means that this is some ICMP or
-        # no response was sent for UDP or TCP
-        if state := check_icmp_states(state):
-            return state
-        if state := check_udp_states(state):
-            return state
-        if state := check_tcp_states(state, pkts):
-            return state
 
-        return "Not Established"
+    for interpreter in (
+        interpret_suricata_states,
+        interpret_zeek_states,
+        interpret_argus_states,
+        interpret_icmp_states,
+        interpret_udp_states,
+    ):
+        if interpreted_state := interpreter(state):
+            return interpreted_state
+
+    if interpreted_state := interpret_tcp_states(state, pkts):
+        return interpreted_state
 
-    except Exception:
-        exception_line = sys.exc_info()[2].tb_lineno
-        self.print(
-            f"Error in get_final_state_from_flags() " f"line {exception_line}",
-            0,
-            1,
-        )
-        self.print(traceback.format_exc(), 0, 1)
+    return "Not Established"

From 47d65ed1ef5545777e0aef73e13ba14dd231b51c Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 046/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py    | 150 ++----------------
 slips_files/core/database/database_manager.py |   3 -
 2 files changed, 10 insertions(+), 143 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 16b67e9038..3379f5077f 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """
@@ -291,12 +156,17 @@ def process_features(self, dataset):
                 except (ValueError, KeyError):
                     pass
 
-            # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
+            # When flows are read from Slips sqlite,
+            # the state is not transformed to 'Established' or
+            # 'Not Established', it is still 'S0' and others
             # So transform here
-            #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
-            dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
-
-            #dataset.state = new_state_column
+            dataset["state"] = dataset.apply(
+                lambda row: get_final_state_from_flags(
+                    row["state"], row["pkts"]
+                ),
+                axis=1,
+            )
+            # dataset.state = new_state_column
 
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(
diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py
index e8ca3aaf62..b4b2128d3d 100644
--- a/slips_files/core/database/database_manager.py
+++ b/slips_files/core/database/database_manager.py
@@ -613,9 +613,6 @@ def add_out_dns(self, *args, **kwargs):
     def add_port(self, *args, **kwargs):
         return self.rdb.add_port(*args, **kwargs)
 
-    def get_final_state_from_flags(self, *args, **kwargs):
-        return self.rdb.get_final_state_from_flags(*args, **kwargs)
-
     def add_ips(self, *args, **kwargs):
         return self.rdb.add_ips(*args, **kwargs)
 

From 55ce0bbf1fdb8ae5ebeea066fd3efe07cab9a0b8 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 047/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 3379f5077f..f052931c89 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,6 +121,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From a4446a54dfcb2299392a2e3a59d0d755de693153 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 048/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 135 ---------------------
 1 file changed, 135 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index f052931c89..3379f5077f 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From eead7b56753dba0923ef0ac41be1e3361ec70cd3 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 049/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 3379f5077f..f052931c89 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,6 +121,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 43ab23bbd9a7699efed3d731d83df88e53afa451 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 050/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 135 ---------------------
 1 file changed, 135 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index f052931c89..3379f5077f 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From df9417d840129cee1864a0a86c6ef33e82db1038 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 051/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 3379f5077f..f052931c89 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,6 +121,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From f5c4e0c67f148ad3f312ecea5801af9fd28a1877 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 052/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 135 ---------------------
 1 file changed, 135 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index f052931c89..3379f5077f 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From eda2d83e77f223bc8e436aae6bd17b7eb6c83ece Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 053/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 3379f5077f..f052931c89 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,6 +121,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From ed1997e45fbed8609b9ffb8787dbb61059d5d7a2 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 054/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 135 ---------------------
 1 file changed, 135 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index f052931c89..3379f5077f 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From c0d8b16d7fc7c2a404c5d0f3c18768bebd49aa0f Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 055/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 3379f5077f..f052931c89 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,6 +121,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 00e2ab175a7d00fca76d09433be4df6141ff4316 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 056/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 135 ---------------------
 1 file changed, 135 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index f052931c89..3379f5077f 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From ee417b90a570747d57c2bffc75ec54f0c3e22c73 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Sat, 15 Mar 2025 19:23:29 +0100
Subject: [PATCH 057/455] Re add function that alya added

---
 slips_files/core/database/database_manager.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py
index b4b2128d3d..e8ca3aaf62 100644
--- a/slips_files/core/database/database_manager.py
+++ b/slips_files/core/database/database_manager.py
@@ -613,6 +613,9 @@ def add_out_dns(self, *args, **kwargs):
     def add_port(self, *args, **kwargs):
         return self.rdb.add_port(*args, **kwargs)
 
+    def get_final_state_from_flags(self, *args, **kwargs):
+        return self.rdb.get_final_state_from_flags(*args, **kwargs)
+
     def add_ips(self, *args, **kwargs):
         return self.rdb.add_ips(*args, **kwargs)
 

From 774e03dc5598ccd9627fd1e1aece3b9e883f38fa Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Sat, 15 Mar 2025 19:25:03 +0100
Subject: [PATCH 058/455] delete sys

---
 modules/flowmldetection/flowmldetection.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 3379f5077f..c06755a599 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -10,7 +10,6 @@
 import json
 import traceback
 import warnings
-import sys
 
 from slips_files.common.parsers.config_parser import ConfigParser
 from slips_files.common.slips_utils import utils

From 6220c230c86e0cbfd8148829e684335cc62f2a8e Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Sat, 15 Mar 2025 19:27:23 +0100
Subject: [PATCH 059/455] Delete file that was deleted from develop

---
 slips_files/common/state_handler.py | 170 ----------------------------
 1 file changed, 170 deletions(-)
 delete mode 100644 slips_files/common/state_handler.py

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
deleted file mode 100644
index d0a05115bd..0000000000
--- a/slips_files/common/state_handler.py
+++ /dev/null
@@ -1,170 +0,0 @@
-from typing import Optional
-
-
-def interpret_suricata_states(state) -> Optional[str]:
-    """
-    There are different states in which a flow can be.
-    Suricata distinguishes three flow-states for TCP and two for
-     UDP. For TCP,
-    these are: New, Established and Closed,for UDP only new and
-    established.
-    For each of these states Suricata can employ different timeouts.
-    """
-    if "new" in state or "established" in state:
-        return "Established"
-    elif "closed" in state:
-        return "Not Established"
-
-
-def interpret_zeek_states(state) -> Optional[str]:
-    # We have varius type of states depending on the type of flow.
-    # For Zeek
-    if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-        return "Not Established"
-    elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-        return "Established"
-
-
-def interpret_argus_states(state) -> Optional[str]:
-    pre = state.split("_")[0]
-    try:
-        suf = state.split("_")[1]
-    except IndexError:
-        return
-
-    if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-        """
-        Examples:
-        SA_SA
-        SR_SA
-        FSRA_SA
-        SPA_SPA
-        SRA_SPA
-        FSA_FSA
-        FSA_FSPA
-        SAEC_SPA
-        SRPA_SPA
-        FSPA_SPA
-        FSRPA_SPA
-        FSPA_FSPA
-        FSRA_FSPA
-        SRAEC_SPA
-        FSPA_FSRPA
-        FSAEC_FSPA
-        FSRPA_FSPA
-        SRPAEC_SPA
-        FSPAEC_FSPA
-        SRPAEC_FSRPA
-        """
-        return "Established"
-    elif "PA" in pre and "PA" in suf:
-        # Tipical flow that was reported in the middle
-        """
-        Examples:
-        PA_PA
-        FPA_FPA
-        """
-        return "Established"
-    elif "ECO" in pre:
-        return "ICMP Echo"
-    elif "ECR" in pre:
-        return "ICMP Reply"
-    elif "URH" in pre:
-        return "ICMP Host Unreachable"
-    elif "URP" in pre:
-        return "ICMP Port Unreachable"
-    else:
-        """
-        Examples:
-        S_RA
-        S_R
-        A_R
-        S_SA
-        SR_SA
-        FA_FA
-        SR_RA
-        SEC_RA
-        """
-        return "Not Established"
-
-
-def interpret_tcp_states(state, pkts) -> Optional[str]:
-    pre = state.split("_")[0]
-    if "EST" in pre:
-        # TCP
-        return "Established"
-    elif "RST" in pre:
-        # TCP. When -z B is not used in argus, states are single words.
-        # Most connections are reseted when finished and therefore are
-        # established
-        # It can happen that is reseted being not established, but we
-        # can't tell without -z b.
-        # So we use as heuristic the amount of packets. If <=3, then is
-        # not established because the OS retries 3 times.
-        return "Not Established" if int(pkts) <= 3 else "Established"
-    elif "FIN" in pre:
-        # TCP. When -z B is not used in argus, states are single words.
-        # Most connections are finished with FIN when finished and
-        # therefore are established
-        # It can happen that is finished being not established, but we
-        # can't tell without -z b.
-        # So we use as heuristic the amount of packets. If <=3, then is
-        # not established because the OS retries 3 times.
-        return "Not Established" if int(pkts) <= 3 else "Established"
-    else:
-        """
-        Examples:
-        S_
-        FA_
-        PA_
-        FSA_
-        SEC_
-        SRPA_
-        """
-        return "Not Established"
-
-
-def interpret_udp_states(state) -> Optional[str]:
-    pre = state.split("_")[0]
-    if "CON" in pre:
-        # UDP
-        return "Established"
-    elif "INT" in pre:
-        # UDP trying to connect, NOT preciselly not established but also
-        # NOT 'Established'. So we considered not established because there
-        # is no confirmation of what happened.
-        return "Not Established"
-
-
-def interpret_icmp_states(state) -> Optional[str]:
-    pre = state.split("_")[0]
-    if "ECO" in pre:
-        # ICMP
-        return "Established"
-    elif "UNK" in pre:
-        # ICMP6 unknown upper layer
-        return "Established"
-
-
-def get_final_state_from_flags(state, pkts) -> str:
-    """
-    Converts the original flags from the flow, to a state that slips
-    understands
-    Works with Argus, suricata, and Bro flags
-    We receive the packets to distinguish some Reset connections
-    """
-
-    for interpreter in (
-        interpret_suricata_states,
-        interpret_zeek_states,
-        interpret_argus_states,
-        interpret_icmp_states,
-        interpret_udp_states,
-    ):
-        if interpreted_state := interpreter(state):
-            return interpreted_state
-
-    if interpreted_state := interpret_tcp_states(state, pkts):
-        return interpreted_state
-
-    return "Not Established"

From 4d2dd99cbec81de085e35ce087ab8ac634908768 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Sat, 15 Mar 2025 19:32:01 +0100
Subject: [PATCH 060/455] Flowmldetection. Fix missing db reference

---
 modules/flowmldetection/flowmldetection.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index c06755a599..87e07c7592 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -160,7 +160,7 @@ def process_features(self, dataset):
             # 'Not Established', it is still 'S0' and others
             # So transform here
             dataset["state"] = dataset.apply(
-                lambda row: get_final_state_from_flags(
+                lambda row: self.db.get_final_state_from_flags(
                     row["state"], row["pkts"]
                 ),
                 axis=1,

From f0e53cfc658c31e6046ed2cf4741819c89517576 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Tue, 18 Mar 2025 12:08:08 +0100
Subject: [PATCH 061/455] Fix the training of flows with ML in new version

---
 modules/flowmldetection/flowmldetection.py | 144 +++++++++++----------
 1 file changed, 77 insertions(+), 67 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 87e07c7592..e91495d649 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -55,8 +55,12 @@ def init(self):
         # Set the output queue of our database instance
         # Read the configuration
         self.read_configuration()
-        # Minum amount of new lables needed to trigger the train
-        self.minimum_lables_to_retrain = 50
+        # Minum amount of new labels needed to start the train
+        self.minimum_labels_to_start_train = 50
+        # Minum amount of new labels needed to retrain
+        self.minimum_labels_to_retrain = 50
+        # The number of flows when last trained
+        self.last_number_of_flows_when_trained = 0
         # To plot the scores of training
         # self.scores = []
         # The scaler trained during training and to use during testing
@@ -67,26 +71,25 @@ def init(self):
     def read_configuration(self):
         conf = ConfigParser()
         self.mode = conf.get_ml_mode()
+        self.label = conf.label()
 
     def train(self):
         """
         Train a model based on the flows we receive and the labels
         """
         try:
-            # Process the labels to have only Normal and Malware
-            self.flows.label = self.flows.label.str.replace(
-                r"(^.*ormal.*$)", "Normal", regex=True
-            )
-            self.flows.label = self.flows.label.str.replace(
-                r"(^.*alware.*$)", "Malware", regex=True
-            )
-            self.flows.label = self.flows.label.str.replace(
-                r"(^.*alicious.*$)", "Malware", regex=True
-            )
+            # Get the flows from the DB
+            # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid)
+            # Convert to pandas df
+            # self.flows = pd.DataFrame(self.flows)
+            # Process the features
+            # X_flow = self.process_features(self.flows)
 
-            # Separate
-            y_flow = self.flows["label"]
+            # Create X_flow with the current flows minus the label
             X_flow = self.flows.drop("label", axis=1)
+            # Create y_flow with the label
+            y_flow = numpy.full(X_flow.shape[0], self.label)
+            # Drop the module_labels
             X_flow = X_flow.drop("module_labels", axis=1)
 
             # Normalize this batch of data so far. This can get progressivle slow
@@ -95,7 +98,7 @@ def train(self):
             # Train
             try:
                 self.clf.partial_fit(
-                    X_flow, y_flow, classes=["Malware", "Normal"]
+                    X_flow, y_flow, classes=["Malicious", "Benign"]
                 )
             except Exception:
                 self.print("Error while calling clf.train()")
@@ -118,7 +121,7 @@ def train(self):
             self.store_model()
 
         except Exception:
-            self.print("Error in train()", 0, 1)
+            self.print("Error in train().", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
@@ -144,9 +147,7 @@ def process_features(self, dataset):
                 "history",
                 "uid",
                 "dir_",
-                "dbytes",
                 "endtime",
-                "bytes",
                 "flow_source",
             ]
             for field in to_drop:
@@ -161,11 +162,10 @@ def process_features(self, dataset):
             # So transform here
             dataset["state"] = dataset.apply(
                 lambda row: self.db.get_final_state_from_flags(
-                    row["state"], row["pkts"]
+                    row["state"], (row["spkts"] + row["dpkts"])
                 ),
                 axis=1,
             )
-            # dataset.state = new_state_column
 
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(
@@ -199,7 +199,11 @@ def process_features(self, dataset):
             dataset.proto = dataset.proto.str.replace(
                 r"(^.*arp.*$)", "4", regex=True
             )
-            fields_to_convert_to_flow = [
+
+            dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"]
+            dataset["pkts"] = dataset["spkts"] + dataset["dpkts"]
+
+            fields_to_convert_to_float = [
                 dataset.proto,
                 dataset.dport,
                 dataset.sport,
@@ -210,10 +214,10 @@ def process_features(self, dataset):
                 dataset.sbytes,
                 dataset.state,
             ]
-            for field in fields_to_convert_to_flow:
+            for field in fields_to_convert_to_float:
                 try:
                     field = field.astype("float64")
-                except ValueError:
+                except (ValueError, AttributeError):
                     pass
 
             return dataset
@@ -222,9 +226,9 @@ def process_features(self, dataset):
             self.print("Error in process_features()")
             self.print(traceback.format_exc(), 0, 1)
 
-    def process_flows(self):
+    def process_training_flows(self):
         """
-        Process all the flwos in the DB
+        Process all the flows in the DB
         Store the pandas df in self.flows
         """
         try:
@@ -240,44 +244,48 @@ def process_flows(self):
                 # that are fake but representative of a normal and malware flow
                 # they are only for the training process
                 # At least 1 flow of each label is required
-                # self.print(f'Amount of labeled flows: {labels}', 0, 1)
+
+                # These flows should be in the same format as the ones in the DB. 
+                # Which means the satate is still SF, S0, etc.
                 flows.append(
                     {
-                        "ts": 1594417039.029793,
+                        "starttime": 1594417039.029793,
                         "dur": "1.9424750804901123",
                         "saddr": "10.7.10.101",
                         "sport": "49733",
                         "daddr": "40.70.224.145",
                         "dport": "443",
                         "proto": "tcp",
-                        "state": "Established",
-                        "allbytes": 42764,
-                        "spkts": 37,
+                        "state": "SF",
+                        "spkts": 17,
+                        "dpkts": 27,
                         "sbytes": 25517,
+                        "dbytes": 17247,
                         "appproto": "ssl",
-                        "label": "Malware",
+                        "label": "Malicious",
                         "module_labels": {
-                            "flowalerts-long-connection": "Malware"
+                            "flowalerts-long-connection": "Malicious"
                         },
                     }
                 )
                 flows.append(
                     {
-                        "ts": 1382355032.706468,
+                        "starttime": 1382355032.706468,
                         "dur": "10.896695",
                         "saddr": "147.32.83.52",
                         "sport": "47956",
                         "daddr": "80.242.138.72",
                         "dport": "80",
                         "proto": "tcp",
-                        "state": "Established",
-                        "allbytes": 67696,
+                        "state": "SF",
                         "spkts": 1,
+                        "dpkts": 0,
                         "sbytes": 100,
+                        "dbytes": 67596,
                         "appproto": "http",
-                        "label": "Normal",
+                        "label": "Benign",
                         "module_labels": {
-                            "flowalerts-long-connection": "Normal"
+                            "flowalerts-long-connection": "Benign"
                         },
                     }
                 )
@@ -318,7 +326,6 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
         and returns the predection array
         """
         try:
-            given_x_flow = x_flow
             # clean the flow
             fields_to_drop = [
                 "label",
@@ -326,10 +333,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
                 "uid",
                 "history",
                 "dir_",
-                "dbytes",
-                "dpkts",
                 "endtime",
-                "bytes",
                 "flow_source",
                 "ground_truth_label",  # todo now we can use them
                 "detailed_ground_truth_label",
@@ -345,7 +349,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
             return pred
         except Exception as e:
             self.print(
-                f"Error in detect() while processing " f"\n{given_x_flow}\n{e}"
+                f"Error in detect() while processing " f"\n{x_flow}\n{e}"
             )
             self.print(traceback.format_exc(), 0, 1)
 
@@ -437,18 +441,16 @@ def pre_main(self):
 
     def main(self):
         if msg := self.get_msg("new_flow"):
+            # When a new flow arrives
             msg = json.loads(msg["data"])
-            twid = msg["twid"]
+            self.twid = msg["twid"]
+            self.profileid = msg["profileid"]
             self.flow = msg["flow"]
-            # these fields are expected in testing. update the original
+            # These following extra fields are expected in testing. update the original
             # flow dict to have them
             self.flow.update(
                 {
-                    "allbytes": (self.flow["sbytes"] + self.flow["dbytes"]),
-                    # the flow["state"] is the origstate, we dont need that here
-                    # we need the interpreted state
                     "state": msg["interpreted_state"],
-                    "pkts": self.flow["spkts"] + self.flow["dpkts"],
                     "label": msg["label"],
                     "module_labels": msg["module_labels"],
                 }
@@ -461,23 +463,31 @@ def main(self):
                 # Use labeled flows
                 labels = self.db.get_labels()
                 sum_labeled_flows = sum(i[1] for i in labels)
+
+                # The min labels to retrain is the min number of flows 
+                # we should have seen so far in this capture to start training
+                # This is so we dont _start_ training with only 1 flow
+
+                # Once we are over the start minimum, the second condition is 
+                # to force to retrain every a minimum_labels_to_retrain number
+                # of flows. So we dont retrain every 1 flow.
                 if (
-                    sum_labeled_flows >= self.minimum_lables_to_retrain
-                    and sum_labeled_flows % self.minimum_lables_to_retrain == 1
+                    sum_labeled_flows >= self.minimum_labels_to_start_train
                 ):
-                    # We get here every 'self.minimum_lables_to_retrain'
-                    # amount of labels
-                    # So for example we retrain every 100 labels and only when
-                    # we have at least 100 labels
-                    self.print(
-                        f"Training the model with the last group of "
-                        f"flows and labels. Total flows: {sum_labeled_flows}."
-                    )
-                    # Process all flows in the DB and make them ready
-                    # for pandas
-                    self.process_flows()
-                    # Train an algorithm
-                    self.train()
+                    if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain):
+                        # So for example we retrain every 50 labels and only when
+                        # we have at least 50 labels
+                        self.print(
+                            f"Training the model with the last group of "
+                            f"flows and labels. Total flows: {sum_labeled_flows}."
+                        )
+                        # Process all flows in the DB and make them ready
+                        # for pandas
+                        self.process_training_flows()
+                        # Train an algorithm
+                        self.train()
+                        self.last_number_of_flows_when_trained = sum_labeled_flows
+
             elif self.mode == "test":
                 # We are testing, which means using the model to detect
                 processed_flow = self.process_flow(self.flow)
@@ -497,8 +507,8 @@ def main(self):
                         # and the label is diff from the prediction,
                         # print in debug mode
                         self.print(
-                            f"Report Prediction {pred[0]} for label"
-                            f' {label} flow {self.flow["saddr"]}:'
+                            f"Predicted {pred[0]} for ground-truth label"
+                            f' {label}. Flow {self.flow["saddr"]}:'
                             f'{self.flow["sport"]} ->'
                             f' {self.flow["daddr"]}:'
                             f'{self.flow["dport"]}/'
@@ -506,9 +516,9 @@ def main(self):
                             0,
                             3,
                         )
-                    if pred[0] == "Malware":
+                    if pred[0] == "Malicious":
                         # Generate an alert
-                        self.set_evidence_malicious_flow(self.flow, twid)
+                        self.set_evidence_malicious_flow(self.flow, self.twid)
                         self.print(
                             f"Prediction {pred[0]} for label {label}"
                             f' flow {self.flow["saddr"]}:'

From b7e82cf985596d60b66ee7ac7d2a7052a0b986dc Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Tue, 18 Mar 2025 12:08:29 +0100
Subject: [PATCH 062/455] Fix the profiler handler for cases of nan in state

---
 .../core/database/redis_db/profile_handler.py     | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/slips_files/core/database/redis_db/profile_handler.py b/slips_files/core/database/redis_db/profile_handler.py
index 0489372cdc..1ea7644648 100644
--- a/slips_files/core/database/redis_db/profile_handler.py
+++ b/slips_files/core/database/redis_db/profile_handler.py
@@ -379,7 +379,12 @@ def get_final_state_from_flags(self, state, pkts):
         We receive the pakets to distinguish some Reset connections
         """
         try:
-            pre = state.split("_")[0]
+            # In some flows the state is a nan
+            try:
+                pre = state.split("_")[0]
+            except AttributeError:
+                pre = ''
+
             try:
                 # Try suricata states
                 """
@@ -401,7 +406,11 @@ def get_final_state_from_flags(self, state, pkts):
                     return "Established"
 
                 # For Argus
-                suf = state.split("_")[1]
+            # In some flows the state is a nan
+                try:
+                    suf = state.split("_")[1]
+                except AttributeError:
+                    suf = ''
                 if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
                     """
                     Examples:
@@ -502,7 +511,7 @@ def get_final_state_from_flags(self, state, pkts):
         except Exception:
             exception_line = sys.exc_info()[2].tb_lineno
             self.print(
-                f"Error in getFinalStateFromFlags() in database.py line {exception_line}",
+                f"Error in get_final_state_from_flags() in profile_handler.py line {exception_line}",
                 0,
                 1,
             )

From ccde23ede2ac27f38809ce5f1bf2e5518c1d73c1 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 19 Mar 2025 14:20:42 +0100
Subject: [PATCH 063/455] slips.yaml. Update to have correct labels. By default
 test. Default training label is benign

---
 config/slips.yaml | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/config/slips.yaml b/config/slips.yaml
index f7089b41af..8736eaf511 100644
--- a/config/slips.yaml
+++ b/config/slips.yaml
@@ -106,13 +106,12 @@ parameters:
   deletePrevdb: true
 
   # Set the label for all the flows that are being read.
-  # For now only normal and malware directly. No option for setting labels
-  # with a filter
+  # For now only Benign and Malicious (Capitalized)
   # The purpose is to be used in the training of ML models and to output
   # flows with labels for other tools.
-  # label: malicious
-  # label: unknown
-  label: normal
+  # label: Malicious
+  # label: Benign
+  label: Benign
   # If Zeek files are rotated or not to avoid running out of disk.
   # Zeek rotation is enabled by default when using an interface,
   # which means Slips will delete all Zeek log files after 1 day

From 667faa3f1bc572053f530e0e8b3e8ca40ef19976 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 19 Mar 2025 14:21:21 +0100
Subject: [PATCH 064/455] First ipython to test ML flow related models

---
 modules/flowmldetection/flowmlanalysis.ipynb | 76 ++++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100644 modules/flowmldetection/flowmlanalysis.ipynb

diff --git a/modules/flowmldetection/flowmlanalysis.ipynb b/modules/flowmldetection/flowmlanalysis.ipynb
new file mode 100644
index 0000000000..d726cd2805
--- /dev/null
+++ b/modules/flowmldetection/flowmlanalysis.ipynb
@@ -0,0 +1,76 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Analysis of Flows with Machine Learning for Slips"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Analysis of a fixed list of flows to try techniques and find parameters"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy\n",
+    "from sklearn.linear_model import SGDClassifier\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "import pickle\n",
+    "import pandas as pd\n",
+    "import json\n",
+    "import traceback\n",
+    "import warnings"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "slips-new",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

From ad1488054068bc9d5bc3b596f04248523ef42a83 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 19 Mar 2025 14:22:38 +0100
Subject: [PATCH 065/455] flowml. If the dataset has one flow and that is
 deleted, then return empty fast.

---
 modules/flowmldetection/flowmldetection.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index e91495d649..58b4ce1e4c 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -135,6 +135,11 @@ def process_features(self, dataset):
             for proto in to_discard:
                 dataset = dataset[dataset.proto != proto]
 
+            # If te proto is in the list to delete and there is only one flow, then the dataset will be empty
+            if dataset.empty:
+                # DataFrame is empty now, so return empty
+                return dataset
+
             # For now, discard these
             to_drop = [
                 "appproto",

From 02804ca94b80f7a24374b36ec073af55aa272c3c Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 19 Mar 2025 14:23:05 +0100
Subject: [PATCH 066/455] flowml. If the dataset is empty. Return none

---
 modules/flowmldetection/flowmldetection.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 58b4ce1e4c..4a4d46e376 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -318,6 +318,8 @@ def process_flow(self, flow_to_process: dict):
             # Convert the flow to a pandas dataframe
             raw_flow = pd.DataFrame(flow_to_process, index=[0])
             dflow = self.process_features(raw_flow)
+            if dflow.empty:
+                return None
             # Update the flow to the processed version
             return dflow
         except Exception:

From dea7702d8b5518fc4fc2d2fd5262e45c0ddec65d Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 19 Mar 2025 14:26:42 +0100
Subject: [PATCH 067/455] profile_handler. Small bug in how we handled the
 profiles, we were using 'in' instead of == for established. Some not
 established MAY not have been correctly captured

---
 slips_files/core/database/redis_db/profile_handler.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/slips_files/core/database/redis_db/profile_handler.py b/slips_files/core/database/redis_db/profile_handler.py
index 1ea7644648..85fdec5a63 100644
--- a/slips_files/core/database/redis_db/profile_handler.py
+++ b/slips_files/core/database/redis_db/profile_handler.py
@@ -393,9 +393,10 @@ def get_final_state_from_flags(self, state, pkts):
                 these are: New, Established and Closed,for UDP only new and established.
                 For each of these states Suricata can employ different timeouts.
                 """
-                if "new" in state or "established" in state:
+                # This is controversial, but if we dont have a good state, we consider it not established for now
+                if "new" in state or state.lower() == "established":
                     return "Established"
-                elif "closed" in state:
+                elif "closed" in state or state.lower() == 'not established':
                     return "Not Established"
 
                 # We have varius type of states depending on the type of flow.
@@ -406,7 +407,6 @@ def get_final_state_from_flags(self, state, pkts):
                     return "Established"
 
                 # For Argus
-            # In some flows the state is a nan
                 try:
                     suf = state.split("_")[1]
                 except AttributeError:

From f7f2eb3b80d90e0dc31d3fcfe7394d11650f84f4 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 19 Mar 2025 14:27:16 +0100
Subject: [PATCH 068/455] First new version of the model and scaler. Not good
 yet, but working.

---
 modules/flowmldetection/model.bin  | Bin 1073 -> 1090 bytes
 modules/flowmldetection/scaler.bin | Bin 666 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index aef4cba35b7b18287b2be11df2c45e9187d053e0..7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f 100644
GIT binary patch
delta 411
zcmdnUafpMZfn}=sMwVTSLVS7gX{m`NrA4X5@tJw?Q@pt+?`4dsSMFg=DXA<-oice!
z4^yb+l+KR!DM3>-yqS9#ZKnA7`FX2&vv_lqOiAi=_HbfgU`PexJRr^l;&dn*NWp<G
zkd+9;IY10jlMJQvfpjSl7X$GWpm9B7@u?LBiIY2+gw>QXRJ|Emr(~!@tY-9P0&;YE
zn9?i|&dD&Ce2~eg9z!XkH+S2V4C_wkh?WyPJF_Dc9WKfLlU4d@b|Cws{>t}{)DLw1
z+&}AzmCAv&TP(zH?>}O{hvWaISvjr_FKvFL2{5=iNW7SLz3Rc`{Vb{$-4P-p4qe>8
zsxB_naJaLsc#$rfl>^wdT$y<#sYS_psRbqRQ!+4J0dz3X4IB|u^h2FOBOrSC<0o%q
b_GNYGp25H{nVChIi`7uiOwVviX_6iQocV_5

delta 380
zcmX@av5|wNfn}=NMwVTSlP@wxCo1)@rj%3`q)wSUrH3ihd`f3W`;?$58s5x3j5bsJ
z{QSISyji?CN~R=rI$QWMFfb$naSjldL)k?@Iu(fhfH)tBLHt}GhN)wiJeNsPO)*2o
zo1t||h8o0hMsFq{N4tk9%>rRzhW_LyOh)w>N*TR5+ps7Gy3Ol95P%792Cxe~W+qIF
za^V+n06G8#1cs%Z;j-Fjug&$3=gY2%`}>OzXw^50I4Ib2ygQ=8?eISD+{&-(1rNNN
zW*@<QMemXQ6rg!MT$y<#sYS`D1tsxQGBDi*^nz8VGl$HwGgBKKcpSi5_$D7__GNWU
Y$oltSvM7r(H=BW;k%^wsl+q+U0L_+r^#A|>

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
index 9292bda6a6f307edcbe83de0ccfac2437950fa6f..bfba4d107224e5e6e5a1e8c8f4d463b48131d111 100644
GIT binary patch
literal 890
zcmZo*nVQec00uqM#o0NjiA8yO1x2X^Mfu68#l@L<>3Z=gi6x0sdia7%67y0Li&BD<
z6LV6Frc9oq**K+kibfA-d1gsQd~RxD-jt3W4lt*<Bn8N3NzN~*1k!?Osfi_}MXB+5
ziMgr8@tJw?Q+h=6N^=V;^^)_8QuT66b4oH3i;5B}r}Xf|7o{fW=M|R}l_r+}?dV|z
zE1c594ze4hWbzbmhSn+0j7d}4rvy#W@MiSpZk>|B)5DrlQdy7+(!%6#F{QHuBFo&v
zSTn`X&(G^W5P%79`;sY1ov}cNGpCdm0WAP}wIm-%7Z>Cgl>q4!C|z8TT~Z9@gK4(n
zq{@<1pa5G6l)+kDl30>DrH3OiCkHCT0@jlXw4sMDFCOY|h?l*&d-(I>ixYDTa)5zQ
zoSF&@h78Ugwqjs#CW3-9L$rq}6UF5=Xf9^}n(NGw#R!S7DQ!Ui_ppKj9%vGG1|KL~
zG6axA1!!U#l8G7ms38F|uhTi=d{gv8|4E7tk+E%iFU@~!UvXb5a&PSo2flmT-L8E;
z<KX(aCF;No0f(>8bB?zwe6@e~Ltx1rmTHK2M45x=S>Y_k<U{t7-@NM2t?O`b(oOJr
z^gz^M3Md#@$`XrkInNqs`NcqG+rqF4hjd*#=9SUs?C-J63^i@q?kIRi^+W#GZH^`;
z_s<+FuX0#b@Z7FWCEbCm^j(f1Uy0+U$5+2R=gD#Gag=$mqjtZ2f~w@S(sxacYyTMt
zX(&vBSPl-{RD9NRUiR6)W4EzGNqG>5fcAZRq4P``?^7;27=%1`Z_~Q$U{zGVUTcbp
z!_f@A=ox0(4vo(kWe&dF=y3Qof57>#YaOosUa>d6=$d`i{)GMC=j?EBVaVAqgWVBg
dy+AxPN5z+=78Pga=K-V3P|r-ya7t;C9st`jc!2-_

delta 525
zcmeyxHj9<Dfn{nx(?r(zNWQ%IwA93s(xTMj_{_ZcDc+nt{CV-kiMa(iK#}6qRG?Uo
zXkKY<L8V@Deo?AkZfQ<QW@1rMV&#+`w&LW(oW!Cjlc)5sf|UU2l#<GVRG`S@UPd*o
zK3xVdn9?>S2q>4Eng}#5gR6%pz9=<0Kd-o?s5H3*s5pan;tH91&ekaz{5?!*7E?Mq
z+JPE0yqPnE{rvpA{sRG+@Mb8PlGN$!ajPlrTH5m<hihBT2gkmB>hSAF{H=RwvmLf>
z0f8s>T))|-e^~LtKJ)mxuX}VF9GJI$RyS%Ec8u45-?98hw!_K^*XJ>X{BW28G_HrG
zEU^gaDyjMm8E*!l6XYOvGkSyEqXM*T;RVCs$??LDiHthAxexX@M(tU$UF%Yf<K@j$
z%OkDN+J7s4Zlk~8nEkR&OOX||2OO>Q^>gH&y>a4SnPazubB5!cqc1YnCGT<q+XfD^
zRFHk>))m9-o7^A!Lv41W!_hYO<*VziI$XJV#W|*OtHau9f5TQzy=Sjs!`l%1;+FkJ
z-k|Geeq3|-Iw{uqQ%0WS6}i^>);WhA{C@WaM)rg_g6$KCFV4<MO)ScbFH0>d&dkpP
QMw@}2k%^wsl+q+U0P2|NZU6uP


From 81b103d0dd8ef69f3cadec4ff92e8e6bbe2c0027 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Thu, 20 Mar 2025 13:16:06 +0100
Subject: [PATCH 069/455] model and scaler with 1 malicious and 1 benign

---
 modules/flowmldetection/model.bin  | Bin 1090 -> 1124 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index 7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f..0fac693b39f8e2f0e826471e72a52010709a2a4a 100644
GIT binary patch
delta 132
zcmX@a@q~k=fn{psMix!x$(NZ_BO`RCb>5jSY<wU!LGn&}gUNx$3tLYnGl@A&e4S}J
zBSY*!uI-OaTr&&~Tw#uuJJhD?&}O=|yn5R|`<q--?Rc;A9e69=F=gK7d-lipbZtG?
j>K$N!g2|oC+8oRs(l<p`Iv<$4hdGq_%F%|&A}o3UHQY43

delta 131
zcmV-}0DS-C2*L;ifCQB{u>>gtTsw(FScJ$7xocP<K++%n9U}QQz-#d@sqf+^z>4|5
zn$$WZz^$}67u&zYKfDP4vYKo~K<PaAW)J{HKp5zr*LvX7zXT;Xi&zvCK#B|cdeNaM
lK-{i#qAdnGK$EKjD+pMWFI7TSSU{8B162cym;jSD1TGrII8guq

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
index bfba4d107224e5e6e5a1e8c8f4d463b48131d111..758909b289238ff282b2e056a9b3e83768b8472a 100644
GIT binary patch
delta 290
zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG
zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa
zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r!tJEWTtnm`Ac
zf^|*Gc0elJHxDoXo<X#jLG{%4&p`{bHVfI|sz2Ot$hvzr(m{gu&Jr}1sgu|OC|8h9
z&D_B~i9nx5TWFn3t3OFIt}1{vt3Z{LgvGX?u|Rx7|2|A?L_o(e@1J}aGeCBvuU^0C
o#Xz54;5X{~-$0+hirZ#X%|CDEfPfMb;Xu)C9icy+Op{jwBi%8H3jhEB

delta 290
zcmV+-0p0%k2KolDvjGBX0h6@>Pgl=^Tj5WVAwXGOhP~3C<3D)c8d<%4*gy{6w?@|U
z%s@r+gj&Fu5J2_kY{!Qn^*`?T5Tx7$dq73=gj&E@c0d))6KVlt!ao`IN`KC-h(JOu
zU`*oR6_Z;6CRfo>BRz3ectB?@J_D&+&OhD+nN>A{w?PryCGc<cwm~#B-^|8$dO)gh
z=RSTUXFv*d?raebbV0J?)%52KY(b1c9N@Tpzdv9l8J2bKf<dkRFcK&rl9Sj1C|3#7
zOux9hGeC5AQV0+$-#-%10%-4K(?Bp(=SPMr(?B|Mf3GT(G(g2@E?by3D?owf0UW{U
ou|UJ?4^Yqbtw7cLsJ&lu)<1f`V88dAxIjbzY_OOIL6cVlBeWBRwg3PC


From 5d566bb78dbe72893acf9060c3c176806cc87a3c Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Thu, 20 Mar 2025 13:16:27 +0100
Subject: [PATCH 070/455] cleaner jupyter

---
 modules/flowmldetection/flowmldetection.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 4a4d46e376..d8e9ada27c 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -345,6 +345,23 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
                 "ground_truth_label",  # todo now we can use them
                 "detailed_ground_truth_label",
             ]
+            # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes.
+            # Error
+            ''' [Flow ML Detection] Error in detect() while processing                                                                                                                                                                                                                         
+            dur proto  sport dport  state  pkts  spkts  dpkts  bytes  sbytes  dbytes  allbytes                                                                                                                                                                                    
+            0  63.822830     0  56119   981    0.0    15     15      0   8764    1887       0      1887                                                                                                                                                                                    
+            The feature names should match those that were passed during fit.                                                                                                                                                                                                              
+            Feature names unseen at fit time:                                                                                                                                                                                                                                              
+            - bytes     
+            '''
+
+            # IF we delete here the filed bytes the error is
+            # [Flow ML Detection] Error in detect() while processing 
+            # dur proto sport dport  state  pkts  spkts  dpkts  sbytes  dbytes allbytes                                                                                                                                                             
+            # 0  63.822830     0  56120   980    0.0    15     15      0    1887       0      1887                                                                                                                                                             
+            # The feature names should match those that were passed during fit.                                                                                                                                                                                
+            # Feature names must be in the same order as they were in fit.                                                                                                                                                                                     
+                                                                                                                                                                                                                                                 
             for field in fields_to_drop:
                 try:
                     x_flow = x_flow.drop(field, axis=1)

From 10fee830a3fecf11002d3037e75d8c094d72b4c8 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Thu, 20 Mar 2025 22:26:27 +0100
Subject: [PATCH 071/455] New models after 3rd train

---
 modules/flowmldetection/model.bin  | Bin 1124 -> 1124 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index 0fac693b39f8e2f0e826471e72a52010709a2a4a..5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8 100644
GIT binary patch
delta 99
zcmaFD@q}YTFtfkevYaFSxkd-_3;rCozh!)2lYmpEbEdGvA?<Hh%FhcQcr$C8Y3pPC
z18<ZS-x*q|IJlZU`}#BRi@kOC+S)TqI1eyD!9BgtJUbI~Cof{2$owSaz+^KPJplbX
BD~|vG

delta 99
zcmV-p0G$8i2;>N`Qv**dmWkY#6EnbNU>V$pfHc5?(6z~90v14#>uEKZXcoY2J@~Q;
zm@vT916v-#h9y9THMMtpw*Eia3Y9($*ABq#7l@Rev)(_(4lO-NtuB+I1CayN#ekDG
F1TN2UERg^J

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
index 758909b289238ff282b2e056a9b3e83768b8472a..821344a0c69d116622b02e2a0daa1554cb5d308e 100644
GIT binary patch
delta 43
zcmV+`0M!5b2KolDfdU!c4zU>a=p#W80?-ZtN@qVmt(3Z<B4<G|lbQl%0g;md19^W6
B5HkP(

delta 43
zcmV+`0M!5b2KolDfdU!a4zU>a=p#W80?-ZtN@qVmt(3Z<B4<G}lbQl%0g#gc19^VR
B5HSD%


From d387f75f61bfd03ed5fc41b68c0734945f605bc4 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 26 Mar 2025 00:08:50 +0100
Subject: [PATCH 072/455] Models after 4th train

---
 modules/flowmldetection/model.bin  | Bin 1124 -> 1124 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index 5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8..3ab5a240bb45f88d026d1d9d1959cfa384e2473b 100644
GIT binary patch
delta 120
zcmV-;0EhqN2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv*
zM<l=vT(CiV7%o61L>A;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P
as{<<t5IC<!D(NIZlimYX0{noJHUuueR5UOE

delta 120
zcmaFD@q}YTFmr_2vYaFSxkd-_3;rCozh!)2lYmpEbEdGvA?<Hh%FhcQcr$C8Y3pPC
z18<ZS-x*q|IJlZU`}#BRi@kOC+S)TqI1eyD!9BgtJUbI~;rz*~nYB5XJEU)ltaLsw
P`5to^Q^<kIW-NLDlQb|V

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
index 821344a0c69d116622b02e2a0daa1554cb5d308e..29df65342047c5a499ee3f8e602d1f47cb7e9fca 100644
GIT binary patch
delta 290
zcmV+-0p0%k2KolDvjGB90h6@>PghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB
z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m
z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A
za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o
zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9
ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}Dxj<tAT~->ML6cVlBO2|D9RL6T

delta 290
zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG
zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa
zjm~^LK$BYnCRg4Lu^9H~BS8=X&<+7gXFor!l)9lJXF)RTPqWf%b3r!tJEWTtnm`Ac
zf^|*Gc0elJHxDoXo<X#jLG{%4&p`{bHVfI|sz2Ot$hvzr(m{gu&Jr}1sgu|OC|8kA
z&D_B~i9nx5TWFn3t3OFIt}1{vt3Z{LgvGX?u|Rx7|2|A?L_o(e@1J}aGeCBvuU^0C
o#Xz54;5X{~-$0+hirZ#X%|CDEfPfMb;Xu)C9icy+Op{jwBi>ty4FCWD


From cbe0718e114b9413874ab6ccccb42da441dee2c4 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 26 Mar 2025 08:28:59 +0100
Subject: [PATCH 073/455] Models of ml flow with the first good performance in
 small tests

---
 modules/flowmldetection/model.bin  | Bin 1124 -> 1124 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index 3ab5a240bb45f88d026d1d9d1959cfa384e2473b..a6648cf72179520975b0e9ad1164f7d574e87140 100644
GIT binary patch
delta 121
zcmV-<0EYkM2;>N`Qv+C~&P*9hb`!w*mV~|wLl?l5mFM4w$NRqlOAcXmpyfaAG1();
zYA3+ulRv$a;~zkIU#E>ocI-ba#L|>%Hv~ZN`Y<G(@KnDch(0mn<s!fm-ctwS<Lp0^
bs{<<tAstXx>?b6^limYW1McwvlQsk{8#y@u

delta 121
zcmV-<0EYkM2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv*
zM<l=vT(CiV7%o61L>A;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P
bs{<<t5IC<!D(NIZlimYW1LXXGlQsk{_eeDq

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
index 29df65342047c5a499ee3f8e602d1f47cb7e9fca..17115724b9536f6093f9d72f3b58a5c22c562a9a 100644
GIT binary patch
delta 290
zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG
zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa
zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r-wJEWTtnm`Ac
zf^|*Gc0elJHxDoXo<X&kLG{%4&p`{bHVfI|sz2Uv$hvzr(m{gu&Jr}1sgu|OC|8h9
z&D_B~i9nx5TWFn3t3OFIt}1{vt3Z{LgvGX?u|R!8|2|A?L_o(e@1J}aGeCBvuU^0C
o#Xz85;5X{~-$0+hirZ#X%|CGFfPfMb;Xu)C9icy+Op{jwBjCh|6aWAK

delta 290
zcmV+-0p0%k2KolDvjGB90h6@>PghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB
z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m
z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A
za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o
zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9
ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}Dxj<tAT~->ML6cVlBO2|D9RL6T


From cf6b939823f5d935a8afa647bb21c3d86d353aa9 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:49:23 +0000
Subject: [PATCH 074/455] Add plot for flowml train scores

---
 modules/flowmldetection/plot_train_score.py | 56 +++++++++++++++++++++
 1 file changed, 56 insertions(+)
 create mode 100644 modules/flowmldetection/plot_train_score.py

diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py
new file mode 100644
index 0000000000..0b5b5b72ba
--- /dev/null
+++ b/modules/flowmldetection/plot_train_score.py
@@ -0,0 +1,56 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+import re
+import sys
+
+def plot_log_data(file_path):
+    # Read the log data from the file
+    with open(file_path, 'r') as file:
+        log_data = file.read()
+
+    # Define regex pattern to extract relevant data from each line
+    pattern = r"Background: (\d+). Benign: (\d+). Malicious: (\d+). Total labels: (\d+\.\d+). Score: (\d+\.\d+)"
+
+    # Parse the log file
+    data = re.findall(pattern, log_data)
+
+    # Convert data to a DataFrame
+    df = pd.DataFrame(data, columns=["Background", "Benign", "Malicious", "Total labels", "Score"])
+    df = df.astype({
+        "Background": int,
+        "Benign": int,
+        "Malicious": int,
+        "Total labels": float,
+        "Score": float
+    })
+
+    # Plotting the values
+    fig, ax1 = plt.subplots(figsize=(10, 6))
+
+    # Plotting Score on the left y-axis
+    ax1.plot(df.index, df["Score"], label="Score", color='tab:blue')
+    ax1.set_xlabel('Index')
+    ax1.set_ylabel('Score', color='tab:blue')
+    ax1.tick_params(axis='y', labelcolor='tab:blue')
+
+    # Create the second y-axis for the Total labels
+    ax2 = ax1.twinx()
+    ax2.plot(df.index, df["Total labels"], label="Total labels", color='tab:red')
+    ax2.set_ylabel('Total labels', color='tab:red')
+    ax2.tick_params(axis='y', labelcolor='tab:red')
+
+    # Adding title and legend
+    plt.title('Log Data Visualization')
+    fig.tight_layout()
+
+    # Save plot to a PNG file
+    plt.savefig('log_data_plot_with_two_scales.png')
+
+    # Display the plot
+    plt.show()
+
+# Make sure the file path is passed as an argument
+if len(sys.argv) < 2:
+    print("Please provide the path to the log file as a parameter.")
+else:
+    plot_log_data(sys.argv[1])

From 2966b1497c0fef94d4f9daccecfd1d5a9fd66691 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:50:04 +0000
Subject: [PATCH 075/455] Add a log file to store the training data output

---
 modules/flowmldetection/flowmldetection.py | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index d8e9ada27c..f9a303c1ba 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -68,12 +68,29 @@ def init(self):
         self.model_path = "./modules/flowmldetection/model.bin"
         self.scaler_path = "./modules/flowmldetection/scaler.bin"
 
+        # Initialize the training log file
+        self.training_log_path = "./modules/flowmldetection/training.log"
+        with open(self.training_log_path, "w") as log_file:
+            log_file.write("Training Log Initialized\n")
+
     def read_configuration(self):
         conf = ConfigParser()
         self.mode = conf.get_ml_mode()
+        # This is the global label in the configuration,
+        # in case the flows do not have a label themselves
         self.label = conf.label()
 
-    def train(self):
+    def write_to_training_log(self, message: str):
+        """
+        Write a message to the training log file.
+        """
+        try:
+            with open(self.training_log_path, "a") as log_file:
+                log_file.write(message + "\n")
+        except Exception as e:
+            self.print(f"Error writing to training log: {e}", 0, 1)
+
+    def train(self, sum_labeled_flows):
         """
         Train a model based on the flows we receive and the labels
         """

From f817b6dcfb79c98ca770649447d788ad7bf0f50f Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:50:32 +0000
Subject: [PATCH 076/455] Store data in the log file of training

---
 modules/flowmldetection/flowmldetection.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index f9a303c1ba..e97f4de535 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -137,9 +137,13 @@ def train(self, sum_labeled_flows):
             # Store the models on disk
             self.store_model()
 
+            # Log training information
+            self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}")
+            #self.write_to_training_log(f"Model parameters: {self.clf.coef_}")
         except Exception:
             self.print("Error in train().", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+            self.write_to_training_log("Error occurred during training.")
 
     def process_features(self, dataset):
         """

From 656264d4ddf93b7f3588202b0f88394a5fae4ca4 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:50:53 +0000
Subject: [PATCH 077/455] better comments

---
 modules/flowmldetection/flowmldetection.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index e97f4de535..3aa030790f 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -59,10 +59,9 @@ def init(self):
         self.minimum_labels_to_start_train = 50
         # Minum amount of new labels needed to retrain
         self.minimum_labels_to_retrain = 50
-        # The number of flows when last trained
+        # The number of flows when last trained. Used internally only to know
+        # when to retrain
         self.last_number_of_flows_when_trained = 0
-        # To plot the scores of training
-        # self.scores = []
         # The scaler trained during training and to use during testing
         self.scaler = StandardScaler()
         self.model_path = "./modules/flowmldetection/model.bin"

From e33862c2792b964556310abe33b938ca6864d9e1 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:51:30 +0000
Subject: [PATCH 078/455] Fix issue not dropping detailed labels

---
 modules/flowmldetection/flowmldetection.py | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 3aa030790f..4b05c9b47a 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -94,23 +94,19 @@ def train(self, sum_labeled_flows):
         Train a model based on the flows we receive and the labels
         """
         try:
-            # Get the flows from the DB
-            # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid)
-            # Convert to pandas df
-            # self.flows = pd.DataFrame(self.flows)
-            # Process the features
-            # X_flow = self.process_features(self.flows)
-
             # Create X_flow with the current flows minus the label
-            X_flow = self.flows.drop("label", axis=1)
-            # Create y_flow with the label
-            y_flow = numpy.full(X_flow.shape[0], self.label)
+            X_flow = self.flows.drop("ground_truth_label", axis=1)
+            # Drop the detailed labels
+            X_flow = X_flow.drop("detailed_ground_truth_label", axis=1)
             # Drop the module_labels
             X_flow = X_flow.drop("module_labels", axis=1)
+            # Create y_flow with the label
+            y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label)
 
             # Normalize this batch of data so far. This can get progressivle slow
             X_flow = self.scaler.fit_transform(X_flow)
 
+
             # Train
             try:
                 self.clf.partial_fit(

From ce583a878fa64066e593cb802aae70057db81122 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:51:53 +0000
Subject: [PATCH 079/455] Fix issue that not all labels were given to the
 partial fit

---
 modules/flowmldetection/flowmldetection.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 4b05c9b47a..f12bfaaa66 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -109,8 +109,9 @@ def train(self, sum_labeled_flows):
 
             # Train
             try:
+                # Online incremental learning
                 self.clf.partial_fit(
-                    X_flow, y_flow, classes=["Malicious", "Benign"]
+                    X_flow, y_flow, classes=["Background", "Malicious", "Benign"]
                 )
             except Exception:
                 self.print("Error while calling clf.train()")

From 8ec673f039e86599b1260e3a97d7658c0aa81ac5 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:52:08 +0000
Subject: [PATCH 080/455] count partial labels in this epoch

---
 modules/flowmldetection/flowmldetection.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index f12bfaaa66..0fffda271b 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -106,6 +106,12 @@ def train(self, sum_labeled_flows):
             # Normalize this batch of data so far. This can get progressivle slow
             X_flow = self.scaler.fit_transform(X_flow)
 
+            # Count the number of labels of each type in this epoc
+            epoch_label_counts = {
+                "Background": (y_flow == "Background").sum(),
+                "Malicious": (y_flow == "Malicious").sum(),
+                "Benign": (y_flow == "Benign").sum(),
+            }
 
             # Train
             try:

From d4b39eea28c5ff30c1a5ee10ec7c3e874cbaa5bf Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:55:09 +0000
Subject: [PATCH 081/455] Don't print training on screen

---
 modules/flowmldetection/flowmldetection.py | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 0fffda271b..f374c2926f 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -126,15 +126,8 @@ def train(self, sum_labeled_flows):
             # See score so far in training
             score = self.clf.score(X_flow, y_flow)
 
-            # To debug the training score
-            # self.scores.append(score)
-
-            self.print(f"	Training Score: {score}", 0, 1)
-            # self.print(f'    Model Parameters: {self.clf.coef_}')
-
-            # Debug code to store a plot in a png of the scores
-            # plt.plot(self.scores)
-            # plt.savefig('train-scores.png')
+            #self.print(f"	Training Score: {score}", 1, 0)
+            #self.print(f'    Model Parameters: {self.clf.coef_}', 1, 0)
 
             # Store the models on disk
             self.store_model()

From a2d50c96523ce3f3e344e813286b0653854007cf Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:55:28 +0000
Subject: [PATCH 082/455] Add function to write to train log

---
 modules/flowmldetection/flowmldetection.py | 28 +++++++++++-----------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index f374c2926f..679e7c0cc9 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -247,28 +247,28 @@ def process_features(self, dataset):
             self.print("Error in process_features()")
             self.print(traceback.format_exc(), 0, 1)
 
-    def process_training_flows(self):
+    def process_training_flows(self, last_number_of_flows_when_trained):
         """
-        Process all the flows in the DB
+        Process only the new flows in the DB since the last training.
         Store the pandas df in self.flows
         """
         try:
+            # Ensure the index is an integer
+            if last_number_of_flows_when_trained is None:
+                last_number_of_flows_when_trained = 0
+            else:
+                last_number_of_flows_when_trained = int(last_number_of_flows_when_trained)
+
             # We get all the flows so far
-            # because this retraining happens in batches
             flows = self.db.get_all_flows()
-            # Check how many different labels are in the DB
-            # We need both normal and malware
+            # Only process new flows since last training
+            new_flows = flows[last_number_of_flows_when_trained:]
+
+            # Check how many **different** labels are in the DB
             labels = self.db.get_labels()
             if len(labels) == 1:
-                # Only 1 label has flows
-                # There are not enough different labels, so insert two flows
-                # that are fake but representative of a normal and malware flow
-                # they are only for the training process
-                # At least 1 flow of each label is required
-
-                # These flows should be in the same format as the ones in the DB. 
-                # Which means the satate is still SF, S0, etc.
-                flows.append(
+                # Insert fake flows for both classes if needed
+                new_flows.append(
                     {
                         "starttime": 1594417039.029793,
                         "dur": "1.9424750804901123",

From 7e6325dab56e081fbb88ec996572fa4bea30e464 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:57:27 +0000
Subject: [PATCH 083/455] Fix label in dummy flow

---
 modules/flowmldetection/flowmldetection.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 679e7c0cc9..95c9b82a74 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -283,13 +283,13 @@ def process_training_flows(self, last_number_of_flows_when_trained):
                         "sbytes": 25517,
                         "dbytes": 17247,
                         "appproto": "ssl",
-                        "label": "Malicious",
+                        "ground_truth_label": "Malicious",
                         "module_labels": {
                             "flowalerts-long-connection": "Malicious"
                         },
                     }
                 )
-                flows.append(
+                new_flows.append(
                     {
                         "starttime": 1382355032.706468,
                         "dur": "10.896695",

From 683d7c17e081820b4df383742c7d481442801188 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:57:39 +0000
Subject: [PATCH 084/455] Fix dummy flow

---
 modules/flowmldetection/flowmldetection.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 95c9b82a74..5ea48fbc40 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -304,7 +304,7 @@ def process_training_flows(self, last_number_of_flows_when_trained):
                         "sbytes": 100,
                         "dbytes": 67596,
                         "appproto": "http",
-                        "label": "Benign",
+                        "ground_truth_label": "Benign",
                         "module_labels": {
                             "flowalerts-long-connection": "Benign"
                         },

From 26a1482c18bdc02ed46b815b60ba720200fafa8e Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:58:28 +0000
Subject: [PATCH 085/455] Rename variable

---
 modules/flowmldetection/flowmldetection.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 5ea48fbc40..ff68b8a270 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -310,10 +310,9 @@ def process_training_flows(self, last_number_of_flows_when_trained):
                         },
                     }
                 )
-                # If there are enough flows, we dont insert them anymore
 
             # Convert to pandas df
-            df_flows = pd.DataFrame(flows)
+            df_flows = pd.DataFrame(new_flows)
 
             # Process features
             df_flows = self.process_features(df_flows)
@@ -321,7 +320,6 @@ def process_training_flows(self, last_number_of_flows_when_trained):
             # Update the flow to the processed version
             self.flows = df_flows
         except Exception:
-            # Stop the timer
             self.print("Error in process_flows()")
             self.print(traceback.format_exc(), 0, 1)
 

From 34b754a257ce29bc1abe83c883dcd9b6a4076e35 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:00:32 +0000
Subject: [PATCH 086/455] Fix dummy flow label

---
 modules/flowmldetection/flowmldetection.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index ff68b8a270..6b41b40298 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -356,7 +356,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
                 "dir_",
                 "endtime",
                 "flow_source",
-                "ground_truth_label",  # todo now we can use them
+                "ground_truth_label",
                 "detailed_ground_truth_label",
             ]
             # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes.

From 54f958d42542ed7041fbe43584deb422ac46c591 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:00:47 +0000
Subject: [PATCH 087/455] Pass values to train function

---
 modules/flowmldetection/flowmldetection.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 6b41b40298..4d66aab855 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -521,9 +521,9 @@ def main(self):
                         )
                         # Process all flows in the DB and make them ready
                         # for pandas
-                        self.process_training_flows()
+                        self.process_training_flows(self.last_number_of_flows_when_trained)
                         # Train an algorithm
-                        self.train()
+                        self.train(sum_labeled_flows)
                         self.last_number_of_flows_when_trained = sum_labeled_flows
 
             elif self.mode == "test":

From a9236e6297c888d029bab09ffecb7270c0c9914a Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:01:47 +0000
Subject: [PATCH 088/455] import os

---
 modules/flowmldetection/flowmldetection.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 4d66aab855..766178e127 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -10,6 +10,7 @@
 import json
 import traceback
 import warnings
+import os
 
 from slips_files.common.parsers.config_parser import ConfigParser
 from slips_files.common.slips_utils import utils

From 3fe1eaf3d4d4a7446dfee38749eb7349254e38ae Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:02:15 +0000
Subject: [PATCH 089/455] Get issue of total flows zero

---
 slips_files/core/database/database_manager.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py
index e8ca3aaf62..892b923b4a 100644
--- a/slips_files/core/database/database_manager.py
+++ b/slips_files/core/database/database_manager.py
@@ -661,7 +661,8 @@ def add_software_to_profile(self, *args, **kwargs):
         return self.rdb.add_software_to_profile(*args, **kwargs)
 
     def get_total_flows(self, *args, **kwargs):
-        return int(self.rdb.get_total_flows(*args, **kwargs))
+        total_flows = self.rdb.get_total_flows(*args, **kwargs)
+        return int(total_flows) if total_flows is not None else 0
 
     def increment_processed_flows(self, *args, **kwargs):
         return self.rdb.increment_processed_flows(*args, **kwargs)

From 73a19e5a500dd615b7e991733a471a3e9ec9aa6c Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:02:32 +0000
Subject: [PATCH 090/455] Add comments

---
 slips_files/core/database/database_manager.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py
index 892b923b4a..6dd1d9952e 100644
--- a/slips_files/core/database/database_manager.py
+++ b/slips_files/core/database/database_manager.py
@@ -879,7 +879,10 @@ def get_flow(self, *args, **kwargs):
         """returns the raw flow as read from the log file"""
         return self.sqlite.get_flow(*args, **kwargs)
 
-    def add_flow(self, flow, profileid: str, twid: str, label="benign"):
+    def add_flow(self, flow, profileid: str, twid: str, label="Benign"):
+        """
+        Just in case, by default if there are no labels in the flow, we consider it Benign
+        """
         # stores it in the db
         self.sqlite.add_flow(flow, profileid, twid, label=label)
         # handles the channels and labels etc.

From 3e3443af1b2ec4c6d91acf4ed69c76a6928696b7 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:02:51 +0000
Subject: [PATCH 091/455] Rename var name to be more clear

---
 slips_files/core/profiler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/slips_files/core/profiler.py b/slips_files/core/profiler.py
index a05557b9f2..c0a4261891 100644
--- a/slips_files/core/profiler.py
+++ b/slips_files/core/profiler.py
@@ -119,7 +119,7 @@ def read_configuration(self):
         self.local_whitelist_path = conf.local_whitelist_path()
         self.timeformat = conf.ts_format()
         self.analysis_direction = conf.analysis_direction()
-        self.label = conf.label()
+        self.configuration_label = conf.label()
         self.width = conf.get_tw_width_as_float()
         self.client_ips: List[
             Union[IPv4Network, IPv6Network, IPv4Address, IPv6Address]

From 4c3c3149d67b1dcf2d573a4879fcaab0078f971f Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:03:10 +0000
Subject: [PATCH 092/455] Rename var name

---
 slips_files/core/profiler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/slips_files/core/profiler.py b/slips_files/core/profiler.py
index c0a4261891..42bf3355e2 100644
--- a/slips_files/core/profiler.py
+++ b/slips_files/core/profiler.py
@@ -377,7 +377,7 @@ def store_features_going_in(self, profileid: str, twid: str, flow):
             flow,
             profileid=profileid,
             twid=twid,
-            label=self.label,
+            label=self.configuration_label,
         )
         self.db.mark_profile_tw_as_modified(profileid, twid, "")
 

From 18b7544ce9c6554b1bf95c4d7d19458df01f4105 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:03:31 +0000
Subject: [PATCH 093/455] Fix processed flows being zero

---
 slips/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/slips/main.py b/slips/main.py
index bd7890f5bc..d960ce318b 100644
--- a/slips/main.py
+++ b/slips/main.py
@@ -414,7 +414,7 @@ def get_analyzed_flows_percentage(self) -> str:
             self.total_flows = self.db.get_total_flows()
 
         flows_percentage = int(
-            (self.db.get_processed_flows_so_far() / self.total_flows) * 100
+            (self.db.get_processed_flows_so_far() / self.total_flows) * 100 if self.total_flows != 0 else 0
         )
         return f"Analyzed Flows: {green(flows_percentage)}{green('%')}. "
 

From c221fe75a1a8027f86a35e8080165d37dde8da97 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:03:53 +0000
Subject: [PATCH 094/455] Delete old comments

---
 modules/flowmldetection/flowmldetection.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 766178e127..6c3bfc1275 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -28,10 +28,6 @@
     Method,
 )
 
-# Only for debbuging
-# from matplotlib import pyplot as plt
-
-
 # This horrible hack is only to stop sklearn from printing those warnings
 def warn(*args, **kwargs):
     pass

From 320e0fedf1ebed269a1c369e6716bb1440a94eca Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:13:22 +0000
Subject: [PATCH 095/455] Fix plots

---
 modules/flowmldetection/plot_train_score.py | 48 ++++++++++++++++-----
 1 file changed, 38 insertions(+), 10 deletions(-)

diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py
index 0b5b5b72ba..359df04eff 100644
--- a/modules/flowmldetection/plot_train_score.py
+++ b/modules/flowmldetection/plot_train_score.py
@@ -2,6 +2,8 @@
 import matplotlib.pyplot as plt
 import re
 import sys
+import argparse
+import os
 
 def plot_log_data(file_path):
     # Read the log data from the file
@@ -24,33 +26,59 @@ def plot_log_data(file_path):
         "Score": float
     })
 
+    # Get the directory of the log file to store the plot in the same folder
+    dir_name = os.path.dirname(file_path)
+    plot_file = os.path.join(dir_name, 'log_data_plot_with_two_scales.png')
+
     # Plotting the values
     fig, ax1 = plt.subplots(figsize=(10, 6))
 
-    # Plotting Score on the left y-axis
+    # Plotting Score on the left y-axis (with proper scaling from 0 to 1)
     ax1.plot(df.index, df["Score"], label="Score", color='tab:blue')
     ax1.set_xlabel('Index')
     ax1.set_ylabel('Score', color='tab:blue')
+    ax1.set_ylim(0, 1)  # Set y-axis for Score from 0 to 1
     ax1.tick_params(axis='y', labelcolor='tab:blue')
 
-    # Create the second y-axis for the Total labels
+    # Create the second y-axis for the Background, Benign, Malicious, Total labels
     ax2 = ax1.twinx()
+    ax2.plot(df.index, df["Background"], label="Background", color='tab:green', linestyle='--')
+    ax2.plot(df.index, df["Benign"], label="Benign", color='tab:orange', linestyle='--')
+    ax2.plot(df.index, df["Malicious"], label="Malicious", color='tab:pink', linestyle='--')
     ax2.plot(df.index, df["Total labels"], label="Total labels", color='tab:red')
-    ax2.set_ylabel('Total labels', color='tab:red')
+    ax2.set_ylabel('Background, Benign, Malicious, Total labels', color='tab:red')
+    
+    # Set appropriate scale for right y-axis based on the data
+    ax2.set_ylim(0, df[["Background", "Benign", "Malicious", "Total labels"]].max().max())
     ax2.tick_params(axis='y', labelcolor='tab:red')
 
     # Adding title and legend
     plt.title('Log Data Visualization')
     fig.tight_layout()
 
-    # Save plot to a PNG file
-    plt.savefig('log_data_plot_with_two_scales.png')
+    # Adding the legend with increased space for readability
+    ax1.legend(loc='upper left', bbox_to_anchor=(1, 1), fontsize='small')
+    ax2.legend(loc='upper left', bbox_to_anchor=(1, 0.7), fontsize='small')
+
+    # Increase right margin for better readability of legend
+    plt.subplots_adjust(right=0.75)
+
+    # Save plot to the same folder as the log file
+    plt.savefig(plot_file)
 
     # Display the plot
     plt.show()
 
-# Make sure the file path is passed as an argument
-if len(sys.argv) < 2:
-    print("Please provide the path to the log file as a parameter.")
-else:
-    plot_log_data(sys.argv[1])
+def main():
+    # Parse command-line arguments
+    parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.")
+    parser.add_argument('log_file', metavar='log_file', type=str, help="Path to the log file")
+    
+    # Handle -h / --help
+    args = parser.parse_args()
+
+    # Call the function to process the log file
+    plot_log_data(args.log_file)
+
+if __name__ == "__main__":
+    main()

From 1adc33a6d6de83ef13cad648ea6ccfb9f6ceda02 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:14:58 +0000
Subject: [PATCH 096/455] Fix plot

---
 modules/flowmldetection/plot_train_score.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py
index 359df04eff..c7f374a7fe 100644
--- a/modules/flowmldetection/plot_train_score.py
+++ b/modules/flowmldetection/plot_train_score.py
@@ -40,18 +40,21 @@ def plot_log_data(file_path):
     ax1.set_ylim(0, 1)  # Set y-axis for Score from 0 to 1
     ax1.tick_params(axis='y', labelcolor='tab:blue')
 
-    # Create the second y-axis for the Background, Benign, Malicious, Total labels
+    # Create the second y-axis for the Background, Benign, Malicious
     ax2 = ax1.twinx()
     ax2.plot(df.index, df["Background"], label="Background", color='tab:green', linestyle='--')
     ax2.plot(df.index, df["Benign"], label="Benign", color='tab:orange', linestyle='--')
     ax2.plot(df.index, df["Malicious"], label="Malicious", color='tab:pink', linestyle='--')
-    ax2.plot(df.index, df["Total labels"], label="Total labels", color='tab:red')
-    ax2.set_ylabel('Background, Benign, Malicious, Total labels', color='tab:red')
+    ax2.set_ylabel('Background, Benign, Malicious', color='tab:red')
     
     # Set appropriate scale for right y-axis based on the data
-    ax2.set_ylim(0, df[["Background", "Benign", "Malicious", "Total labels"]].max().max())
+    ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max())
     ax2.tick_params(axis='y', labelcolor='tab:red')
 
+    # Annotating Total labels as text on the plot
+    for i, value in enumerate(df["Total labels"]):
+        ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom')
+
     # Adding title and legend
     plt.title('Log Data Visualization')
     fig.tight_layout()

From 010fbcda3c6183a3a309726519519ffcd0b61927 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:16:23 +0000
Subject: [PATCH 097/455] Fix plot

---
 modules/flowmldetection/plot_train_score.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py
index c7f374a7fe..4099c47c1e 100644
--- a/modules/flowmldetection/plot_train_score.py
+++ b/modules/flowmldetection/plot_train_score.py
@@ -42,10 +42,10 @@ def plot_log_data(file_path):
 
     # Create the second y-axis for the Background, Benign, Malicious
     ax2 = ax1.twinx()
-    ax2.plot(df.index, df["Background"], label="Background", color='tab:green', linestyle='--')
-    ax2.plot(df.index, df["Benign"], label="Benign", color='tab:orange', linestyle='--')
-    ax2.plot(df.index, df["Malicious"], label="Malicious", color='tab:pink', linestyle='--')
-    ax2.set_ylabel('Background, Benign, Malicious', color='tab:red')
+    ax2.plot(df.index, df["Background"], label="Background Labels", color='tab:green', linestyle='--')
+    ax2.plot(df.index, df["Benign"], label="Benign Labels", color='tab:orange', linestyle='--')
+    ax2.plot(df.index, df["Malicious"], label="Malicious Labels", color='tab:pink', linestyle='--')
+    ax2.set_ylabel('Background, Benign, Malicious Labels', color='tab:red')
     
     # Set appropriate scale for right y-axis based on the data
     ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max())
@@ -56,7 +56,7 @@ def plot_log_data(file_path):
         ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom')
 
     # Adding title and legend
-    plt.title('Log Data Visualization')
+    plt.title('Training performance')
     fig.tight_layout()
 
     # Adding the legend with increased space for readability

From 978eaa02e2d48e6d27ab1c579a90b8a21b666b41 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:24:43 +0000
Subject: [PATCH 098/455] Fix plot

---
 modules/flowmldetection/plot_train_score.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py
index 4099c47c1e..8437e968ac 100644
--- a/modules/flowmldetection/plot_train_score.py
+++ b/modules/flowmldetection/plot_train_score.py
@@ -59,12 +59,12 @@ def plot_log_data(file_path):
     plt.title('Training performance')
     fig.tight_layout()
 
-    # Adding the legend with increased space for readability
-    ax1.legend(loc='upper left', bbox_to_anchor=(1, 1), fontsize='small')
-    ax2.legend(loc='upper left', bbox_to_anchor=(1, 0.7), fontsize='small')
+    # Move both legends further to the right
+    ax1.legend(loc='upper right', bbox_to_anchor=(1.26, 1), fontsize='small', ncol=1)
+    ax2.legend(loc='upper right', bbox_to_anchor=(1.4, 0.95), fontsize='small', ncol=1)
 
     # Increase right margin for better readability of legend
-    plt.subplots_adjust(right=0.75)
+    plt.subplots_adjust(right=0.7)
 
     # Save plot to the same folder as the log file
     plt.savefig(plot_file)

From 3571750a84fc29ee775f42eb9b90851818defa56 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:02:34 +0000
Subject: [PATCH 099/455] Plot testing performance from a log

---
 .../plot_testing_performance.py               | 89 +++++++++++++++++++
 1 file changed, 89 insertions(+)
 create mode 100644 modules/flowmldetection/plot_testing_performance.py

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
new file mode 100644
index 0000000000..a38c7f0598
--- /dev/null
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -0,0 +1,89 @@
+import matplotlib.pyplot as plt
+import sys
+import numpy as np
+
+def process_file(file_path):
+    # Initialize the counters for the values
+    FPR_values = []
+    FNR_values = []
+    TNR_values = []
+    TPR_values = []
+    F1_values = []
+    accuracy_values = []
+    precision_values = []
+    MCC_values = []
+    recall_values = []
+    
+    # Read the file and extract the data
+    with open(file_path, 'r') as file:
+        for line in file:
+            if "TP:" in line:
+                # Extract the values from the line
+                parts = line.split(',')
+                TP = int(parts[0].split(':')[1].strip())
+                TN = int(parts[1].split(':')[1].strip())
+                FP = int(parts[2].split(':')[1].strip())
+                FN = int(parts[3].split(':')[1].strip())
+
+                # Calculate metrics
+                FPR = FP / (FP + TN) if (FP + TN) != 0 else 0
+                FNR = FN / (FN + TP) if (FN + TP) != 0 else 0
+                TNR = TN / (TN + FP) if (TN + FP) != 0 else 0
+                TPR = TP / (TP + FN) if (TP + FN) != 0 else 0
+                Precision = TP / (TP + FP) if (TP + FP) != 0 else 0
+                Recall = TPR  # Recall is the same as TPR
+                F1 = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) != 0 else 0
+                Accuracy = (TP + TN) / (TP + TN + FP + FN)
+                MCC = ((TP * TN) - (FP * FN)) / np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) if ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) != 0 else 0
+                
+                # Append the values to the respective lists
+                FPR_values.append(FPR)
+                FNR_values.append(FNR)
+                TNR_values.append(TNR)
+                TPR_values.append(TPR)
+                F1_values.append(F1)
+                accuracy_values.append(Accuracy)
+                precision_values.append(Precision)
+                MCC_values.append(MCC)
+                recall_values.append(Recall)
+    
+    return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values
+
+def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values):
+    # Create the plot
+    plt.figure(figsize=(12, 8))
+    
+    # Plot each metric
+    plt.plot(FPR_values, label='False Positive Rate (FPR)', marker='o')
+    plt.plot(FNR_values, label='False Negative Rate (FNR)', marker='o')
+    plt.plot(TNR_values, label='True Negative Rate (TNR)', marker='o')
+    plt.plot(TPR_values, label='True Positive Rate (TPR)', marker='o')
+    plt.plot(F1_values, label='F1 Score', marker='o')
+    plt.plot(accuracy_values, label='Accuracy', marker='o')
+    plt.plot(precision_values, label='Precision', marker='o')
+    plt.plot(MCC_values, label='Matthews Correlation Coefficient (MCC)', marker='o')
+    plt.plot(recall_values, label='Recall (TPR)', marker='o')
+    
+    # Add labels and title
+    plt.xlabel('Index')
+    plt.ylabel('Metric Value')
+    plt.title('Evaluation Metrics Over Time')
+    
+    # Add a legend
+    plt.legend()
+    
+    # Save the plot as a PNG file
+    plt.savefig('metrics_plot.png')
+    plt.close()
+
+def main():
+    if len(sys.argv) != 2:
+        print("Usage: python script.py <file_path>")
+        sys.exit(1)
+    
+    file_path = sys.argv[1]
+    FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path)
+    plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values)
+
+if __name__ == "__main__":
+    main()

From 1c0ea51fad5afbd9753a1d52c5369baca086a7d3 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:04:32 +0000
Subject: [PATCH 100/455] Fix the plot

---
 modules/flowmldetection/plot_testing_performance.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index a38c7f0598..fac0acd64a 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -64,16 +64,19 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu
     plt.plot(MCC_values, label='Matthews Correlation Coefficient (MCC)', marker='o')
     plt.plot(recall_values, label='Recall (TPR)', marker='o')
     
+    # Set logarithmic scale on the y-axis
+    plt.yscale('log')
+    
     # Add labels and title
     plt.xlabel('Index')
-    plt.ylabel('Metric Value')
-    plt.title('Evaluation Metrics Over Time')
+    plt.ylabel('Metric Value (Log Scale)')
+    plt.title('Evaluation Metrics Over Time (Log Scale)')
     
     # Add a legend
     plt.legend()
     
     # Save the plot as a PNG file
-    plt.savefig('metrics_plot.png')
+    plt.savefig('metrics_plot_log_scale.png')
     plt.close()
 
 def main():

From 1bcca14a5068fbb68c8a38962f7b995314cc65d7 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:12:40 +0000
Subject: [PATCH 101/455] Fix the plots

---
 .../plot_testing_performance.py               | 76 ++++++++++++++-----
 1 file changed, 55 insertions(+), 21 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index fac0acd64a..5581c72cd4 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -50,33 +50,66 @@ def process_file(file_path):
     return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values
 
 def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values):
-    # Create the plot
-    plt.figure(figsize=(12, 8))
+    # Separate the values into two groups based on their proximity to 0 or 1
+    close_to_0 = {
+        'FPR': [], 'FNR': []
+    }
+    close_to_1 = {
+        'TNR': [], 'TPR': [], 'F1': [], 'accuracy': [], 'precision': [], 'MCC': [], 'recall': []
+    }
     
-    # Plot each metric
-    plt.plot(FPR_values, label='False Positive Rate (FPR)', marker='o')
-    plt.plot(FNR_values, label='False Negative Rate (FNR)', marker='o')
-    plt.plot(TNR_values, label='True Negative Rate (TNR)', marker='o')
-    plt.plot(TPR_values, label='True Positive Rate (TPR)', marker='o')
-    plt.plot(F1_values, label='F1 Score', marker='o')
-    plt.plot(accuracy_values, label='Accuracy', marker='o')
-    plt.plot(precision_values, label='Precision', marker='o')
-    plt.plot(MCC_values, label='Matthews Correlation Coefficient (MCC)', marker='o')
-    plt.plot(recall_values, label='Recall (TPR)', marker='o')
+    # Categorize the metrics into two groups
+    for i in range(len(FPR_values)):
+        close_to_0['FPR'].append(FPR_values[i])
+        close_to_0['FNR'].append(FNR_values[i])
+        
+        close_to_1['TNR'].append(TNR_values[i])
+        close_to_1['TPR'].append(TPR_values[i])
+        close_to_1['F1'].append(F1_values[i])
+        close_to_1['accuracy'].append(accuracy_values[i])
+        close_to_1['precision'].append(precision_values[i])
+        close_to_1['MCC'].append(MCC_values[i])
+        close_to_1['recall'].append(recall_values[i])
+
+    # Plot metrics for values close to 0
+    plot_single_group(close_to_0, 'metrics_plot_close_to_0.png')
     
-    # Set logarithmic scale on the y-axis
-    plt.yscale('log')
+    # Plot metrics for values close to 1
+    plot_single_group(close_to_1, 'metrics_plot_close_to_1.png')
+
+def plot_single_group(metrics_dict, output_filename):
+    plt.figure(figsize=(12, 8))
     
-    # Add labels and title
+    # Only plot the metrics that exist in the dictionary
+    if 'FPR' in metrics_dict:
+        plt.plot(metrics_dict['FPR'], label='False Positive Rate (FPR)', marker='o')
+    if 'FNR' in metrics_dict:
+        plt.plot(metrics_dict['FNR'], label='False Negative Rate (FNR)', marker='o')
+    if 'TNR' in metrics_dict:
+        plt.plot(metrics_dict['TNR'], label='True Negative Rate (TNR)', marker='o')
+    if 'TPR' in metrics_dict:
+        plt.plot(metrics_dict['TPR'], label='True Positive Rate (TPR)', marker='o')
+    if 'F1' in metrics_dict:
+        plt.plot(metrics_dict['F1'], label='F1 Score', marker='o')
+    if 'accuracy' in metrics_dict:
+        plt.plot(metrics_dict['accuracy'], label='Accuracy', marker='o')
+    if 'precision' in metrics_dict:
+        plt.plot(metrics_dict['precision'], label='Precision', marker='o')
+    if 'MCC' in metrics_dict:
+        plt.plot(metrics_dict['MCC'], label='Matthews Correlation Coefficient (MCC)', marker='o')
+    if 'recall' in metrics_dict:
+        plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o')
+
+    # Apply log scale by default
+    plt.yscale('log')
+
     plt.xlabel('Index')
-    plt.ylabel('Metric Value (Log Scale)')
-    plt.title('Evaluation Metrics Over Time (Log Scale)')
-    
-    # Add a legend
+    plt.ylabel('Metric Value')
+    plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})')
     plt.legend()
     
-    # Save the plot as a PNG file
-    plt.savefig('metrics_plot_log_scale.png')
+    # Save the plot
+    plt.savefig(output_filename)
     plt.close()
 
 def main():
@@ -85,6 +118,7 @@ def main():
         sys.exit(1)
     
     file_path = sys.argv[1]
+    
     FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path)
     plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values)
 

From ab4bcd82169f802615ea28755e6735a0c611e2e7 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:16:50 +0000
Subject: [PATCH 102/455] Fix plot

---
 .../plot_testing_performance.py               | 22 +++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index 5581c72cd4..8f9e12cd86 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -72,12 +72,24 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu
         close_to_1['recall'].append(recall_values[i])
 
     # Plot metrics for values close to 0
-    plot_single_group(close_to_0, 'metrics_plot_close_to_0.png')
+    plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True)
     
     # Plot metrics for values close to 1
     plot_single_group(close_to_1, 'metrics_plot_close_to_1.png')
 
-def plot_single_group(metrics_dict, output_filename):
+    # Print the final values
+    print("\nFinal Metric Values:")
+    print(f"Final FPR: {FPR_values[-1]:.4f}")
+    print(f"Final FNR: {FNR_values[-1]:.4f}")
+    print(f"Final TNR: {TNR_values[-1]:.4f}")
+    print(f"Final TPR: {TPR_values[-1]:.4f}")
+    print(f"Final F1 Score: {F1_values[-1]:.4f}")
+    print(f"Final Accuracy: {accuracy_values[-1]:.4f}")
+    print(f"Final Precision: {precision_values[-1]:.4f}")
+    print(f"Final MCC: {MCC_values[-1]:.4f}")
+    print(f"Final Recall: {recall_values[-1]:.4f}")
+
+def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):
     plt.figure(figsize=(12, 8))
     
     # Only plot the metrics that exist in the dictionary
@@ -103,6 +115,12 @@ def plot_single_group(metrics_dict, output_filename):
     # Apply log scale by default
     plt.yscale('log')
 
+    # If the plot is close to 0, set custom ticks
+    if is_close_to_0:
+        # Manually set more Y-ticks for better visibility
+        plt.ylim(0.0001, 1)  # Set Y-axis limits between 0.0001 and 1
+        plt.yticks([0.0001, 0.001, 0.01, 0.1, 1], ['0.0001', '0.001', '0.01', '0.1', '1'])  # Adjust Y-ticks
+
     plt.xlabel('Index')
     plt.ylabel('Metric Value')
     plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})')

From b7e0c6f6b4cecc6a446dc322e320183999092fb6 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:20:22 +0000
Subject: [PATCH 103/455] Fix plots

---
 modules/flowmldetection/flowmldetection.py | 709 +++++----------------
 1 file changed, 143 insertions(+), 566 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 6c3bfc1275..37f0761109 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -1,566 +1,143 @@
-# SPDX-FileCopyrightText: 2021 Sebastian Garcia <sebastian.garcia@agents.fel.cvut.cz>
-from typing import Optional
-
-# SPDX-License-Identifier: GPL-2.0-only
-import numpy
-from sklearn.linear_model import SGDClassifier
-from sklearn.preprocessing import StandardScaler
-import pickle
-import pandas as pd
-import json
-import traceback
-import warnings
-import os
-
-from slips_files.common.parsers.config_parser import ConfigParser
-from slips_files.common.slips_utils import utils
-from slips_files.common.abstracts.module import IModule
-from slips_files.core.structures.evidence import (
-    Evidence,
-    ProfileID,
-    TimeWindow,
-    Attacker,
-    ThreatLevel,
-    EvidenceType,
-    IoCType,
-    Direction,
-    Victim,
-    Method,
-)
-
-# This horrible hack is only to stop sklearn from printing those warnings
-def warn(*args, **kwargs):
-    pass
-
-
-warnings.warn = warn
-
-
-class FlowMLDetection(IModule):
-    # Name: short name of the module. Do not use spaces
-    name = "Flow ML Detection"
-    description = (
-        "Train or test a Machine Learning model to detect malicious flows"
-    )
-    authors = ["Sebastian Garcia"]
-
-    def init(self):
-        # Subscribe to the channel
-        self.c1 = self.db.subscribe("new_flow")
-        self.channels = {"new_flow": self.c1}
-        self.fieldseparator = self.db.get_field_separator()
-        # Set the output queue of our database instance
-        # Read the configuration
-        self.read_configuration()
-        # Minum amount of new labels needed to start the train
-        self.minimum_labels_to_start_train = 50
-        # Minum amount of new labels needed to retrain
-        self.minimum_labels_to_retrain = 50
-        # The number of flows when last trained. Used internally only to know
-        # when to retrain
-        self.last_number_of_flows_when_trained = 0
-        # The scaler trained during training and to use during testing
-        self.scaler = StandardScaler()
-        self.model_path = "./modules/flowmldetection/model.bin"
-        self.scaler_path = "./modules/flowmldetection/scaler.bin"
-
-        # Initialize the training log file
-        self.training_log_path = "./modules/flowmldetection/training.log"
-        with open(self.training_log_path, "w") as log_file:
-            log_file.write("Training Log Initialized\n")
-
-    def read_configuration(self):
-        conf = ConfigParser()
-        self.mode = conf.get_ml_mode()
-        # This is the global label in the configuration,
-        # in case the flows do not have a label themselves
-        self.label = conf.label()
-
-    def write_to_training_log(self, message: str):
-        """
-        Write a message to the training log file.
-        """
-        try:
-            with open(self.training_log_path, "a") as log_file:
-                log_file.write(message + "\n")
-        except Exception as e:
-            self.print(f"Error writing to training log: {e}", 0, 1)
-
-    def train(self, sum_labeled_flows):
-        """
-        Train a model based on the flows we receive and the labels
-        """
-        try:
-            # Create X_flow with the current flows minus the label
-            X_flow = self.flows.drop("ground_truth_label", axis=1)
-            # Drop the detailed labels
-            X_flow = X_flow.drop("detailed_ground_truth_label", axis=1)
-            # Drop the module_labels
-            X_flow = X_flow.drop("module_labels", axis=1)
-            # Create y_flow with the label
-            y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label)
-
-            # Normalize this batch of data so far. This can get progressivle slow
-            X_flow = self.scaler.fit_transform(X_flow)
-
-            # Count the number of labels of each type in this epoc
-            epoch_label_counts = {
-                "Background": (y_flow == "Background").sum(),
-                "Malicious": (y_flow == "Malicious").sum(),
-                "Benign": (y_flow == "Benign").sum(),
-            }
-
-            # Train
-            try:
-                # Online incremental learning
-                self.clf.partial_fit(
-                    X_flow, y_flow, classes=["Background", "Malicious", "Benign"]
-                )
-            except Exception:
-                self.print("Error while calling clf.train()")
-                self.print(traceback.format_exc(), 0, 1)
-
-            # See score so far in training
-            score = self.clf.score(X_flow, y_flow)
-
-            #self.print(f"	Training Score: {score}", 1, 0)
-            #self.print(f'    Model Parameters: {self.clf.coef_}', 1, 0)
-
-            # Store the models on disk
-            self.store_model()
-
-            # Log training information
-            self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}")
-            #self.write_to_training_log(f"Model parameters: {self.clf.coef_}")
-        except Exception:
-            self.print("Error in train().", 0, 1)
-            self.print(traceback.format_exc(), 0, 1)
-            self.write_to_training_log("Error occurred during training.")
-
-    def process_features(self, dataset):
-        """
-        Discards some features of the dataset and can create new.
-        Clean the dataset
-        """
-        try:
-            # Discard some type of flows that dont have ports
-            to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""]
-            for proto in to_discard:
-                dataset = dataset[dataset.proto != proto]
-
-            # If te proto is in the list to delete and there is only one flow, then the dataset will be empty
-            if dataset.empty:
-                # DataFrame is empty now, so return empty
-                return dataset
-
-            # For now, discard these
-            to_drop = [
-                "appproto",
-                "daddr",
-                "saddr",
-                "starttime",
-                "type_",
-                "smac",
-                "dmac",
-                "history",
-                "uid",
-                "dir_",
-                "endtime",
-                "flow_source",
-            ]
-            for field in to_drop:
-                try:
-                    dataset = dataset.drop(field, axis=1)
-                except (ValueError, KeyError):
-                    pass
-
-            # When flows are read from Slips sqlite,
-            # the state is not transformed to 'Established' or
-            # 'Not Established', it is still 'S0' and others
-            # So transform here
-            dataset["state"] = dataset.apply(
-                lambda row: self.db.get_final_state_from_flags(
-                    row["state"], (row["spkts"] + row["dpkts"])
-                ),
-                axis=1,
-            )
-
-            # Convert state to categorical
-            dataset.state = dataset.state.str.replace(
-                r"(^.*Not Established.*$)", "0", regex=True
-            )
-            dataset.state = dataset.state.str.replace(
-                r"(^.*Established.*$)", "1", regex=True
-            )
-
-            # Convert categories to floats
-            dataset.state = dataset.state.astype("float64")
-
-            # Convert proto to categorical. For now we only have few states, so we can hardcode...
-            # We dont use the data to create categories because in testing mode
-            # we dont see all the protocols
-            # Also we dont store the Categorizer because the user can retrain
-            # with its own data.
-            dataset.proto = dataset.proto.str.lower()
-            dataset.proto = dataset.proto.str.replace(
-                r"(^.*tcp.*$)", "0", regex=True
-            )
-            dataset.proto = dataset.proto.str.replace(
-                r"(^.*udp.*$)", "1", regex=True
-            )
-            dataset.proto = dataset.proto.str.replace(
-                r"(^.*icmp.*$)", "2", regex=True
-            )
-            dataset.proto = dataset.proto.str.replace(
-                r"(^.*icmp-ipv6.*$)", "3", regex=True
-            )
-            dataset.proto = dataset.proto.str.replace(
-                r"(^.*arp.*$)", "4", regex=True
-            )
-
-            dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"]
-            dataset["pkts"] = dataset["spkts"] + dataset["dpkts"]
-
-            fields_to_convert_to_float = [
-                dataset.proto,
-                dataset.dport,
-                dataset.sport,
-                dataset.dur,
-                dataset.pkts,
-                dataset.spkts,
-                dataset.allbytes,
-                dataset.sbytes,
-                dataset.state,
-            ]
-            for field in fields_to_convert_to_float:
-                try:
-                    field = field.astype("float64")
-                except (ValueError, AttributeError):
-                    pass
-
-            return dataset
-        except Exception:
-            # Stop the timer
-            self.print("Error in process_features()")
-            self.print(traceback.format_exc(), 0, 1)
-
-    def process_training_flows(self, last_number_of_flows_when_trained):
-        """
-        Process only the new flows in the DB since the last training.
-        Store the pandas df in self.flows
-        """
-        try:
-            # Ensure the index is an integer
-            if last_number_of_flows_when_trained is None:
-                last_number_of_flows_when_trained = 0
-            else:
-                last_number_of_flows_when_trained = int(last_number_of_flows_when_trained)
-
-            # We get all the flows so far
-            flows = self.db.get_all_flows()
-            # Only process new flows since last training
-            new_flows = flows[last_number_of_flows_when_trained:]
-
-            # Check how many **different** labels are in the DB
-            labels = self.db.get_labels()
-            if len(labels) == 1:
-                # Insert fake flows for both classes if needed
-                new_flows.append(
-                    {
-                        "starttime": 1594417039.029793,
-                        "dur": "1.9424750804901123",
-                        "saddr": "10.7.10.101",
-                        "sport": "49733",
-                        "daddr": "40.70.224.145",
-                        "dport": "443",
-                        "proto": "tcp",
-                        "state": "SF",
-                        "spkts": 17,
-                        "dpkts": 27,
-                        "sbytes": 25517,
-                        "dbytes": 17247,
-                        "appproto": "ssl",
-                        "ground_truth_label": "Malicious",
-                        "module_labels": {
-                            "flowalerts-long-connection": "Malicious"
-                        },
-                    }
-                )
-                new_flows.append(
-                    {
-                        "starttime": 1382355032.706468,
-                        "dur": "10.896695",
-                        "saddr": "147.32.83.52",
-                        "sport": "47956",
-                        "daddr": "80.242.138.72",
-                        "dport": "80",
-                        "proto": "tcp",
-                        "state": "SF",
-                        "spkts": 1,
-                        "dpkts": 0,
-                        "sbytes": 100,
-                        "dbytes": 67596,
-                        "appproto": "http",
-                        "ground_truth_label": "Benign",
-                        "module_labels": {
-                            "flowalerts-long-connection": "Benign"
-                        },
-                    }
-                )
-
-            # Convert to pandas df
-            df_flows = pd.DataFrame(new_flows)
-
-            # Process features
-            df_flows = self.process_features(df_flows)
-
-            # Update the flow to the processed version
-            self.flows = df_flows
-        except Exception:
-            self.print("Error in process_flows()")
-            self.print(traceback.format_exc(), 0, 1)
-
-    def process_flow(self, flow_to_process: dict):
-        """
-        Process one flow. Only used during detection in testing
-        returns the pandas df with the processed flow
-        """
-        try:
-            # Convert the flow to a pandas dataframe
-            raw_flow = pd.DataFrame(flow_to_process, index=[0])
-            dflow = self.process_features(raw_flow)
-            if dflow.empty:
-                return None
-            # Update the flow to the processed version
-            return dflow
-        except Exception:
-            # Stop the timer
-            self.print("Error in process_flow()")
-            self.print(traceback.format_exc(), 0, 1)
-
-    def detect(self, x_flow) -> Optional[numpy.ndarray]:
-        """
-        Detects the given flow with the current model stored
-        and returns the predection array
-        """
-        try:
-            # clean the flow
-            fields_to_drop = [
-                "label",
-                "module_labels",
-                "uid",
-                "history",
-                "dir_",
-                "endtime",
-                "flow_source",
-                "ground_truth_label",
-                "detailed_ground_truth_label",
-            ]
-            # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes.
-            # Error
-            ''' [Flow ML Detection] Error in detect() while processing                                                                                                                                                                                                                         
-            dur proto  sport dport  state  pkts  spkts  dpkts  bytes  sbytes  dbytes  allbytes                                                                                                                                                                                    
-            0  63.822830     0  56119   981    0.0    15     15      0   8764    1887       0      1887                                                                                                                                                                                    
-            The feature names should match those that were passed during fit.                                                                                                                                                                                                              
-            Feature names unseen at fit time:                                                                                                                                                                                                                                              
-            - bytes     
-            '''
-
-            # IF we delete here the filed bytes the error is
-            # [Flow ML Detection] Error in detect() while processing 
-            # dur proto sport dport  state  pkts  spkts  dpkts  sbytes  dbytes allbytes                                                                                                                                                             
-            # 0  63.822830     0  56120   980    0.0    15     15      0    1887       0      1887                                                                                                                                                             
-            # The feature names should match those that were passed during fit.                                                                                                                                                                                
-            # Feature names must be in the same order as they were in fit.                                                                                                                                                                                     
-                                                                                                                                                                                                                                                 
-            for field in fields_to_drop:
-                try:
-                    x_flow = x_flow.drop(field, axis=1)
-                except (KeyError, ValueError):
-                    pass
-            # Scale the flow
-            x_flow: numpy.ndarray = self.scaler.transform(x_flow)
-            pred: numpy.ndarray = self.clf.predict(x_flow)
-            return pred
-        except Exception as e:
-            self.print(
-                f"Error in detect() while processing " f"\n{x_flow}\n{e}"
-            )
-            self.print(traceback.format_exc(), 0, 1)
-
-    def store_model(self):
-        """
-        Store the trained model on disk
-        """
-        self.print("Storing the trained model and scaler on disk.", 0, 2)
-        with open(self.model_path, "wb") as f:
-            data = pickle.dumps(self.clf)
-            f.write(data)
-        with open(self.scaler_path, "wb") as g:
-            data = pickle.dumps(self.scaler)
-            g.write(data)
-
-    def read_model(self):
-        """
-        Read the trained model from disk
-        """
-        try:
-            self.print("Reading the trained model from disk.", 0, 2)
-            with open(self.model_path, "rb") as f:
-                self.clf = pickle.load(f)
-            self.print("Reading the trained scaler from disk.", 0, 2)
-            with open(self.scaler_path, "rb") as g:
-                self.scaler = pickle.load(g)
-        except FileNotFoundError:
-            # If there is no model, create one empty
-            self.print(
-                "There was no model. " "Creating a new empty model.", 0, 2
-            )
-            self.clf = SGDClassifier(
-                warm_start=True, loss="hinge", penalty="l1"
-            )
-        except EOFError:
-            self.print(
-                "Error reading model from disk. "
-                "Creating a new empty model.",
-                0,
-                2,
-            )
-            self.clf = SGDClassifier(
-                warm_start=True, loss="hinge", penalty="l1"
-            )
-
-    def set_evidence_malicious_flow(self, flow: dict, twid: str):
-        confidence: float = 0.1
-        description = (
-            f"Flow with malicious characteristics by ML. Src IP"
-            f" {flow['saddr']}:{flow['sport']} to "
-            f"{flow['daddr']}:{flow['dport']}"
-        )
-        twid_number = int(twid.replace("timewindow", ""))
-        evidence: Evidence = Evidence(
-            evidence_type=EvidenceType.MALICIOUS_FLOW,
-            attacker=Attacker(
-                direction=Direction.SRC,
-                ioc_type=IoCType.IP,
-                value=flow["saddr"],
-            ),
-            victim=Victim(
-                direction=Direction.DST,
-                ioc_type=IoCType.IP,
-                value=flow["daddr"],
-            ),
-            threat_level=ThreatLevel.LOW,
-            confidence=confidence,
-            description=description,
-            profile=ProfileID(ip=flow["saddr"]),
-            timewindow=TimeWindow(twid_number),
-            uid=[flow["uid"]],
-            timestamp=flow["starttime"],
-            method=Method.AI,
-            src_port=flow["sport"],
-            dst_port=flow["dport"],
-        )
-
-        self.db.set_evidence(evidence)
-
-    def shutdown_gracefully(self):
-        # Confirm that the module is done processing
-        if self.mode == "train":
-            self.store_model()
-
-    def pre_main(self):
-        utils.drop_root_privs()
-        # Load the model
-        self.read_model()
-
-    def main(self):
-        if msg := self.get_msg("new_flow"):
-            # When a new flow arrives
-            msg = json.loads(msg["data"])
-            self.twid = msg["twid"]
-            self.profileid = msg["profileid"]
-            self.flow = msg["flow"]
-            # These following extra fields are expected in testing. update the original
-            # flow dict to have them
-            self.flow.update(
-                {
-                    "state": msg["interpreted_state"],
-                    "label": msg["label"],
-                    "module_labels": msg["module_labels"],
-                }
-            )
-
-            if self.mode == "train":
-                # We are training
-
-                # Is the amount in the DB of labels enough to retrain?
-                # Use labeled flows
-                labels = self.db.get_labels()
-                sum_labeled_flows = sum(i[1] for i in labels)
-
-                # The min labels to retrain is the min number of flows 
-                # we should have seen so far in this capture to start training
-                # This is so we dont _start_ training with only 1 flow
-
-                # Once we are over the start minimum, the second condition is 
-                # to force to retrain every a minimum_labels_to_retrain number
-                # of flows. So we dont retrain every 1 flow.
-                if (
-                    sum_labeled_flows >= self.minimum_labels_to_start_train
-                ):
-                    if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain):
-                        # So for example we retrain every 50 labels and only when
-                        # we have at least 50 labels
-                        self.print(
-                            f"Training the model with the last group of "
-                            f"flows and labels. Total flows: {sum_labeled_flows}."
-                        )
-                        # Process all flows in the DB and make them ready
-                        # for pandas
-                        self.process_training_flows(self.last_number_of_flows_when_trained)
-                        # Train an algorithm
-                        self.train(sum_labeled_flows)
-                        self.last_number_of_flows_when_trained = sum_labeled_flows
-
-            elif self.mode == "test":
-                # We are testing, which means using the model to detect
-                processed_flow = self.process_flow(self.flow)
-
-                # After processing the flow, it may happen that we
-                # delete icmp/arp/etc so the dataframe can be empty
-                if processed_flow is not None and not processed_flow.empty:
-                    # Predict
-                    pred: numpy.ndarray = self.detect(processed_flow)
-                    if not pred:
-                        # an error occurred
-                        return
-
-                    label = self.flow["label"]
-                    if label and label != "unknown" and label != pred[0]:
-                        # If the user specified a label in test mode,
-                        # and the label is diff from the prediction,
-                        # print in debug mode
-                        self.print(
-                            f"Predicted {pred[0]} for ground-truth label"
-                            f' {label}. Flow {self.flow["saddr"]}:'
-                            f'{self.flow["sport"]} ->'
-                            f' {self.flow["daddr"]}:'
-                            f'{self.flow["dport"]}/'
-                            f'{self.flow["proto"]}',
-                            0,
-                            3,
-                        )
-                    if pred[0] == "Malicious":
-                        # Generate an alert
-                        self.set_evidence_malicious_flow(self.flow, self.twid)
-                        self.print(
-                            f"Prediction {pred[0]} for label {label}"
-                            f' flow {self.flow["saddr"]}:'
-                            f'{self.flow["sport"]} -> '
-                            f'{self.flow["daddr"]}:'
-                            f'{self.flow["dport"]}/'
-                            f'{self.flow["proto"]}',
-                            0,
-                            2,
-                        )
+import matplotlib.pyplot as plt
+import sys
+import numpy as np
+
+def process_file(file_path):
+    # Initialize the counters for the values
+    FPR_values = []
+    FNR_values = []
+    TNR_values = []
+    TPR_values = []
+    F1_values = []
+    accuracy_values = []
+    precision_values = []
+    MCC_values = []
+    recall_values = []
+    
+    # Read the file and extract the data
+    with open(file_path, 'r') as file:
+        for line in file:
+            if "TP:" in line:
+                # Extract the values from the line
+                parts = line.split(',')
+                TP = int(parts[0].split(':')[1].strip())
+                TN = int(parts[1].split(':')[1].strip())
+                FP = int(parts[2].split(':')[1].strip())
+                FN = int(parts[3].split(':')[1].strip())
+
+                # Calculate metrics
+                FPR = FP / (FP + TN) if (FP + TN) != 0 else 0
+                FNR = FN / (FN + TP) if (FN + TP) != 0 else 0
+                TNR = TN / (TN + FP) if (TN + FP) != 0 else 0
+                TPR = TP / (TP + FN) if (TP + FN) != 0 else 0
+                Precision = TP / (TP + FP) if (TP + FP) != 0 else 0
+                Recall = TPR  # Recall is the same as TPR
+                F1 = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) != 0 else 0
+                Accuracy = (TP + TN) / (TP + TN + FP + FN)
+                MCC = ((TP * TN) - (FP * FN)) / np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) if ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) != 0 else 0
+                
+                # Append the values to the respective lists
+                FPR_values.append(FPR)
+                FNR_values.append(FNR)
+                TNR_values.append(TNR)
+                TPR_values.append(TPR)
+                F1_values.append(F1)
+                accuracy_values.append(Accuracy)
+                precision_values.append(Precision)
+                MCC_values.append(MCC)
+                recall_values.append(Recall)
+    
+    return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values
+
+def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values):
+    # Separate the values into two groups based on their proximity to 0 or 1
+    close_to_0 = {
+        'FPR': [], 'FNR': []
+    }
+    close_to_1 = {
+        'TNR': [], 'TPR': [], 'F1': [], 'accuracy': [], 'precision': [], 'MCC': [], 'recall': []
+    }
+    
+    # Categorize the metrics into two groups
+    for i in range(len(FPR_values)):
+        close_to_0['FPR'].append(FPR_values[i])
+        close_to_0['FNR'].append(FNR_values[i])
+        
+        close_to_1['TNR'].append(TNR_values[i])
+        close_to_1['TPR'].append(TPR_values[i])
+        close_to_1['F1'].append(F1_values[i])
+        close_to_1['accuracy'].append(accuracy_values[i])
+        close_to_1['precision'].append(precision_values[i])
+        close_to_1['MCC'].append(MCC_values[i])
+        close_to_1['recall'].append(recall_values[i])
+
+    # Plot metrics for values close to 0
+    plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True)
+    
+    # Plot metrics for values close to 1
+    plot_single_group(close_to_1, 'metrics_plot_close_to_1.png')
+
+    # Print the final values
+    print("\nFinal Metric Values:")
+    print(f"Final FPR: {FPR_values[-1]:.4f}")
+    print(f"Final FNR: {FNR_values[-1]:.4f}")
+    print(f"Final TNR: {TNR_values[-1]:.4f}")
+    print(f"Final TPR: {TPR_values[-1]:.4f}")
+    print(f"Final F1 Score: {F1_values[-1]:.4f}")
+    print(f"Final Accuracy: {accuracy_values[-1]:.4f}")
+    print(f"Final Precision: {precision_values[-1]:.4f}")
+    print(f"Final MCC: {MCC_values[-1]:.4f}")
+    print(f"Final Recall: {recall_values[-1]:.4f}")
+
+def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):
+    plt.figure(figsize=(12, 8))
+    
+    # Only plot the metrics that exist in the dictionary
+    if 'FPR' in metrics_dict:
+        plt.plot(metrics_dict['FPR'], label='False Positive Rate (FPR)', marker='o')
+    if 'FNR' in metrics_dict:
+        plt.plot(metrics_dict['FNR'], label='False Negative Rate (FNR)', marker='o')
+    if 'TNR' in metrics_dict:
+        plt.plot(metrics_dict['TNR'], label='True Negative Rate (TNR)', marker='o')
+    if 'TPR' in metrics_dict:
+        plt.plot(metrics_dict['TPR'], label='True Positive Rate (TPR)', marker='o')
+    if 'F1' in metrics_dict:
+        plt.plot(metrics_dict['F1'], label='F1 Score', marker='o')
+    if 'accuracy' in metrics_dict:
+        plt.plot(metrics_dict['accuracy'], label='Accuracy', marker='o')
+    if 'precision' in metrics_dict:
+        plt.plot(metrics_dict['precision'], label='Precision', marker='o')
+    if 'MCC' in metrics_dict:
+        plt.plot(metrics_dict['MCC'], label='Matthews Correlation Coefficient (MCC)', marker='o')
+    if 'recall' in metrics_dict:
+        plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o')
+
+    # Apply log scale by default
+    plt.yscale('log')
+
+    # If the plot is close to 0, set custom ticks
+    if is_close_to_0:
+        # Add more ticks between 0 and 1 (using a logarithmic scale)
+        plt.yticks([0.01, 0.1, 1, 10, 100], ['0.01', '0.1', '1', '10', '100'])
+
+    plt.xlabel('Index')
+    plt.ylabel('Metric Value')
+    plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})')
+    plt.legend()
+    
+    # Save the plot
+    plt.savefig(output_filename)
+    plt.close()
+
+def main():
+    if len(sys.argv) != 2:
+        print("Usage: python script.py <file_path>")
+        sys.exit(1)
+    
+    file_path = sys.argv[1]
+    
+    FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path)
+    plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values)
+
+if __name__ == "__main__":
+    main()

From 511291517c0ef8a3b791ba1accc72b83363e0425 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:20:52 +0000
Subject: [PATCH 104/455] Fix plots

---
 .../plot_testing_performance.py               | 25 ++++++++++++-------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index 8f9e12cd86..69b8c96a8c 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -71,11 +71,11 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu
         close_to_1['MCC'].append(MCC_values[i])
         close_to_1['recall'].append(recall_values[i])
 
-    # Plot metrics for values close to 0
+    # Plot metrics for values close to 0 (linear scale)
     plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True)
     
-    # Plot metrics for values close to 1
-    plot_single_group(close_to_1, 'metrics_plot_close_to_1.png')
+    # Plot metrics for values close to 1 (log scale)
+    plot_single_group(close_to_1, 'metrics_plot_close_to_1.png', is_close_to_0=False)
 
     # Print the final values
     print("\nFinal Metric Values:")
@@ -112,14 +112,21 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):
     if 'recall' in metrics_dict:
         plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o')
 
-    # Apply log scale by default
-    plt.yscale('log')
+    # If the plot is close to 1, apply log scale
+    if not is_close_to_0:
+        plt.yscale('log')
 
-    # If the plot is close to 0, set custom ticks
+    # If the plot is close to 0, set dynamic Y-ticks based on the min/max values of the series
     if is_close_to_0:
-        # Manually set more Y-ticks for better visibility
-        plt.ylim(0.0001, 1)  # Set Y-axis limits between 0.0001 and 1
-        plt.yticks([0.0001, 0.001, 0.01, 0.1, 1], ['0.0001', '0.001', '0.01', '0.1', '1'])  # Adjust Y-ticks
+        min_val = min(min(metrics_dict['FPR']), min(metrics_dict['FNR']))
+        max_val = max(max(metrics_dict['FPR']), max(metrics_dict['FNR']))
+        
+        # Avoid log(0), so set the minimum limit a little higher than zero
+        if min_val == 0:
+            min_val = 1e-4  # Avoid zero values on the logarithmic scale
+
+        plt.ylim(min_val, max_val)  # Set Y-axis limits based on the data range
+        plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6))  # Set ticks logarithmically
 
     plt.xlabel('Index')
     plt.ylabel('Metric Value')

From 17a9c9a356bc8cf489c80dcc736124a3dc22b7b9 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:23:02 +0000
Subject: [PATCH 105/455] Fix plots

---
 modules/flowmldetection/plot_testing_performance.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index 69b8c96a8c..de4ada38b3 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -123,10 +123,10 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):
         
         # Avoid log(0), so set the minimum limit a little higher than zero
         if min_val == 0:
-            min_val = 1e-4  # Avoid zero values on the logarithmic scale
+            min_val = 1e-8  # Avoid zero values on the logarithmic scale
 
         plt.ylim(min_val, max_val)  # Set Y-axis limits based on the data range
-        plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6))  # Set ticks logarithmically
+        plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=60))  # Set ticks logarithmically
 
     plt.xlabel('Index')
     plt.ylabel('Metric Value')

From 8561011f8b5d0d3a50932d6f1ff16d90b9986a18 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:25:58 +0000
Subject: [PATCH 106/455] Change plot names

---
 modules/flowmldetection/plot_testing_performance.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index de4ada38b3..1b4152c6eb 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -72,10 +72,10 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu
         close_to_1['recall'].append(recall_values[i])
 
     # Plot metrics for values close to 0 (linear scale)
-    plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True)
+    plot_single_group(close_to_0, 'performance_metrics_testing_close_to_0.png', is_close_to_0=True)
     
     # Plot metrics for values close to 1 (log scale)
-    plot_single_group(close_to_1, 'metrics_plot_close_to_1.png', is_close_to_0=False)
+    plot_single_group(close_to_1, 'performnace_metrics_teting_close_to_1.png', is_close_to_0=False)
 
     # Print the final values
     print("\nFinal Metric Values:")

From 75db21d8225a7e8ad9ae41e33b1f64f6e1ccf598 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:26:09 +0000
Subject: [PATCH 107/455] Rename file

---
 .../{plot_train_score.py => plot_train_performance.py}          | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 rename modules/flowmldetection/{plot_train_score.py => plot_train_performance.py} (97%)

diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_performance.py
similarity index 97%
rename from modules/flowmldetection/plot_train_score.py
rename to modules/flowmldetection/plot_train_performance.py
index 8437e968ac..80e13e9515 100644
--- a/modules/flowmldetection/plot_train_score.py
+++ b/modules/flowmldetection/plot_train_performance.py
@@ -28,7 +28,7 @@ def plot_log_data(file_path):
 
     # Get the directory of the log file to store the plot in the same folder
     dir_name = os.path.dirname(file_path)
-    plot_file = os.path.join(dir_name, 'log_data_plot_with_two_scales.png')
+    plot_file = os.path.join(dir_name, 'performance_metrics_training.png')
 
     # Plotting the values
     fig, ax1 = plt.subplots(figsize=(10, 6))

From 4a16fd6ebe7893df77dd14898c3270a989193e21 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:31:32 +0000
Subject: [PATCH 108/455] Recover good flowmldetection deleted by mistake

---
 modules/flowmldetection/flowmldetection.py | 709 ++++++++++++++++-----
 1 file changed, 566 insertions(+), 143 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 37f0761109..5e4e9aa462 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -1,143 +1,566 @@
-import matplotlib.pyplot as plt
-import sys
-import numpy as np
-
-def process_file(file_path):
-    # Initialize the counters for the values
-    FPR_values = []
-    FNR_values = []
-    TNR_values = []
-    TPR_values = []
-    F1_values = []
-    accuracy_values = []
-    precision_values = []
-    MCC_values = []
-    recall_values = []
-    
-    # Read the file and extract the data
-    with open(file_path, 'r') as file:
-        for line in file:
-            if "TP:" in line:
-                # Extract the values from the line
-                parts = line.split(',')
-                TP = int(parts[0].split(':')[1].strip())
-                TN = int(parts[1].split(':')[1].strip())
-                FP = int(parts[2].split(':')[1].strip())
-                FN = int(parts[3].split(':')[1].strip())
-
-                # Calculate metrics
-                FPR = FP / (FP + TN) if (FP + TN) != 0 else 0
-                FNR = FN / (FN + TP) if (FN + TP) != 0 else 0
-                TNR = TN / (TN + FP) if (TN + FP) != 0 else 0
-                TPR = TP / (TP + FN) if (TP + FN) != 0 else 0
-                Precision = TP / (TP + FP) if (TP + FP) != 0 else 0
-                Recall = TPR  # Recall is the same as TPR
-                F1 = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) != 0 else 0
-                Accuracy = (TP + TN) / (TP + TN + FP + FN)
-                MCC = ((TP * TN) - (FP * FN)) / np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) if ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) != 0 else 0
-                
-                # Append the values to the respective lists
-                FPR_values.append(FPR)
-                FNR_values.append(FNR)
-                TNR_values.append(TNR)
-                TPR_values.append(TPR)
-                F1_values.append(F1)
-                accuracy_values.append(Accuracy)
-                precision_values.append(Precision)
-                MCC_values.append(MCC)
-                recall_values.append(Recall)
-    
-    return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values
-
-def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values):
-    # Separate the values into two groups based on their proximity to 0 or 1
-    close_to_0 = {
-        'FPR': [], 'FNR': []
-    }
-    close_to_1 = {
-        'TNR': [], 'TPR': [], 'F1': [], 'accuracy': [], 'precision': [], 'MCC': [], 'recall': []
-    }
-    
-    # Categorize the metrics into two groups
-    for i in range(len(FPR_values)):
-        close_to_0['FPR'].append(FPR_values[i])
-        close_to_0['FNR'].append(FNR_values[i])
-        
-        close_to_1['TNR'].append(TNR_values[i])
-        close_to_1['TPR'].append(TPR_values[i])
-        close_to_1['F1'].append(F1_values[i])
-        close_to_1['accuracy'].append(accuracy_values[i])
-        close_to_1['precision'].append(precision_values[i])
-        close_to_1['MCC'].append(MCC_values[i])
-        close_to_1['recall'].append(recall_values[i])
-
-    # Plot metrics for values close to 0
-    plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True)
-    
-    # Plot metrics for values close to 1
-    plot_single_group(close_to_1, 'metrics_plot_close_to_1.png')
-
-    # Print the final values
-    print("\nFinal Metric Values:")
-    print(f"Final FPR: {FPR_values[-1]:.4f}")
-    print(f"Final FNR: {FNR_values[-1]:.4f}")
-    print(f"Final TNR: {TNR_values[-1]:.4f}")
-    print(f"Final TPR: {TPR_values[-1]:.4f}")
-    print(f"Final F1 Score: {F1_values[-1]:.4f}")
-    print(f"Final Accuracy: {accuracy_values[-1]:.4f}")
-    print(f"Final Precision: {precision_values[-1]:.4f}")
-    print(f"Final MCC: {MCC_values[-1]:.4f}")
-    print(f"Final Recall: {recall_values[-1]:.4f}")
-
-def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):
-    plt.figure(figsize=(12, 8))
-    
-    # Only plot the metrics that exist in the dictionary
-    if 'FPR' in metrics_dict:
-        plt.plot(metrics_dict['FPR'], label='False Positive Rate (FPR)', marker='o')
-    if 'FNR' in metrics_dict:
-        plt.plot(metrics_dict['FNR'], label='False Negative Rate (FNR)', marker='o')
-    if 'TNR' in metrics_dict:
-        plt.plot(metrics_dict['TNR'], label='True Negative Rate (TNR)', marker='o')
-    if 'TPR' in metrics_dict:
-        plt.plot(metrics_dict['TPR'], label='True Positive Rate (TPR)', marker='o')
-    if 'F1' in metrics_dict:
-        plt.plot(metrics_dict['F1'], label='F1 Score', marker='o')
-    if 'accuracy' in metrics_dict:
-        plt.plot(metrics_dict['accuracy'], label='Accuracy', marker='o')
-    if 'precision' in metrics_dict:
-        plt.plot(metrics_dict['precision'], label='Precision', marker='o')
-    if 'MCC' in metrics_dict:
-        plt.plot(metrics_dict['MCC'], label='Matthews Correlation Coefficient (MCC)', marker='o')
-    if 'recall' in metrics_dict:
-        plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o')
-
-    # Apply log scale by default
-    plt.yscale('log')
-
-    # If the plot is close to 0, set custom ticks
-    if is_close_to_0:
-        # Add more ticks between 0 and 1 (using a logarithmic scale)
-        plt.yticks([0.01, 0.1, 1, 10, 100], ['0.01', '0.1', '1', '10', '100'])
-
-    plt.xlabel('Index')
-    plt.ylabel('Metric Value')
-    plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})')
-    plt.legend()
-    
-    # Save the plot
-    plt.savefig(output_filename)
-    plt.close()
-
-def main():
-    if len(sys.argv) != 2:
-        print("Usage: python script.py <file_path>")
-        sys.exit(1)
-    
-    file_path = sys.argv[1]
-    
-    FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path)
-    plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values)
-
-if __name__ == "__main__":
-    main()
+# SPDX-FileCopyrightText: 2021 Sebastian Garcia <sebastian.garcia@agents.fel.cvut.cz>
+from typing import Optional
+
+# SPDX-License-Identifier: GPL-2.0-only
+import numpy
+from sklearn.linear_model import SGDClassifier
+from sklearn.preprocessing import StandardScaler
+import pickle
+import pandas as pd
+import json
+import traceback
+import warnings
+import os
+
+from slips_files.common.parsers.config_parser import ConfigParser
+from slips_files.common.slips_utils import utils
+from slips_files.common.abstracts.module import IModule
+from slips_files.core.structures.evidence import (
+    Evidence,
+    ProfileID,
+    TimeWindow,
+    Attacker,
+    ThreatLevel,
+    EvidenceType,
+    IoCType,
+    Direction,
+    Victim,
+    Method,
+)
+
+# This horrible hack is only to stop sklearn from printing those warnings
+def warn(*args, **kwargs):
+    pass
+
+
+warnings.warn = warn
+
+
+class FlowMLDetection(IModule):
+    # Name: short name of the module. Do not use spaces
+    name = "Flow ML Detection"
+    description = (
+        "Train or test a Machine Learning model to detect malicious flows"
+    )
+    authors = ["Sebastian Garcia"]
+
+    def init(self):
+        # Subscribe to the channel
+        self.c1 = self.db.subscribe("new_flow")
+        self.channels = {"new_flow": self.c1}
+        self.fieldseparator = self.db.get_field_separator()
+        # Set the output queue of our database instance
+        # Read the configuration
+        self.read_configuration()
+        # Minum amount of new labels needed to start the train
+        self.minimum_labels_to_start_train = 50
+        # Minum amount of new labels needed to retrain
+        self.minimum_labels_to_retrain = 50
+        # The number of flows when last trained. Used internally only to know
+        # when to retrain
+        self.last_number_of_flows_when_trained = 0
+        # The scaler trained during training and to use during testing
+        self.scaler = StandardScaler()
+        self.model_path = "./modules/flowmldetection/model.bin"
+        self.scaler_path = "./modules/flowmldetection/scaler.bin"
+
+        # Initialize the training log file
+        self.training_log_path = "./modules/flowmldetection/training.log"
+        with open(self.training_log_path, "w") as log_file:
+            log_file.write("Training Log Initialized\n")
+
+    def read_configuration(self):
+        conf = ConfigParser()
+        self.mode = conf.get_ml_mode()
+        # This is the global label in the configuration,
+        # in case the flows do not have a label themselves
+        self.label = conf.label()
+
+    def write_to_training_log(self, message: str):
+        """
+        Write a message to the training log file.
+        """
+        try:
+            with open(self.training_log_path, "a") as log_file:
+                log_file.write(message + "\n")
+        except Exception as e:
+            self.print(f"Error writing to training log: {e}", 0, 1)
+
+    def train(self, sum_labeled_flows):
+        """
+        Train a model based on the flows we receive and the labels
+        """
+        try:
+            # Create X_flow with the current flows minus the label
+            X_flow = self.flows.drop("ground_truth_label", axis=1)
+            # Drop the detailed labels
+            X_flow = X_flow.drop("detailed_ground_truth_label", axis=1)
+            # Drop the module_labels
+            X_flow = X_flow.drop("module_labels", axis=1)
+            # Create y_flow with the label
+            y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label)
+
+            # Normalize this batch of data so far. This can get progressivle slow
+            X_flow = self.scaler.fit_transform(X_flow)
+
+            # Count the number of labels of each type in this epoc
+            epoch_label_counts = {
+                "Background": (y_flow == "Background").sum(),
+                "Malicious": (y_flow == "Malicious").sum(),
+                "Benign": (y_flow == "Benign").sum(),
+            }
+
+            # Train
+            try:
+                # Online incremental learning
+                self.clf.partial_fit(
+                    X_flow, y_flow, classes=["Background", "Malicious", "Benign"]
+                )
+            except Exception:
+                self.print("Error while calling clf.train()")
+                self.print(traceback.format_exc(), 0, 1)
+
+            # See score so far in training
+            score = self.clf.score(X_flow, y_flow)
+
+            #self.print(f"	Training Score: {score}", 1, 0)
+            #self.print(f'    Model Parameters: {self.clf.coef_}', 1, 0)
+
+            # Store the models on disk
+            self.store_model()
+
+            # Log training information
+            self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}")
+            #self.write_to_training_log(f"Model parameters: {self.clf.coef_}")
+        except Exception:
+            self.print("Error in train().", 0, 1)
+            self.print(traceback.format_exc(), 0, 1)
+            self.write_to_training_log("Error occurred during training.")
+
+    def process_features(self, dataset):
+        """
+        Discards some features of the dataset and can create new.
+        Clean the dataset
+        """
+        try:
+            # Discard some type of flows that dont have ports
+            to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""]
+            for proto in to_discard:
+                dataset = dataset[dataset.proto != proto]
+
+            # If te proto is in the list to delete and there is only one flow, then the dataset will be empty
+            if dataset.empty:
+                # DataFrame is empty now, so return empty
+                return dataset
+
+            # For now, discard these
+            to_drop = [
+                "appproto",
+                "daddr",
+                "saddr",
+                "starttime",
+                "type_",
+                "smac",
+                "dmac",
+                "history",
+                "uid",
+                "dir_",
+                "endtime",
+                "flow_source",
+            ]
+            for field in to_drop:
+                try:
+                    dataset = dataset.drop(field, axis=1)
+                except (ValueError, KeyError):
+                    pass
+
+            # When flows are read from Slips sqlite,
+            # the state is not transformed to 'Established' or
+            # 'Not Established', it is still 'S0' and others
+            # So transform here
+            dataset["state"] = dataset.apply(
+                lambda row: self.db.get_final_state_from_flags(
+                    row["state"], (row["spkts"] + row["dpkts"])
+                ),
+                axis=1,
+            )
+
+            # Convert state to categorical
+            dataset.state = dataset.state.str.replace(
+                r"(^.*Not Established.*$)", "0", regex=True
+            )
+            dataset.state = dataset.state.str.replace(
+                r"(^.*Established.*$)", "1", regex=True
+            )
+
+            # Convert categories to floats
+            dataset.state = dataset.state.astype("float64")
+
+            # Convert proto to categorical. For now we only have few states, so we can hardcode...
+            # We dont use the data to create categories because in testing mode
+            # we dont see all the protocols
+            # Also we dont store the Categorizer because the user can retrain
+            # with its own data.
+            dataset.proto = dataset.proto.str.lower()
+            dataset.proto = dataset.proto.str.replace(
+                r"(^.*tcp.*$)", "0", regex=True
+            )
+            dataset.proto = dataset.proto.str.replace(
+                r"(^.*udp.*$)", "1", regex=True
+            )
+            dataset.proto = dataset.proto.str.replace(
+                r"(^.*icmp.*$)", "2", regex=True
+            )
+            dataset.proto = dataset.proto.str.replace(
+                r"(^.*icmp-ipv6.*$)", "3", regex=True
+            )
+            dataset.proto = dataset.proto.str.replace(
+                r"(^.*arp.*$)", "4", regex=True
+            )
+
+            dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"]
+            dataset["pkts"] = dataset["spkts"] + dataset["dpkts"]
+
+            fields_to_convert_to_float = [
+                dataset.proto,
+                dataset.dport,
+                dataset.sport,
+                dataset.dur,
+                dataset.pkts,
+                dataset.spkts,
+                dataset.allbytes,
+                dataset.sbytes,
+                dataset.state,
+            ]
+            for field in fields_to_convert_to_float:
+                try:
+                    field = field.astype("float64")
+                except (ValueError, AttributeError):
+                    pass
+
+            return dataset
+        except Exception:
+            # Stop the timer
+            self.print("Error in process_features()")
+            self.print(traceback.format_exc(), 0, 1)
+
+    def process_training_flows(self, last_number_of_flows_when_trained):
+        """
+        Process only the new flows in the DB since the last training.
+        Store the pandas df in self.flows
+        """
+        try:
+            # Ensure the index is an integer
+            if last_number_of_flows_when_trained is None:
+                last_number_of_flows_when_trained = 0
+            else:
+                last_number_of_flows_when_trained = int(last_number_of_flows_when_trained)
+
+            # We get all the flows so far
+            flows = self.db.get_all_flows()
+            # Only process new flows since last training
+            new_flows = flows[last_number_of_flows_when_trained:]
+
+            # Check how many **different** labels are in the DB
+            labels = self.db.get_labels()
+            if len(labels) == 1:
+                # Insert fake flows for both classes if needed
+                new_flows.append(
+                    {
+                        "starttime": 1594417039.029793,
+                        "dur": "1.9424750804901123",
+                        "saddr": "10.7.10.101",
+                        "sport": "49733",
+                        "daddr": "40.70.224.145",
+                        "dport": "443",
+                        "proto": "tcp",
+                        "state": "SF",
+                        "spkts": 17,
+                        "dpkts": 27,
+                        "sbytes": 25517,
+                        "dbytes": 17247,
+                        "appproto": "ssl",
+                        "ground_truth_label": "Malicious",
+                        "module_labels": {
+                            "flowalerts-long-connection": "Malicious"
+                        },
+                    }
+                )
+                new_flows.append(
+                    {
+                        "starttime": 1382355032.706468,
+                        "dur": "10.896695",
+                        "saddr": "147.32.83.52",
+                        "sport": "47956",
+                        "daddr": "80.242.138.72",
+                        "dport": "80",
+                        "proto": "tcp",
+                        "state": "SF",
+                        "spkts": 1,
+                        "dpkts": 0,
+                        "sbytes": 100,
+                        "dbytes": 67596,
+                        "appproto": "http",
+                        "ground_truth_label": "Benign",
+                        "module_labels": {
+                            "flowalerts-long-connection": "Benign"
+                        },
+                    }
+                )
+
+            # Convert to pandas df
+            df_flows = pd.DataFrame(new_flows)
+
+            # Process features
+            df_flows = self.process_features(df_flows)
+
+            # Update the flow to the processed version
+            self.flows = df_flows
+        except Exception:
+            self.print("Error in process_flows()")
+            self.print(traceback.format_exc(), 0, 1)
+
+    def process_flow(self, flow_to_process: dict):
+        """
+        Process one flow. Only used during detection in testing
+        returns the pandas df with the processed flow
+        """
+        try:
+            # Convert the flow to a pandas dataframe
+            raw_flow = pd.DataFrame(flow_to_process, index=[0])
+            dflow = self.process_features(raw_flow)
+            if dflow.empty:
+                return None
+            # Update the flow to the processed version
+            return dflow
+        except Exception:
+            # Stop the timer
+            self.print("Error in process_flow()")
+            self.print(traceback.format_exc(), 0, 1)
+
+    def detect(self, x_flow) -> Optional[numpy.ndarray]:
+        """
+        Detects the given flow with the current model stored
+        and returns the predection array
+        """
+        try:
+            # clean the flow
+            fields_to_drop = [
+                "label",
+                "module_labels",
+                "uid",
+                "history",
+                "dir_",
+                "endtime",
+                "flow_source",
+                "ground_truth_label",
+                "detailed_ground_truth_label",
+            ]
+            # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes.
+            # Error
+            ''' [Flow ML Detection] Error in detect() while processing                                                                                                                                                                                                                         
+            dur proto  sport dport  state  pkts  spkts  dpkts  bytes  sbytes  dbytes  allbytes                                                                                                                                                                                    
+            0  63.822830     0  56119   981    0.0    15     15      0   8764    1887       0      1887                                                                                                                                                                                    
+            The feature names should match those that were passed during fit.                                                                                                                                                                                                              
+            Feature names unseen at fit time:                                                                                                                                                                                                                                              
+            - bytes     
+            '''
+
+            # IF we delete here the filed bytes the error is
+            # [Flow ML Detection] Error in detect() while processing 
+            # dur proto sport dport  state  pkts  spkts  dpkts  sbytes  dbytes allbytes                                                                                                                                                             
+            # 0  63.822830     0  56120   980    0.0    15     15      0    1887       0      1887                                                                                                                                                             
+            # The feature names should match those that were passed during fit.                                                                                                                                                                                
+            # Feature names must be in the same order as they were in fit.                                                                                                                                                                                     
+                                                                                                                                                                                                                                                 
+            for field in fields_to_drop:
+                try:
+                    x_flow = x_flow.drop(field, axis=1)
+                except (KeyError, ValueError):
+                    pass
+            # Scale the flow
+            x_flow: numpy.ndarray = self.scaler.transform(x_flow)
+            pred: numpy.ndarray = self.clf.predict(x_flow)
+            return pred
+        except Exception as e:
+            self.print(
+                f"Error in detect() while processing " f"\n{x_flow}\n{e}"
+            )
+            self.print(traceback.format_exc(), 0, 1)
+
+    def store_model(self):
+        """
+        Store the trained model on disk
+        """
+        self.print("Storing the trained model and scaler on disk.", 0, 2)
+        with open(self.model_path, "wb") as f:
+            data = pickle.dumps(self.clf)
+            f.write(data)
+        with open(self.scaler_path, "wb") as g:
+            data = pickle.dumps(self.scaler)
+            g.write(data)
+
+    def read_model(self):
+        """
+        Read the trained model from disk
+        """
+        try:
+            self.print("Reading the trained model from disk.", 0, 2)
+            with open(self.model_path, "rb") as f:
+                self.clf = pickle.load(f)
+            self.print("Reading the trained scaler from disk.", 0, 2)
+            with open(self.scaler_path, "rb") as g:
+                self.scaler = pickle.load(g)
+        except FileNotFoundError:
+            # If there is no model, create one empty
+            self.print(
+                "There was no model. " "Creating a new empty model.", 0, 2
+            )
+            self.clf = SGDClassifier(
+                warm_start=True, loss="hinge", penalty="l1"
+            )
+        except EOFError:
+            self.print(
+                "Error reading model from disk. "
+                "Creating a new empty model.",
+                0,
+                2,
+            )
+            self.clf = SGDClassifier(
+                warm_start=True, loss="hinge", penalty="l1"
+            )
+
+    def set_evidence_malicious_flow(self, flow: dict, twid: str):
+        confidence: float = 0.1
+        description = (
+            f"Flow with malicious characteristics by ML. Src IP"
+            f" {flow['saddr']}:{flow['sport']} to "
+            f"{flow['daddr']}:{flow['dport']}"
+        )
+        twid_number = int(twid.replace("timewindow", ""))
+        evidence: Evidence = Evidence(
+            evidence_type=EvidenceType.MALICIOUS_FLOW,
+            attacker=Attacker(
+                direction=Direction.SRC,
+                ioc_type=IoCType.IP,
+                value=flow["saddr"],
+            ),
+            victim=Victim(
+                direction=Direction.DST,
+                ioc_type=IoCType.IP,
+                value=flow["daddr"],
+            ),
+            threat_level=ThreatLevel.LOW,
+            confidence=confidence,
+            description=description,
+            profile=ProfileID(ip=flow["saddr"]),
+            timewindow=TimeWindow(twid_number),
+            uid=[flow["uid"]],
+            timestamp=flow["starttime"],
+            method=Method.AI,
+            src_port=flow["sport"],
+            dst_port=flow["dport"],
+        )
+
+        self.db.set_evidence(evidence)
+
+    def shutdown_gracefully(self):
+        # Confirm that the module is done processing
+        if self.mode == "train":
+            self.store_model()
+
+    def pre_main(self):
+        utils.drop_root_privs()
+        # Load the model
+        self.read_model()
+
+    def main(self):
+        if msg := self.get_msg("new_flow"):
+            # When a new flow arrives
+            msg = json.loads(msg["data"])
+            self.twid = msg["twid"]
+            self.profileid = msg["profileid"]
+            self.flow = msg["flow"]
+            # These following extra fields are expected in testing. update the original
+            # flow dict to have them
+            self.flow.update(
+                {
+                    "state": msg["interpreted_state"],
+                    "label": msg["label"],
+                    "module_labels": msg["module_labels"],
+                }
+            )
+
+            if self.mode == "train":
+                # We are training
+
+                # Is the amount in the DB of labels enough to retrain?
+                # Use labeled flows
+                labels = self.db.get_labels()
+                sum_labeled_flows = sum(i[1] for i in labels)
+
+                # The min labels to retrain is the min number of flows 
+                # we should have seen so far in this capture to start training
+                # This is so we dont _start_ training with only 1 flow
+
+                # Once we are over the start minimum, the second condition is 
+                # to force to retrain every a minimum_labels_to_retrain number
+                # of flows. So we dont retrain every 1 flow.
+                if (
+                    sum_labeled_flows >= self.minimum_labels_to_start_train
+                ):
+                    if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain):
+                        # So for example we retrain every 50 labels and only when
+                        # we have at least 50 labels
+                        self.print(
+                            f"Training the model with the last group of "
+                            f"flows and labels. Total flows: {sum_labeled_flows}."
+                        )
+                        # Process all flows in the DB and make them ready
+                        # for pandas
+                        self.process_training_flows(self.last_number_of_flows_when_trained)
+                        # Train an algorithm
+                        self.train(sum_labeled_flows)
+                        self.last_number_of_flows_when_trained = sum_labeled_flows
+
+            elif self.mode == "test":
+                # We are testing, which means using the model to detect
+                processed_flow = self.process_flow(self.flow)
+
+                # After processing the flow, it may happen that we
+                # delete icmp/arp/etc so the dataframe can be empty
+                if processed_flow is not None and not processed_flow.empty:
+                    # Predict
+                    pred: numpy.ndarray = self.detect(processed_flow)
+                    if not pred:
+                        # an error occurred
+                        return
+
+                    label = self.flow["label"]
+                    if label and label != "unknown" and label != pred[0]:
+                        # If the user specified a label in test mode,
+                        # and the label is diff from the prediction,
+                        # print in debug mode
+                        self.print(
+                            f"Predicted {pred[0]} for ground-truth label"
+                            f' {label}. Flow {self.flow["saddr"]}:'
+                            f'{self.flow["sport"]} ->'
+                            f' {self.flow["daddr"]}:'
+                            f'{self.flow["dport"]}/'
+                            f'{self.flow["proto"]}',
+                            0,
+                            3,
+                        )
+                    if pred[0] == "Malicious":
+                        # Generate an alert
+                        self.set_evidence_malicious_flow(self.flow, self.twid)
+                        self.print(
+                            f"Prediction {pred[0]} for label {label}"
+                            f' flow {self.flow["saddr"]}:'
+                            f'{self.flow["sport"]} -> '
+                            f'{self.flow["daddr"]}:'
+                            f'{self.flow["dport"]}/'
+                            f'{self.flow["proto"]}',
+                            0,
+                            2,
+                        )
\ No newline at end of file

From 1fcb086b1a756442a338e39e63634b1c95402d21 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:31:43 +0000
Subject: [PATCH 109/455] Fix plot test

---
 modules/flowmldetection/plot_testing_performance.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index 1b4152c6eb..977a68b2d5 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -75,7 +75,7 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu
     plot_single_group(close_to_0, 'performance_metrics_testing_close_to_0.png', is_close_to_0=True)
     
     # Plot metrics for values close to 1 (log scale)
-    plot_single_group(close_to_1, 'performnace_metrics_teting_close_to_1.png', is_close_to_0=False)
+    plot_single_group(close_to_1, 'performance_metrics_testing_close_to_1.png', is_close_to_0=False)
 
     # Print the final values
     print("\nFinal Metric Values:")
@@ -123,10 +123,10 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):
         
         # Avoid log(0), so set the minimum limit a little higher than zero
         if min_val == 0:
-            min_val = 1e-8  # Avoid zero values on the logarithmic scale
+            min_val = 1e-4  # Avoid zero values on the logarithmic scale
 
         plt.ylim(min_val, max_val)  # Set Y-axis limits based on the data range
-        plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=60))  # Set ticks logarithmically
+        plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6))  # Set ticks logarithmically
 
     plt.xlabel('Index')
     plt.ylabel('Metric Value')

From 7b18a530e0525f810109cc4ea78138707a588d24 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:50:33 +0000
Subject: [PATCH 110/455] Add testing code to evaluate performance. It is
 optional with a varible

---
 modules/flowmldetection/flowmldetection.py | 60 +++++++++++++++-------
 1 file changed, 42 insertions(+), 18 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 5e4e9aa462..b17a1baaf0 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -526,36 +526,21 @@ def main(self):
             elif self.mode == "test":
                 # We are testing, which means using the model to detect
                 processed_flow = self.process_flow(self.flow)
-
                 # After processing the flow, it may happen that we
                 # delete icmp/arp/etc so the dataframe can be empty
                 if processed_flow is not None and not processed_flow.empty:
+                    original_label = processed_flow["ground_truth_label"].iloc[0]
                     # Predict
                     pred: numpy.ndarray = self.detect(processed_flow)
                     if not pred:
                         # an error occurred
                         return
 
-                    label = self.flow["label"]
-                    if label and label != "unknown" and label != pred[0]:
-                        # If the user specified a label in test mode,
-                        # and the label is diff from the prediction,
-                        # print in debug mode
-                        self.print(
-                            f"Predicted {pred[0]} for ground-truth label"
-                            f' {label}. Flow {self.flow["saddr"]}:'
-                            f'{self.flow["sport"]} ->'
-                            f' {self.flow["daddr"]}:'
-                            f'{self.flow["dport"]}/'
-                            f'{self.flow["proto"]}',
-                            0,
-                            3,
-                        )
                     if pred[0] == "Malicious":
                         # Generate an alert
                         self.set_evidence_malicious_flow(self.flow, self.twid)
                         self.print(
-                            f"Prediction {pred[0]} for label {label}"
+                            f"Prediction {pred[0]} for label {original_label}"
                             f' flow {self.flow["saddr"]}:'
                             f'{self.flow["sport"]} -> '
                             f'{self.flow["daddr"]}:'
@@ -563,4 +548,43 @@ def main(self):
                             f'{self.flow["proto"]}',
                             0,
                             2,
-                        )
\ No newline at end of file
+                        )
+
+                    # So you can disable this code easily. Since it is used only for evaluating a testing
+                    log_testing_data = True
+                    if log_testing_data:
+                        # Initialize counters if not already done
+                        if not hasattr(self, 'tp'):
+                            self.tp = 0
+                        if not hasattr(self, 'tn'):
+                            self.tn = 0
+                        if not hasattr(self, 'fp'):
+                            self.fp = 0
+                        if not hasattr(self, 'fn'):
+                            self.fn = 0
+
+
+                        # Update counters based on predictions and labels
+                        if pred[0] == "Malicious" and original_label == "Malicious":
+                            self.tp += 1
+                        elif pred[0] == "Benign" and original_label == "Benign":
+                            self.tn += 1
+                        elif pred[0] == "Malicious" and original_label == "Benign":
+                            self.fp += 1
+                        elif pred[0] == "Benign" and original_label == "Malicious":
+                            self.fn += 1
+
+                        testing_log_path = "./modules/flowmldetection/testing_performance.log"
+                        try:
+                            with open(testing_log_path, "a") as log_file:
+                                log_file.write("Testing Performance Log Initialized\n")
+                                # Log the testing performance metrics
+                                log_file.write(f"TP: {self.tp}, TN: {self.tn}, FP: {self.fp}, FN: {self.fn}\n")
+
+                                # Log the original flow for false positives and false negatives
+                                if pred[0] == "Malicious" and original_label == "Benign":
+                                    log_file.write(f"False Positive Flow: {self.flow}\n")
+                                elif pred[0] == "Benign" and original_label == "Malicious":
+                                    log_file.write(f"False Negative Flow: {self.flow}\n")
+                        except Exception as e:
+                            self.print(f"Error initializing testing performance log: {e}", 0, 1)
\ No newline at end of file

From 4e8cbda03b1b8c7357c818f241b53b67afc86567 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 19:04:00 +0000
Subject: [PATCH 111/455] Fix plots

---
 .../plot_testing_performance.py               | 30 +++++++++++--------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index 977a68b2d5..6865415cdf 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -1,6 +1,7 @@
 import matplotlib.pyplot as plt
 import sys
 import numpy as np
+import argparse
 
 def process_file(file_path):
     # Initialize the counters for the values
@@ -49,7 +50,7 @@ def process_file(file_path):
     
     return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values
 
-def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values):
+def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, experiment_number):
     # Separate the values into two groups based on their proximity to 0 or 1
     close_to_0 = {
         'FPR': [], 'FNR': []
@@ -72,13 +73,13 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu
         close_to_1['recall'].append(recall_values[i])
 
     # Plot metrics for values close to 0 (linear scale)
-    plot_single_group(close_to_0, 'performance_metrics_testing_close_to_0.png', is_close_to_0=True)
+    plot_single_group(close_to_0, f'performance_metrics_testing_close_to_0_experiment_{experiment_number}.png', experiment_number, is_close_to_0=True)
     
     # Plot metrics for values close to 1 (log scale)
-    plot_single_group(close_to_1, 'performance_metrics_testing_close_to_1.png', is_close_to_0=False)
+    plot_single_group(close_to_1, f'performance_metrics_testing_close_to_1_experiment_{experiment_number}.png', experiment_number, is_close_to_0=False)
 
     # Print the final values
-    print("\nFinal Metric Values:")
+    print("\nFinal Metric Values for Experiment", experiment_number)
     print(f"Final FPR: {FPR_values[-1]:.4f}")
     print(f"Final FNR: {FNR_values[-1]:.4f}")
     print(f"Final TNR: {TNR_values[-1]:.4f}")
@@ -89,7 +90,7 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu
     print(f"Final MCC: {MCC_values[-1]:.4f}")
     print(f"Final Recall: {recall_values[-1]:.4f}")
 
-def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):
+def plot_single_group(metrics_dict, output_filename, experiment_number, is_close_to_0=False):
     plt.figure(figsize=(12, 8))
     
     # Only plot the metrics that exist in the dictionary
@@ -126,11 +127,12 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):
             min_val = 1e-4  # Avoid zero values on the logarithmic scale
 
         plt.ylim(min_val, max_val)  # Set Y-axis limits based on the data range
-        plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6))  # Set ticks logarithmically
+        plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=60))  # Set ticks logarithmically
 
+    # Add the experiment number to the plot title
     plt.xlabel('Index')
     plt.ylabel('Metric Value')
-    plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})')
+    plt.title(f'Experiment {experiment_number} - Evaluation Metrics Over Time')
     plt.legend()
     
     # Save the plot
@@ -138,14 +140,18 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):
     plt.close()
 
 def main():
-    if len(sys.argv) != 2:
-        print("Usage: python script.py <file_path>")
-        sys.exit(1)
+    # Set up argument parsing
+    parser = argparse.ArgumentParser(description='Plot testing performance metrics.')
+    parser.add_argument('-f', '--file', type=str, required=True, help='Path to the testing performance log file')
+    parser.add_argument('-e', '--experiment', type=str, required=True, help='Experiment number')
+
+    args = parser.parse_args()
     
-    file_path = sys.argv[1]
+    file_path = args.file
+    experiment_number = args.experiment
     
     FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path)
-    plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values)
+    plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, experiment_number)
 
 if __name__ == "__main__":
     main()

From d4cc5625cb18e8207c7aa6e1a42a5a88e3d57134 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 19:14:51 +0000
Subject: [PATCH 112/455] Fix train plot

---
 .../flowmldetection/plot_train_performance.py | 22 ++++++++++---------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/modules/flowmldetection/plot_train_performance.py b/modules/flowmldetection/plot_train_performance.py
index 80e13e9515..244df13d28 100644
--- a/modules/flowmldetection/plot_train_performance.py
+++ b/modules/flowmldetection/plot_train_performance.py
@@ -5,7 +5,7 @@
 import argparse
 import os
 
-def plot_log_data(file_path):
+def plot_log_data(file_path, experiment_number):
     # Read the log data from the file
     with open(file_path, 'r') as file:
         log_data = file.read()
@@ -28,7 +28,8 @@ def plot_log_data(file_path):
 
     # Get the directory of the log file to store the plot in the same folder
     dir_name = os.path.dirname(file_path)
-    plot_file = os.path.join(dir_name, 'performance_metrics_training.png')
+    # Append experiment number to the filename
+    plot_file = os.path.join(dir_name, f'performance_metrics_training_{experiment_number}.png')
 
     # Plotting the values
     fig, ax1 = plt.subplots(figsize=(10, 6))
@@ -55,18 +56,18 @@ def plot_log_data(file_path):
     for i, value in enumerate(df["Total labels"]):
         ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom')
 
-    # Adding title and legend
-    plt.title('Training performance')
+    # Adding title and legend with experiment number in title
+    plt.title(f'Training performance - Experiment {experiment_number}')
     fig.tight_layout()
 
     # Move both legends further to the right
-    ax1.legend(loc='upper right', bbox_to_anchor=(1.26, 1), fontsize='small', ncol=1)
-    ax2.legend(loc='upper right', bbox_to_anchor=(1.4, 0.95), fontsize='small', ncol=1)
+    ax1.legend(loc='upper right', bbox_to_anchor=(1.3, 1), fontsize='small', ncol=1)
+    ax2.legend(loc='upper right', bbox_to_anchor=(1.3, 0.85), fontsize='small', ncol=1)
 
     # Increase right margin for better readability of legend
-    plt.subplots_adjust(right=0.7)
+    plt.subplots_adjust(right=0.75)
 
-    # Save plot to the same folder as the log file
+    # Save plot to the same folder as the log file with experiment number in filename
     plt.savefig(plot_file)
 
     # Display the plot
@@ -75,13 +76,14 @@ def plot_log_data(file_path):
 def main():
     # Parse command-line arguments
     parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.")
-    parser.add_argument('log_file', metavar='log_file', type=str, help="Path to the log file")
+    parser.add_argument('-f', '--file', metavar='log_file', type=str, required=True, help="Path to the log file")
+    parser.add_argument('-e', '--experiment', metavar='experiment_number', type=str, required=True, help="Experiment number to add to the filename")
     
     # Handle -h / --help
     args = parser.parse_args()
 
     # Call the function to process the log file
-    plot_log_data(args.log_file)
+    plot_log_data(args.file, args.experiment)
 
 if __name__ == "__main__":
     main()

From d3b0190e39beb89cffaf8ad51a2cec0d787f7920 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 21:14:48 +0000
Subject: [PATCH 113/455] Fix plots

---
 .../flowmldetection/plot_train_performance.py | 122 ++++++++++--------
 1 file changed, 71 insertions(+), 51 deletions(-)

diff --git a/modules/flowmldetection/plot_train_performance.py b/modules/flowmldetection/plot_train_performance.py
index 244df13d28..5212dfeeaf 100644
--- a/modules/flowmldetection/plot_train_performance.py
+++ b/modules/flowmldetection/plot_train_performance.py
@@ -4,85 +4,105 @@
 import sys
 import argparse
 import os
+import matplotlib.ticker as ticker
 
 def plot_log_data(file_path, experiment_number):
     # Read the log data from the file
     with open(file_path, 'r') as file:
         log_data = file.read()
 
-    # Define regex pattern to extract relevant data from each line
-    pattern = r"Background: (\d+). Benign: (\d+). Malicious: (\d+). Total labels: (\d+\.\d+). Score: (\d+\.\d+)"
+    # Regex pattern for the new log format
+    pattern = (
+        r"Total labels: ([\d\.]+), Background: (\d+). Benign: (\d+). Malicious: (\d+). Metrics: "
+        r"FPR=([\d\.]+), TNR=([\d\.]+), TPR=([\d\.]+), FNR=([\d\.]+), "
+        r"F1=([\d\.]+), Precision=([\d\.]+), Accuracy=([\d\.]+), MCC=([\d\.]+), Recall=([\d\.]+)\."
+    )
 
     # Parse the log file
     data = re.findall(pattern, log_data)
 
     # Convert data to a DataFrame
-    df = pd.DataFrame(data, columns=["Background", "Benign", "Malicious", "Total labels", "Score"])
+    columns = [
+        "Total labels", "Background", "Benign", "Malicious",
+        "FPR", "TNR", "TPR", "FNR", "F1", "Precision", "Accuracy", "MCC", "Recall"
+    ]
+    df = pd.DataFrame(data, columns=columns)
     df = df.astype({
+        "Total labels": float,
         "Background": int,
         "Benign": int,
         "Malicious": int,
-        "Total labels": float,
-        "Score": float
+        "FPR": float,
+        "TNR": float,
+        "TPR": float,
+        "FNR": float,
+        "F1": float,
+        "Precision": float,
+        "Accuracy": float,
+        "MCC": float,
+        "Recall": float,
     })
 
-    # Get the directory of the log file to store the plot in the same folder
     dir_name = os.path.dirname(file_path)
-    # Append experiment number to the filename
-    plot_file = os.path.join(dir_name, f'performance_metrics_training_{experiment_number}.png')
-
-    # Plotting the values
-    fig, ax1 = plt.subplots(figsize=(10, 6))
 
-    # Plotting Score on the left y-axis (with proper scaling from 0 to 1)
-    ax1.plot(df.index, df["Score"], label="Score", color='tab:blue')
+    # --- Plot 1: Number of labels (linear scale, no total labels) ---
+    fig1, ax1 = plt.subplots(figsize=(10, 6))
+    ax1.plot(df.index, df["Background"], label="Background", color='black')
+    ax1.plot(df.index, df["Benign"], label="Benign", color='cyan')
+    ax1.plot(df.index, df["Malicious"], label="Malicious", color='magenta')
     ax1.set_xlabel('Index')
-    ax1.set_ylabel('Score', color='tab:blue')
-    ax1.set_ylim(0, 1)  # Set y-axis for Score from 0 to 1
-    ax1.tick_params(axis='y', labelcolor='tab:blue')
-
-    # Create the second y-axis for the Background, Benign, Malicious
-    ax2 = ax1.twinx()
-    ax2.plot(df.index, df["Background"], label="Background Labels", color='tab:green', linestyle='--')
-    ax2.plot(df.index, df["Benign"], label="Benign Labels", color='tab:orange', linestyle='--')
-    ax2.plot(df.index, df["Malicious"], label="Malicious Labels", color='tab:pink', linestyle='--')
-    ax2.set_ylabel('Background, Benign, Malicious Labels', color='tab:red')
-    
-    # Set appropriate scale for right y-axis based on the data
-    ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max())
-    ax2.tick_params(axis='y', labelcolor='tab:red')
-
-    # Annotating Total labels as text on the plot
-    for i, value in enumerate(df["Total labels"]):
-        ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom')
-
-    # Adding title and legend with experiment number in title
-    plt.title(f'Training performance - Experiment {experiment_number}')
-    fig.tight_layout()
+    ax1.set_ylabel('Label Counts')
+    # No log scale here
+    ax1.set_title(f'Label Counts - Experiment {experiment_number}')
+    ax1.legend()
+    ax1.yaxis.set_major_locator(ticker.MaxNLocator(70))
+    plt.tight_layout()
+    plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_labels.png'))
+
+    # --- Plot 2: FNR and FPR (log scale) ---
+    fig2, ax2 = plt.subplots(figsize=(10, 6))
+    ax2.plot(df.index, df["FNR"], label="FNR", color='red')
+    ax2.plot(df.index, df["FPR"], label="FPR", color='blue')
+    ax2.set_xlabel('Index')
+    ax2.set_ylabel('Rate')
+    ax2.set_yscale('log')
+    ax2.set_title(f'FNR and FPR - Experiment {experiment_number}')
+    ax2.legend()
+    ax2.yaxis.set_major_locator(ticker.MaxNLocator(100))
+    plt.tight_layout()
+    plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_fnr_fpr.png'))
+
+    # --- Plot 3: Other metrics (log scale) ---
+    fig3, ax3 = plt.subplots(figsize=(12, 7))
+    metrics_rest = ["TNR", "TPR", "F1", "Precision", "Accuracy", "MCC", "Recall"]
+    colors_rest = [
+        'tab:blue', 'tab:green', 'tab:purple', 'tab:brown',
+        'tab:gray', 'tab:pink', 'tab:olive'
+    ]
+    for metric, color in zip(metrics_rest, colors_rest):
+        ax3.plot(df.index, df[metric], label=metric, color=color)
+    ax3.set_xlabel('Index')
+    ax3.set_ylabel('Metric Value')
+    ax3.set_yscale('log')
+    ax3.set_title(f'Performance Metrics (except FNR/FPR) - Experiment {experiment_number}')
+    ax3.legend()
+    ax3.yaxis.set_major_locator(ticker.MaxNLocator(50))
+    plt.tight_layout()
+    plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_other_metrics.png'))
 
-    # Move both legends further to the right
-    ax1.legend(loc='upper right', bbox_to_anchor=(1.3, 1), fontsize='small', ncol=1)
-    ax2.legend(loc='upper right', bbox_to_anchor=(1.3, 0.85), fontsize='small', ncol=1)
-
-    # Increase right margin for better readability of legend
-    plt.subplots_adjust(right=0.75)
-
-    # Save plot to the same folder as the log file with experiment number in filename
-    plt.savefig(plot_file)
-
-    # Display the plot
     plt.show()
 
+    # --- Print final values in terminal ---
+    print("\nFinal values at last training step:")
+    for col in ["Total labels", "Background", "Benign", "Malicious",
+                "FPR", "TNR", "TPR", "FNR", "F1", "Precision", "Accuracy", "MCC", "Recall"]:
+        print(f"{col}: {df[col].iloc[-1]}")
+
 def main():
-    # Parse command-line arguments
     parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.")
     parser.add_argument('-f', '--file', metavar='log_file', type=str, required=True, help="Path to the log file")
     parser.add_argument('-e', '--experiment', metavar='experiment_number', type=str, required=True, help="Experiment number to add to the filename")
-    
-    # Handle -h / --help
     args = parser.parse_args()
-
-    # Call the function to process the log file
     plot_log_data(args.file, args.experiment)
 
 if __name__ == "__main__":

From aa8331fa4417cf3912a623528607f7480edcb796 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 21:16:01 +0000
Subject: [PATCH 114/455] Add performance metrics to the training evaluation

---
 modules/flowmldetection/flowmldetection.py | 58 +++++++++++++++++-----
 1 file changed, 46 insertions(+), 12 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index b17a1baaf0..2c60cd4034 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -10,7 +10,16 @@
 import json
 import traceback
 import warnings
-import os
+from sklearn.metrics import classification_report, confusion_matrix
+from sklearn.metrics import (
+    confusion_matrix,
+    f1_score,
+    precision_score,
+    accuracy_score,
+    matthews_corrcoef,
+    recall_score,
+)
+
 
 from slips_files.common.parsers.config_parser import ConfigParser
 from slips_files.common.slips_utils import utils
@@ -86,21 +95,21 @@ def write_to_training_log(self, message: str):
         except Exception as e:
             self.print(f"Error writing to training log: {e}", 0, 1)
 
-    def train(self, sum_labeled_flows):
+    def train(self, sum_labeled_flows, last_number_of_flows_when_trained):
         """
         Train a model based on the flows we receive and the labels
         """
         try:
+            # Create y_flow with the label
+            y_flow = numpy.full(self.flows.shape[0], self.flows.ground_truth_label)
             # Create X_flow with the current flows minus the label
             X_flow = self.flows.drop("ground_truth_label", axis=1)
             # Drop the detailed labels
             X_flow = X_flow.drop("detailed_ground_truth_label", axis=1)
             # Drop the module_labels
             X_flow = X_flow.drop("module_labels", axis=1)
-            # Create y_flow with the label
-            y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label)
 
-            # Normalize this batch of data so far. This can get progressivle slow
+            # Normalize this batch of data so far. This can get progressively slow
             X_flow = self.scaler.fit_transform(X_flow)
 
             # Count the number of labels of each type in this epoc
@@ -120,18 +129,43 @@ def train(self, sum_labeled_flows):
                 self.print("Error while calling clf.train()")
                 self.print(traceback.format_exc(), 0, 1)
 
-            # See score so far in training
-            score = self.clf.score(X_flow, y_flow)
+            # Predict on the training data
+            y_pred = self.clf.predict(X_flow)
 
-            #self.print(f"	Training Score: {score}", 1, 0)
-            #self.print(f'    Model Parameters: {self.clf.coef_}', 1, 0)
+            # For metrics, let's focus on Malicious vs Benign (ignore Background)
+            mask = (y_flow == "Malicious") | (y_flow == "Benign")
+            y_true_bin = y_flow[mask]
+            y_pred_bin = y_pred[mask]
+
+            # Map to binary: Malicious=1, Benign=0
+            y_true_bin = numpy.where(y_true_bin == "Malicious", 1, 0)
+            y_pred_bin = numpy.where(y_pred_bin == "Malicious", 1, 0)
+
+            # Compute confusion matrix: tn, fp, fn, tp
+            tn, fp, fn, tp = confusion_matrix(y_true_bin, y_pred_bin, labels=[0,1]).ravel() if len(set(y_true_bin)) > 1 else (0,0,0,0)
+
+            # Compute metrics
+            FPR = fp / (fp + tn) if (fp + tn) > 0 else 0
+            TNR = tn / (tn + fp) if (tn + fp) > 0 else 0
+            TPR = tp / (tp + fn) if (tp + fn) > 0 else 0
+            FNR = fn / (fn + tp) if (fn + tp) > 0 else 0
+            F1 = f1_score(y_true_bin, y_pred_bin, zero_division=0)
+            PREC = precision_score(y_true_bin, y_pred_bin, zero_division=0)
+            ACCU = accuracy_score(y_true_bin, y_pred_bin)
+            MCC = matthews_corrcoef(y_true_bin, y_pred_bin) if len(set(y_true_bin)) > 1 else 0
+            RECALL = recall_score(y_true_bin, y_pred_bin, zero_division=0)
 
             # Store the models on disk
             self.store_model()
 
             # Log training information
-            self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}")
-            #self.write_to_training_log(f"Model parameters: {self.clf.coef_}")
+            self.write_to_training_log(
+                f"Total labels: {sum_labeled_flows}, "
+                f"Background: {epoch_label_counts['Background']}. "
+                f"Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. "
+                f"Metrics: FPR={FPR:.4f}, TNR={TNR:.4f}, TPR={TPR:.4f}, FNR={FNR:.4f}, "
+                f"F1={F1:.4f}, Precision={PREC:.4f}, Accuracy={ACCU:.4f}, MCC={MCC:.4f}, Recall={RECALL:.4f}."
+            )
         except Exception:
             self.print("Error in train().", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
@@ -520,7 +554,7 @@ def main(self):
                         # for pandas
                         self.process_training_flows(self.last_number_of_flows_when_trained)
                         # Train an algorithm
-                        self.train(sum_labeled_flows)
+                        self.train(sum_labeled_flows, self.last_number_of_flows_when_trained)
                         self.last_number_of_flows_when_trained = sum_labeled_flows
 
             elif self.mode == "test":

From bbd6e0a0e40db29a29481ac4839b4efa42252b34 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sun, 4 May 2025 12:50:46 +0000
Subject: [PATCH 115/455] Fix experiment names

---
 modules/flowmldetection/plot_train_performance.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/modules/flowmldetection/plot_train_performance.py b/modules/flowmldetection/plot_train_performance.py
index 5212dfeeaf..304f0f4ead 100644
--- a/modules/flowmldetection/plot_train_performance.py
+++ b/modules/flowmldetection/plot_train_performance.py
@@ -52,12 +52,12 @@ def plot_log_data(file_path, experiment_number):
     ax1.plot(df.index, df["Malicious"], label="Malicious", color='magenta')
     ax1.set_xlabel('Index')
     ax1.set_ylabel('Label Counts')
-    # No log scale here
     ax1.set_title(f'Label Counts - Experiment {experiment_number}')
     ax1.legend()
     ax1.yaxis.set_major_locator(ticker.MaxNLocator(70))
+    ax1.xaxis.set_major_locator(ticker.MaxNLocator(50))
     plt.tight_layout()
-    plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_labels.png'))
+    plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_labels.png'))
 
     # --- Plot 2: FNR and FPR (log scale) ---
     fig2, ax2 = plt.subplots(figsize=(10, 6))
@@ -69,8 +69,9 @@ def plot_log_data(file_path, experiment_number):
     ax2.set_title(f'FNR and FPR - Experiment {experiment_number}')
     ax2.legend()
     ax2.yaxis.set_major_locator(ticker.MaxNLocator(100))
+    ax2.xaxis.set_major_locator(ticker.MaxNLocator(50))
     plt.tight_layout()
-    plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_fnr_fpr.png'))
+    plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_fnr_fpr.png'))
 
     # --- Plot 3: Other metrics (log scale) ---
     fig3, ax3 = plt.subplots(figsize=(12, 7))
@@ -87,8 +88,9 @@ def plot_log_data(file_path, experiment_number):
     ax3.set_title(f'Performance Metrics (except FNR/FPR) - Experiment {experiment_number}')
     ax3.legend()
     ax3.yaxis.set_major_locator(ticker.MaxNLocator(50))
+    ax3.xaxis.set_major_locator(ticker.MaxNLocator(50))
     plt.tight_layout()
-    plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_other_metrics.png'))
+    plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_other_metrics.png'))
 
     plt.show()
 

From 416bc48fd70e9f92b8a4cf4a192ae9f05a2ce4fc Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 116/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 154 ++++++++++++++++++++-
 1 file changed, 150 insertions(+), 4 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index e44ac83f4d..16b67e9038 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -10,6 +10,7 @@
 import json
 import traceback
 import warnings
+import sys
 
 from slips_files.common.parsers.config_parser import ConfigParser
 from slips_files.common.slips_utils import utils
@@ -120,6 +121,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """
@@ -132,7 +268,7 @@ def process_features(self, dataset):
             for proto in to_discard:
                 dataset = dataset[dataset.proto != proto]
 
-            # For now, discard the ports
+            # For now, discard these
             to_drop = [
                 "appproto",
                 "daddr",
@@ -155,15 +291,25 @@ def process_features(self, dataset):
                 except (ValueError, KeyError):
                     pass
 
+            # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
+            # So transform here
+            #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
+            dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
+
+            #dataset.state = new_state_column
+
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(
-                r"(^.*NotEstablished.*$)", "0", regex=True
+                r"(^.*Not Established.*$)", "0", regex=True
             )
             dataset.state = dataset.state.str.replace(
                 r"(^.*Established.*$)", "1", regex=True
             )
-            # Convert proto to categorical. For now we only have few states,
-            # so we can hardcode...
+
+            # Convert categories to floats
+            dataset.state = dataset.state.astype("float64")
+
+            # Convert proto to categorical. For now we only have few states, so we can hardcode...
             # We dont use the data to create categories because in testing mode
             # we dont see all the protocols
             # Also we dont store the Categorizer because the user can retrain

From 82ff65455c8cea8514ef0285aaf98846a34eb8e8 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:35:21 +0300
Subject: [PATCH 117/455] state_handler: split get_final_state_from_flags()
 into smaller functions

---
 slips_files/common/state_handler.py | 179 ++++++++++++++++++++++++++++
 1 file changed, 179 insertions(+)
 create mode 100644 slips_files/common/state_handler.py

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
new file mode 100644
index 0000000000..b671a09a28
--- /dev/null
+++ b/slips_files/common/state_handler.py
@@ -0,0 +1,179 @@
+from typing import Optional
+import sys
+import traceback
+
+
+def check_suricata_states(state) -> Optional[str]:
+    """
+    There are different states in which a flow can be.
+    Suricata distinguishes three flow-states for TCP and two for
+     UDP. For TCP,
+    these are: New, Established and Closed,for UDP only new and
+    established.
+    For each of these states Suricata can employ different timeouts.
+    """
+    if "new" in state or "established" in state:
+        return "Established"
+    elif "closed" in state:
+        return "Not Established"
+
+
+def check_zeek_states(state) -> Optional[str]:
+    # We have varius type of states depending on the type of flow.
+    # For Zeek
+    if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+        return "Not Established"
+    elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+        return "Established"
+
+
+def check_argus_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    suf = state.split("_")[1]
+    if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+        """
+        Examples:
+        SA_SA
+        SR_SA
+        FSRA_SA
+        SPA_SPA
+        SRA_SPA
+        FSA_FSA
+        FSA_FSPA
+        SAEC_SPA
+        SRPA_SPA
+        FSPA_SPA
+        FSRPA_SPA
+        FSPA_FSPA
+        FSRA_FSPA
+        SRAEC_SPA
+        FSPA_FSRPA
+        FSAEC_FSPA
+        FSRPA_FSPA
+        SRPAEC_SPA
+        FSPAEC_FSPA
+        SRPAEC_FSRPA
+        """
+        return "Established"
+    elif "PA" in pre and "PA" in suf:
+        # Tipical flow that was reported in the middle
+        """
+        Examples:
+        PA_PA
+        FPA_FPA
+        """
+        return "Established"
+    elif "ECO" in pre:
+        return "ICMP Echo"
+    elif "ECR" in pre:
+        return "ICMP Reply"
+    elif "URH" in pre:
+        return "ICMP Host Unreachable"
+    elif "URP" in pre:
+        return "ICMP Port Unreachable"
+    else:
+        """
+        Examples:
+        S_RA
+        S_R
+        A_R
+        S_SA
+        SR_SA
+        FA_FA
+        SR_RA
+        SEC_RA
+        """
+        return "Not Established"
+
+
+def check_tcp_states(state, pkts) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "EST" in pre:
+        # TCP
+        return "Established"
+    elif "RST" in pre:
+        # TCP. When -z B is not used in argus, states are single words.
+        # Most connections are reseted when finished and therefore are
+        # established
+        # It can happen that is reseted being not established, but we
+        # can't tell without -z b.
+        # So we use as heuristic the amount of packets. If <=3, then is
+        # not established because the OS retries 3 times.
+        return "Not Established" if int(pkts) <= 3 else "Established"
+    elif "FIN" in pre:
+        # TCP. When -z B is not used in argus, states are single words.
+        # Most connections are finished with FIN when finished and
+        # therefore are established
+        # It can happen that is finished being not established, but we
+        # can't tell without -z b.
+        # So we use as heuristic the amount of packets. If <=3, then is
+        # not established because the OS retries 3 times.
+        return "Not Established" if int(pkts) <= 3 else "Established"
+    else:
+        """
+        Examples:
+        S_
+        FA_
+        PA_
+        FSA_
+        SEC_
+        SRPA_
+        """
+        return "Not Established"
+
+
+def check_udp_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "CON" in pre:
+        # UDP
+        return "Established"
+    elif "INT" in pre:
+        # UDP trying to connect, NOT preciselly not established but also
+        # NOT 'Established'. So we considered not established because there
+        # is no confirmation of what happened.
+        return "Not Established"
+
+
+def check_icmp_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "ECO" in pre:
+        # ICMP
+        return "Established"
+    elif "UNK" in pre:
+        # ICMP6 unknown upper layer
+        return "Established"
+
+
+def get_final_state_from_flags(self, state, pkts) -> str:
+    """
+    Analyze the flags given and return a summary of the state.
+    Should work with Argus and Bro flags
+    We receive the pakets to distinguish some Reset connections
+    """
+    try:
+        if state := check_suricata_states(state):
+            return state
+        if state := check_zeek_states(state):
+            return state
+        if state := check_argus_states(state):
+            return state
+    except IndexError:
+        # suf does not exist, which means that this is some ICMP or
+        # no response was sent for UDP or TCP
+        if state := check_icmp_states(state):
+            return state
+        if state := check_udp_states(state):
+            return state
+        if state := check_tcp_states(state, pkts):
+            return state
+
+        return "Not Established"
+
+    except Exception:
+        exception_line = sys.exc_info()[2].tb_lineno
+        self.print(
+            f"Error in get_final_state_from_flags() " f"line {exception_line}",
+            0,
+            1,
+        )
+        self.print(traceback.format_exc(), 0, 1)

From 3e4bf3fbb9df71feb63e125ddae50e54b6a375f1 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:48:16 +0300
Subject: [PATCH 118/455] state_handler: refactor get_final_state_from_flags()

---
 slips_files/common/state_handler.py | 67 +++++++++++++----------------
 1 file changed, 29 insertions(+), 38 deletions(-)

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
index b671a09a28..d0a05115bd 100644
--- a/slips_files/common/state_handler.py
+++ b/slips_files/common/state_handler.py
@@ -1,9 +1,7 @@
 from typing import Optional
-import sys
-import traceback
 
 
-def check_suricata_states(state) -> Optional[str]:
+def interpret_suricata_states(state) -> Optional[str]:
     """
     There are different states in which a flow can be.
     Suricata distinguishes three flow-states for TCP and two for
@@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_zeek_states(state) -> Optional[str]:
+def interpret_zeek_states(state) -> Optional[str]:
     # We have varius type of states depending on the type of flow.
     # For Zeek
     if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
@@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]:
         return "Established"
 
 
-def check_argus_states(state) -> Optional[str]:
+def interpret_argus_states(state) -> Optional[str]:
     pre = state.split("_")[0]
-    suf = state.split("_")[1]
+    try:
+        suf = state.split("_")[1]
+    except IndexError:
+        return
+
     if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
         """
         Examples:
@@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_tcp_states(state, pkts) -> Optional[str]:
+def interpret_tcp_states(state, pkts) -> Optional[str]:
     pre = state.split("_")[0]
     if "EST" in pre:
         # TCP
@@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]:
         return "Not Established"
 
 
-def check_udp_states(state) -> Optional[str]:
+def interpret_udp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "CON" in pre:
         # UDP
@@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_icmp_states(state) -> Optional[str]:
+def interpret_icmp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "ECO" in pre:
         # ICMP
@@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]:
         return "Established"
 
 
-def get_final_state_from_flags(self, state, pkts) -> str:
+def get_final_state_from_flags(state, pkts) -> str:
     """
-    Analyze the flags given and return a summary of the state.
-    Should work with Argus and Bro flags
-    We receive the pakets to distinguish some Reset connections
+    Converts the original flags from the flow, to a state that slips
+    understands
+    Works with Argus, suricata, and Bro flags
+    We receive the packets to distinguish some Reset connections
     """
-    try:
-        if state := check_suricata_states(state):
-            return state
-        if state := check_zeek_states(state):
-            return state
-        if state := check_argus_states(state):
-            return state
-    except IndexError:
-        # suf does not exist, which means that this is some ICMP or
-        # no response was sent for UDP or TCP
-        if state := check_icmp_states(state):
-            return state
-        if state := check_udp_states(state):
-            return state
-        if state := check_tcp_states(state, pkts):
-            return state
 
-        return "Not Established"
+    for interpreter in (
+        interpret_suricata_states,
+        interpret_zeek_states,
+        interpret_argus_states,
+        interpret_icmp_states,
+        interpret_udp_states,
+    ):
+        if interpreted_state := interpreter(state):
+            return interpreted_state
+
+    if interpreted_state := interpret_tcp_states(state, pkts):
+        return interpreted_state
 
-    except Exception:
-        exception_line = sys.exc_info()[2].tb_lineno
-        self.print(
-            f"Error in get_final_state_from_flags() " f"line {exception_line}",
-            0,
-            1,
-        )
-        self.print(traceback.format_exc(), 0, 1)
+    return "Not Established"

From 45710b72db50551053c09ed71059fa5d1bfcf712 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 119/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py    | 150 ++----------------
 slips_files/core/database/database_manager.py |   3 -
 2 files changed, 10 insertions(+), 143 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 16b67e9038..3379f5077f 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """
@@ -291,12 +156,17 @@ def process_features(self, dataset):
                 except (ValueError, KeyError):
                     pass
 
-            # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
+            # When flows are read from Slips sqlite,
+            # the state is not transformed to 'Established' or
+            # 'Not Established', it is still 'S0' and others
             # So transform here
-            #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
-            dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
-
-            #dataset.state = new_state_column
+            dataset["state"] = dataset.apply(
+                lambda row: get_final_state_from_flags(
+                    row["state"], row["pkts"]
+                ),
+                axis=1,
+            )
+            # dataset.state = new_state_column
 
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(
diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py
index e8ca3aaf62..b4b2128d3d 100644
--- a/slips_files/core/database/database_manager.py
+++ b/slips_files/core/database/database_manager.py
@@ -613,9 +613,6 @@ def add_out_dns(self, *args, **kwargs):
     def add_port(self, *args, **kwargs):
         return self.rdb.add_port(*args, **kwargs)
 
-    def get_final_state_from_flags(self, *args, **kwargs):
-        return self.rdb.get_final_state_from_flags(*args, **kwargs)
-
     def add_ips(self, *args, **kwargs):
         return self.rdb.add_ips(*args, **kwargs)
 

From 014ee473003b36f3680b7f40aa60dd9c7d4ae759 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 120/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 3379f5077f..f052931c89 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,6 +121,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From d5f6330c3e6bdb0a8f81d0f1349f927bfda8636e Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 121/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 135 ---------------------
 1 file changed, 135 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index f052931c89..3379f5077f 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 2ad9ccb25fcd46b9da91c72f3400de5ae3ec364e Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 122/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 3379f5077f..f052931c89 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,6 +121,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 88cb54b68e0da3088fc76cc8b351c5653aa93857 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 123/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 135 ---------------------
 1 file changed, 135 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index f052931c89..3379f5077f 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 13dfd28bb8915c4c61798efa0051daaa7ee9daa9 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 124/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 3379f5077f..f052931c89 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,6 +121,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From d2a5935f05c2aee6c5cb4ae7285151258f836e13 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 125/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 135 ---------------------
 1 file changed, 135 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index f052931c89..3379f5077f 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 60f0b286cb0b1172b01416bd66a45a117fa55577 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 126/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 3379f5077f..f052931c89 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,6 +121,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 6a05fa3efdb4254380541b7b7c32c4c02f829cf7 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 127/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 135 ---------------------
 1 file changed, 135 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index f052931c89..3379f5077f 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 77c9a10ce27447cf87d9d4720132554bc1cb9f5c Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 128/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 3379f5077f..f052931c89 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,6 +121,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 6e4841ffcffad999d14dcbc1354dbccc8f2cc546 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 129/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 135 ---------------------
 1 file changed, 135 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index f052931c89..3379f5077f 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 0d8414e9937ce34aac0b6fbb7fe328c7d207ead6 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Sat, 15 Mar 2025 19:23:29 +0100
Subject: [PATCH 130/455] Re add function that alya added

---
 slips_files/core/database/database_manager.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py
index b4b2128d3d..e8ca3aaf62 100644
--- a/slips_files/core/database/database_manager.py
+++ b/slips_files/core/database/database_manager.py
@@ -613,6 +613,9 @@ def add_out_dns(self, *args, **kwargs):
     def add_port(self, *args, **kwargs):
         return self.rdb.add_port(*args, **kwargs)
 
+    def get_final_state_from_flags(self, *args, **kwargs):
+        return self.rdb.get_final_state_from_flags(*args, **kwargs)
+
     def add_ips(self, *args, **kwargs):
         return self.rdb.add_ips(*args, **kwargs)
 

From 97f86a5709fad2144a97f643b8381cd48f86b148 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Sat, 15 Mar 2025 19:25:03 +0100
Subject: [PATCH 131/455] delete sys

---
 modules/flowmldetection/flowmldetection.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 3379f5077f..c06755a599 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -10,7 +10,6 @@
 import json
 import traceback
 import warnings
-import sys
 
 from slips_files.common.parsers.config_parser import ConfigParser
 from slips_files.common.slips_utils import utils

From c468366fca89d0fe5d6d00ec8c660f62ed616b46 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Sat, 15 Mar 2025 19:27:23 +0100
Subject: [PATCH 132/455] Delete file that was deleted from develop

---
 slips_files/common/state_handler.py | 170 ----------------------------
 1 file changed, 170 deletions(-)
 delete mode 100644 slips_files/common/state_handler.py

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
deleted file mode 100644
index d0a05115bd..0000000000
--- a/slips_files/common/state_handler.py
+++ /dev/null
@@ -1,170 +0,0 @@
-from typing import Optional
-
-
-def interpret_suricata_states(state) -> Optional[str]:
-    """
-    There are different states in which a flow can be.
-    Suricata distinguishes three flow-states for TCP and two for
-     UDP. For TCP,
-    these are: New, Established and Closed,for UDP only new and
-    established.
-    For each of these states Suricata can employ different timeouts.
-    """
-    if "new" in state or "established" in state:
-        return "Established"
-    elif "closed" in state:
-        return "Not Established"
-
-
-def interpret_zeek_states(state) -> Optional[str]:
-    # We have varius type of states depending on the type of flow.
-    # For Zeek
-    if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-        return "Not Established"
-    elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-        return "Established"
-
-
-def interpret_argus_states(state) -> Optional[str]:
-    pre = state.split("_")[0]
-    try:
-        suf = state.split("_")[1]
-    except IndexError:
-        return
-
-    if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-        """
-        Examples:
-        SA_SA
-        SR_SA
-        FSRA_SA
-        SPA_SPA
-        SRA_SPA
-        FSA_FSA
-        FSA_FSPA
-        SAEC_SPA
-        SRPA_SPA
-        FSPA_SPA
-        FSRPA_SPA
-        FSPA_FSPA
-        FSRA_FSPA
-        SRAEC_SPA
-        FSPA_FSRPA
-        FSAEC_FSPA
-        FSRPA_FSPA
-        SRPAEC_SPA
-        FSPAEC_FSPA
-        SRPAEC_FSRPA
-        """
-        return "Established"
-    elif "PA" in pre and "PA" in suf:
-        # Tipical flow that was reported in the middle
-        """
-        Examples:
-        PA_PA
-        FPA_FPA
-        """
-        return "Established"
-    elif "ECO" in pre:
-        return "ICMP Echo"
-    elif "ECR" in pre:
-        return "ICMP Reply"
-    elif "URH" in pre:
-        return "ICMP Host Unreachable"
-    elif "URP" in pre:
-        return "ICMP Port Unreachable"
-    else:
-        """
-        Examples:
-        S_RA
-        S_R
-        A_R
-        S_SA
-        SR_SA
-        FA_FA
-        SR_RA
-        SEC_RA
-        """
-        return "Not Established"
-
-
-def interpret_tcp_states(state, pkts) -> Optional[str]:
-    pre = state.split("_")[0]
-    if "EST" in pre:
-        # TCP
-        return "Established"
-    elif "RST" in pre:
-        # TCP. When -z B is not used in argus, states are single words.
-        # Most connections are reseted when finished and therefore are
-        # established
-        # It can happen that is reseted being not established, but we
-        # can't tell without -z b.
-        # So we use as heuristic the amount of packets. If <=3, then is
-        # not established because the OS retries 3 times.
-        return "Not Established" if int(pkts) <= 3 else "Established"
-    elif "FIN" in pre:
-        # TCP. When -z B is not used in argus, states are single words.
-        # Most connections are finished with FIN when finished and
-        # therefore are established
-        # It can happen that is finished being not established, but we
-        # can't tell without -z b.
-        # So we use as heuristic the amount of packets. If <=3, then is
-        # not established because the OS retries 3 times.
-        return "Not Established" if int(pkts) <= 3 else "Established"
-    else:
-        """
-        Examples:
-        S_
-        FA_
-        PA_
-        FSA_
-        SEC_
-        SRPA_
-        """
-        return "Not Established"
-
-
-def interpret_udp_states(state) -> Optional[str]:
-    pre = state.split("_")[0]
-    if "CON" in pre:
-        # UDP
-        return "Established"
-    elif "INT" in pre:
-        # UDP trying to connect, NOT preciselly not established but also
-        # NOT 'Established'. So we considered not established because there
-        # is no confirmation of what happened.
-        return "Not Established"
-
-
-def interpret_icmp_states(state) -> Optional[str]:
-    pre = state.split("_")[0]
-    if "ECO" in pre:
-        # ICMP
-        return "Established"
-    elif "UNK" in pre:
-        # ICMP6 unknown upper layer
-        return "Established"
-
-
-def get_final_state_from_flags(state, pkts) -> str:
-    """
-    Converts the original flags from the flow, to a state that slips
-    understands
-    Works with Argus, suricata, and Bro flags
-    We receive the packets to distinguish some Reset connections
-    """
-
-    for interpreter in (
-        interpret_suricata_states,
-        interpret_zeek_states,
-        interpret_argus_states,
-        interpret_icmp_states,
-        interpret_udp_states,
-    ):
-        if interpreted_state := interpreter(state):
-            return interpreted_state
-
-    if interpreted_state := interpret_tcp_states(state, pkts):
-        return interpreted_state
-
-    return "Not Established"

From 952c3b29c7c6637bd78b66c1c2fc9a333f72a5d0 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Sat, 15 Mar 2025 19:32:01 +0100
Subject: [PATCH 133/455] Flowmldetection. Fix missing db reference

---
 modules/flowmldetection/flowmldetection.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index c06755a599..87e07c7592 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -160,7 +160,7 @@ def process_features(self, dataset):
             # 'Not Established', it is still 'S0' and others
             # So transform here
             dataset["state"] = dataset.apply(
-                lambda row: get_final_state_from_flags(
+                lambda row: self.db.get_final_state_from_flags(
                     row["state"], row["pkts"]
                 ),
                 axis=1,

From e7af6dc61e06a5759e5c5ddfcfe3ffadcdf67fb6 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Tue, 18 Mar 2025 12:08:08 +0100
Subject: [PATCH 134/455] Fix the training of flows with ML in new version

---
 modules/flowmldetection/flowmldetection.py | 144 +++++++++++----------
 1 file changed, 77 insertions(+), 67 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 87e07c7592..e91495d649 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -55,8 +55,12 @@ def init(self):
         # Set the output queue of our database instance
         # Read the configuration
         self.read_configuration()
-        # Minum amount of new lables needed to trigger the train
-        self.minimum_lables_to_retrain = 50
+        # Minum amount of new labels needed to start the train
+        self.minimum_labels_to_start_train = 50
+        # Minum amount of new labels needed to retrain
+        self.minimum_labels_to_retrain = 50
+        # The number of flows when last trained
+        self.last_number_of_flows_when_trained = 0
         # To plot the scores of training
         # self.scores = []
         # The scaler trained during training and to use during testing
@@ -67,26 +71,25 @@ def init(self):
     def read_configuration(self):
         conf = ConfigParser()
         self.mode = conf.get_ml_mode()
+        self.label = conf.label()
 
     def train(self):
         """
         Train a model based on the flows we receive and the labels
         """
         try:
-            # Process the labels to have only Normal and Malware
-            self.flows.label = self.flows.label.str.replace(
-                r"(^.*ormal.*$)", "Normal", regex=True
-            )
-            self.flows.label = self.flows.label.str.replace(
-                r"(^.*alware.*$)", "Malware", regex=True
-            )
-            self.flows.label = self.flows.label.str.replace(
-                r"(^.*alicious.*$)", "Malware", regex=True
-            )
+            # Get the flows from the DB
+            # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid)
+            # Convert to pandas df
+            # self.flows = pd.DataFrame(self.flows)
+            # Process the features
+            # X_flow = self.process_features(self.flows)
 
-            # Separate
-            y_flow = self.flows["label"]
+            # Create X_flow with the current flows minus the label
             X_flow = self.flows.drop("label", axis=1)
+            # Create y_flow with the label
+            y_flow = numpy.full(X_flow.shape[0], self.label)
+            # Drop the module_labels
             X_flow = X_flow.drop("module_labels", axis=1)
 
             # Normalize this batch of data so far. This can get progressivle slow
@@ -95,7 +98,7 @@ def train(self):
             # Train
             try:
                 self.clf.partial_fit(
-                    X_flow, y_flow, classes=["Malware", "Normal"]
+                    X_flow, y_flow, classes=["Malicious", "Benign"]
                 )
             except Exception:
                 self.print("Error while calling clf.train()")
@@ -118,7 +121,7 @@ def train(self):
             self.store_model()
 
         except Exception:
-            self.print("Error in train()", 0, 1)
+            self.print("Error in train().", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
@@ -144,9 +147,7 @@ def process_features(self, dataset):
                 "history",
                 "uid",
                 "dir_",
-                "dbytes",
                 "endtime",
-                "bytes",
                 "flow_source",
             ]
             for field in to_drop:
@@ -161,11 +162,10 @@ def process_features(self, dataset):
             # So transform here
             dataset["state"] = dataset.apply(
                 lambda row: self.db.get_final_state_from_flags(
-                    row["state"], row["pkts"]
+                    row["state"], (row["spkts"] + row["dpkts"])
                 ),
                 axis=1,
             )
-            # dataset.state = new_state_column
 
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(
@@ -199,7 +199,11 @@ def process_features(self, dataset):
             dataset.proto = dataset.proto.str.replace(
                 r"(^.*arp.*$)", "4", regex=True
             )
-            fields_to_convert_to_flow = [
+
+            dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"]
+            dataset["pkts"] = dataset["spkts"] + dataset["dpkts"]
+
+            fields_to_convert_to_float = [
                 dataset.proto,
                 dataset.dport,
                 dataset.sport,
@@ -210,10 +214,10 @@ def process_features(self, dataset):
                 dataset.sbytes,
                 dataset.state,
             ]
-            for field in fields_to_convert_to_flow:
+            for field in fields_to_convert_to_float:
                 try:
                     field = field.astype("float64")
-                except ValueError:
+                except (ValueError, AttributeError):
                     pass
 
             return dataset
@@ -222,9 +226,9 @@ def process_features(self, dataset):
             self.print("Error in process_features()")
             self.print(traceback.format_exc(), 0, 1)
 
-    def process_flows(self):
+    def process_training_flows(self):
         """
-        Process all the flwos in the DB
+        Process all the flows in the DB
         Store the pandas df in self.flows
         """
         try:
@@ -240,44 +244,48 @@ def process_flows(self):
                 # that are fake but representative of a normal and malware flow
                 # they are only for the training process
                 # At least 1 flow of each label is required
-                # self.print(f'Amount of labeled flows: {labels}', 0, 1)
+
+                # These flows should be in the same format as the ones in the DB. 
+                # Which means the satate is still SF, S0, etc.
                 flows.append(
                     {
-                        "ts": 1594417039.029793,
+                        "starttime": 1594417039.029793,
                         "dur": "1.9424750804901123",
                         "saddr": "10.7.10.101",
                         "sport": "49733",
                         "daddr": "40.70.224.145",
                         "dport": "443",
                         "proto": "tcp",
-                        "state": "Established",
-                        "allbytes": 42764,
-                        "spkts": 37,
+                        "state": "SF",
+                        "spkts": 17,
+                        "dpkts": 27,
                         "sbytes": 25517,
+                        "dbytes": 17247,
                         "appproto": "ssl",
-                        "label": "Malware",
+                        "label": "Malicious",
                         "module_labels": {
-                            "flowalerts-long-connection": "Malware"
+                            "flowalerts-long-connection": "Malicious"
                         },
                     }
                 )
                 flows.append(
                     {
-                        "ts": 1382355032.706468,
+                        "starttime": 1382355032.706468,
                         "dur": "10.896695",
                         "saddr": "147.32.83.52",
                         "sport": "47956",
                         "daddr": "80.242.138.72",
                         "dport": "80",
                         "proto": "tcp",
-                        "state": "Established",
-                        "allbytes": 67696,
+                        "state": "SF",
                         "spkts": 1,
+                        "dpkts": 0,
                         "sbytes": 100,
+                        "dbytes": 67596,
                         "appproto": "http",
-                        "label": "Normal",
+                        "label": "Benign",
                         "module_labels": {
-                            "flowalerts-long-connection": "Normal"
+                            "flowalerts-long-connection": "Benign"
                         },
                     }
                 )
@@ -318,7 +326,6 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
         and returns the predection array
         """
         try:
-            given_x_flow = x_flow
             # clean the flow
             fields_to_drop = [
                 "label",
@@ -326,10 +333,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
                 "uid",
                 "history",
                 "dir_",
-                "dbytes",
-                "dpkts",
                 "endtime",
-                "bytes",
                 "flow_source",
                 "ground_truth_label",  # todo now we can use them
                 "detailed_ground_truth_label",
@@ -345,7 +349,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
             return pred
         except Exception as e:
             self.print(
-                f"Error in detect() while processing " f"\n{given_x_flow}\n{e}"
+                f"Error in detect() while processing " f"\n{x_flow}\n{e}"
             )
             self.print(traceback.format_exc(), 0, 1)
 
@@ -437,18 +441,16 @@ def pre_main(self):
 
     def main(self):
         if msg := self.get_msg("new_flow"):
+            # When a new flow arrives
             msg = json.loads(msg["data"])
-            twid = msg["twid"]
+            self.twid = msg["twid"]
+            self.profileid = msg["profileid"]
             self.flow = msg["flow"]
-            # these fields are expected in testing. update the original
+            # These following extra fields are expected in testing. update the original
             # flow dict to have them
             self.flow.update(
                 {
-                    "allbytes": (self.flow["sbytes"] + self.flow["dbytes"]),
-                    # the flow["state"] is the origstate, we dont need that here
-                    # we need the interpreted state
                     "state": msg["interpreted_state"],
-                    "pkts": self.flow["spkts"] + self.flow["dpkts"],
                     "label": msg["label"],
                     "module_labels": msg["module_labels"],
                 }
@@ -461,23 +463,31 @@ def main(self):
                 # Use labeled flows
                 labels = self.db.get_labels()
                 sum_labeled_flows = sum(i[1] for i in labels)
+
+                # The min labels to retrain is the min number of flows 
+                # we should have seen so far in this capture to start training
+                # This is so we dont _start_ training with only 1 flow
+
+                # Once we are over the start minimum, the second condition is 
+                # to force to retrain every a minimum_labels_to_retrain number
+                # of flows. So we dont retrain every 1 flow.
                 if (
-                    sum_labeled_flows >= self.minimum_lables_to_retrain
-                    and sum_labeled_flows % self.minimum_lables_to_retrain == 1
+                    sum_labeled_flows >= self.minimum_labels_to_start_train
                 ):
-                    # We get here every 'self.minimum_lables_to_retrain'
-                    # amount of labels
-                    # So for example we retrain every 100 labels and only when
-                    # we have at least 100 labels
-                    self.print(
-                        f"Training the model with the last group of "
-                        f"flows and labels. Total flows: {sum_labeled_flows}."
-                    )
-                    # Process all flows in the DB and make them ready
-                    # for pandas
-                    self.process_flows()
-                    # Train an algorithm
-                    self.train()
+                    if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain):
+                        # So for example we retrain every 50 labels and only when
+                        # we have at least 50 labels
+                        self.print(
+                            f"Training the model with the last group of "
+                            f"flows and labels. Total flows: {sum_labeled_flows}."
+                        )
+                        # Process all flows in the DB and make them ready
+                        # for pandas
+                        self.process_training_flows()
+                        # Train an algorithm
+                        self.train()
+                        self.last_number_of_flows_when_trained = sum_labeled_flows
+
             elif self.mode == "test":
                 # We are testing, which means using the model to detect
                 processed_flow = self.process_flow(self.flow)
@@ -497,8 +507,8 @@ def main(self):
                         # and the label is diff from the prediction,
                         # print in debug mode
                         self.print(
-                            f"Report Prediction {pred[0]} for label"
-                            f' {label} flow {self.flow["saddr"]}:'
+                            f"Predicted {pred[0]} for ground-truth label"
+                            f' {label}. Flow {self.flow["saddr"]}:'
                             f'{self.flow["sport"]} ->'
                             f' {self.flow["daddr"]}:'
                             f'{self.flow["dport"]}/'
@@ -506,9 +516,9 @@ def main(self):
                             0,
                             3,
                         )
-                    if pred[0] == "Malware":
+                    if pred[0] == "Malicious":
                         # Generate an alert
-                        self.set_evidence_malicious_flow(self.flow, twid)
+                        self.set_evidence_malicious_flow(self.flow, self.twid)
                         self.print(
                             f"Prediction {pred[0]} for label {label}"
                             f' flow {self.flow["saddr"]}:'

From 1dbde99abda8734e06222b1149806e1b626d2602 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Tue, 18 Mar 2025 12:08:29 +0100
Subject: [PATCH 135/455] Fix the profiler handler for cases of nan in state

---
 .../core/database/redis_db/profile_handler.py     | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/slips_files/core/database/redis_db/profile_handler.py b/slips_files/core/database/redis_db/profile_handler.py
index 0489372cdc..1ea7644648 100644
--- a/slips_files/core/database/redis_db/profile_handler.py
+++ b/slips_files/core/database/redis_db/profile_handler.py
@@ -379,7 +379,12 @@ def get_final_state_from_flags(self, state, pkts):
         We receive the pakets to distinguish some Reset connections
         """
         try:
-            pre = state.split("_")[0]
+            # In some flows the state is a nan
+            try:
+                pre = state.split("_")[0]
+            except AttributeError:
+                pre = ''
+
             try:
                 # Try suricata states
                 """
@@ -401,7 +406,11 @@ def get_final_state_from_flags(self, state, pkts):
                     return "Established"
 
                 # For Argus
-                suf = state.split("_")[1]
+            # In some flows the state is a nan
+                try:
+                    suf = state.split("_")[1]
+                except AttributeError:
+                    suf = ''
                 if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
                     """
                     Examples:
@@ -502,7 +511,7 @@ def get_final_state_from_flags(self, state, pkts):
         except Exception:
             exception_line = sys.exc_info()[2].tb_lineno
             self.print(
-                f"Error in getFinalStateFromFlags() in database.py line {exception_line}",
+                f"Error in get_final_state_from_flags() in profile_handler.py line {exception_line}",
                 0,
                 1,
             )

From 2c3a9eb2363c14e89d08bda2d8f7698c41f148a3 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 19 Mar 2025 14:20:42 +0100
Subject: [PATCH 136/455] slips.yaml. Update to have correct labels. By default
 test. Default training label is benign

---
 config/slips.yaml | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/config/slips.yaml b/config/slips.yaml
index f7089b41af..8736eaf511 100644
--- a/config/slips.yaml
+++ b/config/slips.yaml
@@ -106,13 +106,12 @@ parameters:
   deletePrevdb: true
 
   # Set the label for all the flows that are being read.
-  # For now only normal and malware directly. No option for setting labels
-  # with a filter
+  # For now only Benign and Malicious (Capitalized)
   # The purpose is to be used in the training of ML models and to output
   # flows with labels for other tools.
-  # label: malicious
-  # label: unknown
-  label: normal
+  # label: Malicious
+  # label: Benign
+  label: Benign
   # If Zeek files are rotated or not to avoid running out of disk.
   # Zeek rotation is enabled by default when using an interface,
   # which means Slips will delete all Zeek log files after 1 day

From 38bdc30b059f670dde6817a575504f7f308f9ad0 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 19 Mar 2025 14:21:21 +0100
Subject: [PATCH 137/455] First ipython notebook to test ML flow related models

---
 modules/flowmldetection/flowmlanalysis.ipynb | 76 ++++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100644 modules/flowmldetection/flowmlanalysis.ipynb

diff --git a/modules/flowmldetection/flowmlanalysis.ipynb b/modules/flowmldetection/flowmlanalysis.ipynb
new file mode 100644
index 0000000000..d726cd2805
--- /dev/null
+++ b/modules/flowmldetection/flowmlanalysis.ipynb
@@ -0,0 +1,76 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Analysis of Flows with Machine Learning for Slips"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Analysis of a fixed list of flows to try techniques and find parameters"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy\n",
+    "from sklearn.linear_model import SGDClassifier\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "import pickle\n",
+    "import pandas as pd\n",
+    "import json\n",
+    "import traceback\n",
+    "import warnings"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "slips-new",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

From 8b2e850f150389ad00d4c10d65abd7c94f5b58fb Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 19 Mar 2025 14:22:38 +0100
Subject: [PATCH 138/455] flowml. If the dataset has one flow and that is
 deleted, then return empty fast.

---
 modules/flowmldetection/flowmldetection.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index e91495d649..58b4ce1e4c 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -135,6 +135,11 @@ def process_features(self, dataset):
             for proto in to_discard:
                 dataset = dataset[dataset.proto != proto]
 
+            # If te proto is in the list to delete and there is only one flow, then the dataset will be empty
+            if dataset.empty:
+                # DataFrame is empty now, so return empty
+                return dataset
+
             # For now, discard these
             to_drop = [
                 "appproto",

From b179fac4ade82c0d1716ad13428a606e25f4fae9 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 19 Mar 2025 14:23:05 +0100
Subject: [PATCH 139/455] flowml. If the dataset is empty, return None

---
 modules/flowmldetection/flowmldetection.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 58b4ce1e4c..4a4d46e376 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -318,6 +318,8 @@ def process_flow(self, flow_to_process: dict):
             # Convert the flow to a pandas dataframe
             raw_flow = pd.DataFrame(flow_to_process, index=[0])
             dflow = self.process_features(raw_flow)
+            if dflow.empty:
+                return None
             # Update the flow to the processed version
             return dflow
         except Exception:

From dd98ff1307fc64e517d3eff4a80301e6be8dd1e3 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 19 Mar 2025 14:26:42 +0100
Subject: [PATCH 140/455] profile_handler. Small bug in how we handled the
 profiles, we were using 'in' instead of == for established. Some not
 established MAY not have been correctly captured

---
 slips_files/core/database/redis_db/profile_handler.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/slips_files/core/database/redis_db/profile_handler.py b/slips_files/core/database/redis_db/profile_handler.py
index 1ea7644648..85fdec5a63 100644
--- a/slips_files/core/database/redis_db/profile_handler.py
+++ b/slips_files/core/database/redis_db/profile_handler.py
@@ -393,9 +393,10 @@ def get_final_state_from_flags(self, state, pkts):
                 these are: New, Established and Closed,for UDP only new and established.
                 For each of these states Suricata can employ different timeouts.
                 """
-                if "new" in state or "established" in state:
+                # This is controversial, but if we dont have a good state, we consider it not established for now
+                if "new" in state or state.lower() == "established":
                     return "Established"
-                elif "closed" in state:
+                elif "closed" in state or state.lower() == 'not established':
                     return "Not Established"
 
                 # We have varius type of states depending on the type of flow.
@@ -406,7 +407,6 @@ def get_final_state_from_flags(self, state, pkts):
                     return "Established"
 
                 # For Argus
-            # In some flows the state is a nan
                 try:
                     suf = state.split("_")[1]
                 except AttributeError:

From 4de77d6fcb7a8acf5e2a1510950e28a285084344 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 19 Mar 2025 14:27:16 +0100
Subject: [PATCH 141/455] First new version of the model and scaler. Not good
 yet, but working.

---
 modules/flowmldetection/model.bin  | Bin 1073 -> 1090 bytes
 modules/flowmldetection/scaler.bin | Bin 666 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index aef4cba35b7b18287b2be11df2c45e9187d053e0..7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f 100644
GIT binary patch
delta 411
zcmdnUafpMZfn}=sMwVTSLVS7gX{m`NrA4X5@tJw?Q@pt+?`4dsSMFg=DXA<-oice!
z4^yb+l+KR!DM3>-yqS9#ZKnA7`FX2&vv_lqOiAi=_HbfgU`PexJRr^l;&dn*NWp<G
zkd+9;IY10jlMJQvfpjSl7X$GWpm9B7@u?LBiIY2+gw>QXRJ|Emr(~!@tY-9P0&;YE
zn9?i|&dD&Ce2~eg9z!XkH+S2V4C_wkh?WyPJF_Dc9WKfLlU4d@b|Cws{>t}{)DLw1
z+&}AzmCAv&TP(zH?>}O{hvWaISvjr_FKvFL2{5=iNW7SLz3Rc`{Vb{$-4P-p4qe>8
zsxB_naJaLsc#$rfl>^wdT$y<#sYS_psRbqRQ!+4J0dz3X4IB|u^h2FOBOrSC<0o%q
b_GNYGp25H{nVChIi`7uiOwVviX_6iQocV_5

delta 380
zcmX@av5|wNfn}=NMwVTSlP@wxCo1)@rj%3`q)wSUrH3ihd`f3W`;?$58s5x3j5bsJ
z{QSISyji?CN~R=rI$QWMFfb$naSjldL)k?@Iu(fhfH)tBLHt}GhN)wiJeNsPO)*2o
zo1t||h8o0hMsFq{N4tk9%>rRzhW_LyOh)w>N*TR5+ps7Gy3Ol95P%792Cxe~W+qIF
za^V+n06G8#1cs%Z;j-Fjug&$3=gY2%`}>OzXw^50I4Ib2ygQ=8?eISD+{&-(1rNNN
zW*@<QMemXQ6rg!MT$y<#sYS`D1tsxQGBDi*^nz8VGl$HwGgBKKcpSi5_$D7__GNWU
Y$oltSvM7r(H=BW;k%^wsl+q+U0L_+r^#A|>

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
index 9292bda6a6f307edcbe83de0ccfac2437950fa6f..bfba4d107224e5e6e5a1e8c8f4d463b48131d111 100644
GIT binary patch
literal 890
zcmZo*nVQec00uqM#o0NjiA8yO1x2X^Mfu68#l@L<>3Z=gi6x0sdia7%67y0Li&BD<
z6LV6Frc9oq**K+kibfA-d1gsQd~RxD-jt3W4lt*<Bn8N3NzN~*1k!?Osfi_}MXB+5
ziMgr8@tJw?Q+h=6N^=V;^^)_8QuT66b4oH3i;5B}r}Xf|7o{fW=M|R}l_r+}?dV|z
zE1c594ze4hWbzbmhSn+0j7d}4rvy#W@MiSpZk>|B)5DrlQdy7+(!%6#F{QHuBFo&v
zSTn`X&(G^W5P%79`;sY1ov}cNGpCdm0WAP}wIm-%7Z>Cgl>q4!C|z8TT~Z9@gK4(n
zq{@<1pa5G6l)+kDl30>DrH3OiCkHCT0@jlXw4sMDFCOY|h?l*&d-(I>ixYDTa)5zQ
zoSF&@h78Ugwqjs#CW3-9L$rq}6UF5=Xf9^}n(NGw#R!S7DQ!Ui_ppKj9%vGG1|KL~
zG6axA1!!U#l8G7ms38F|uhTi=d{gv8|4E7tk+E%iFU@~!UvXb5a&PSo2flmT-L8E;
z<KX(aCF;No0f(>8bB?zwe6@e~Ltx1rmTHK2M45x=S>Y_k<U{t7-@NM2t?O`b(oOJr
z^gz^M3Md#@$`XrkInNqs`NcqG+rqF4hjd*#=9SUs?C-J63^i@q?kIRi^+W#GZH^`;
z_s<+FuX0#b@Z7FWCEbCm^j(f1Uy0+U$5+2R=gD#Gag=$mqjtZ2f~w@S(sxacYyTMt
zX(&vBSPl-{RD9NRUiR6)W4EzGNqG>5fcAZRq4P``?^7;27=%1`Z_~Q$U{zGVUTcbp
z!_f@A=ox0(4vo(kWe&dF=y3Qof57>#YaOosUa>d6=$d`i{)GMC=j?EBVaVAqgWVBg
dy+AxPN5z+=78Pga=K-V3P|r-ya7t;C9st`jc!2-_

delta 525
zcmeyxHj9<Dfn{nx(?r(zNWQ%IwA93s(xTMj_{_ZcDc+nt{CV-kiMa(iK#}6qRG?Uo
zXkKY<L8V@Deo?AkZfQ<QW@1rMV&#+`w&LW(oW!Cjlc)5sf|UU2l#<GVRG`S@UPd*o
zK3xVdn9?>S2q>4Eng}#5gR6%pz9=<0Kd-o?s5H3*s5pan;tH91&ekaz{5?!*7E?Mq
z+JPE0yqPnE{rvpA{sRG+@Mb8PlGN$!ajPlrTH5m<hihBT2gkmB>hSAF{H=RwvmLf>
z0f8s>T))|-e^~LtKJ)mxuX}VF9GJI$RyS%Ec8u45-?98hw!_K^*XJ>X{BW28G_HrG
zEU^gaDyjMm8E*!l6XYOvGkSyEqXM*T;RVCs$??LDiHthAxexX@M(tU$UF%Yf<K@j$
z%OkDN+J7s4Zlk~8nEkR&OOX||2OO>Q^>gH&y>a4SnPazubB5!cqc1YnCGT<q+XfD^
zRFHk>))m9-o7^A!Lv41W!_hYO<*VziI$XJV#W|*OtHau9f5TQzy=Sjs!`l%1;+FkJ
z-k|Geeq3|-Iw{uqQ%0WS6}i^>);WhA{C@WaM)rg_g6$KCFV4<MO)ScbFH0>d&dkpP
QMw@}2k%^wsl+q+U0P2|NZU6uP


From 03e408119146041e40ad6c29370f1694eb1e40a8 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Thu, 20 Mar 2025 13:16:06 +0100
Subject: [PATCH 142/455] model and scaler with 1 malicious and 1 benign

---
 modules/flowmldetection/model.bin  | Bin 1090 -> 1124 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index 7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f..0fac693b39f8e2f0e826471e72a52010709a2a4a 100644
GIT binary patch
delta 132
zcmX@a@q~k=fn{psMix!x$(NZ_BO`RCb>5jSY<wU!LGn&}gUNx$3tLYnGl@A&e4S}J
zBSY*!uI-OaTr&&~Tw#uuJJhD?&}O=|yn5R|`<q--?Rc;A9e69=F=gK7d-lipbZtG?
j>K$N!g2|oC+8oRs(l<p`Iv<$4hdGq_%F%|&A}o3UHQY43

delta 131
zcmV-}0DS-C2*L;ifCQB{u>>gtTsw(FScJ$7xocP<K++%n9U}QQz-#d@sqf+^z>4|5
zn$$WZz^$}67u&zYKfDP4vYKo~K<PaAW)J{HKp5zr*LvX7zXT;Xi&zvCK#B|cdeNaM
lK-{i#qAdnGK$EKjD+pMWFI7TSSU{8B162cym;jSD1TGrII8guq

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
index bfba4d107224e5e6e5a1e8c8f4d463b48131d111..758909b289238ff282b2e056a9b3e83768b8472a 100644
GIT binary patch
delta 290
zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG
zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa
zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r!tJEWTtnm`Ac
zf^|*Gc0elJHxDoXo<X#jLG{%4&p`{bHVfI|sz2Ot$hvzr(m{gu&Jr}1sgu|OC|8h9
z&D_B~i9nx5TWFn3t3OFIt}1{vt3Z{LgvGX?u|Rx7|2|A?L_o(e@1J}aGeCBvuU^0C
o#Xz54;5X{~-$0+hirZ#X%|CDEfPfMb;Xu)C9icy+Op{jwBi%8H3jhEB

delta 290
zcmV+-0p0%k2KolDvjGBX0h6@>Pgl=^Tj5WVAwXGOhP~3C<3D)c8d<%4*gy{6w?@|U
z%s@r+gj&Fu5J2_kY{!Qn^*`?T5Tx7$dq73=gj&E@c0d))6KVlt!ao`IN`KC-h(JOu
zU`*oR6_Z;6CRfo>BRz3ectB?@J_D&+&OhD+nN>A{w?PryCGc<cwm~#B-^|8$dO)gh
z=RSTUXFv*d?raebbV0J?)%52KY(b1c9N@Tpzdv9l8J2bKf<dkRFcK&rl9Sj1C|3#7
zOux9hGeC5AQV0+$-#-%10%-4K(?Bp(=SPMr(?B|Mf3GT(G(g2@E?by3D?owf0UW{U
ou|UJ?4^Yqbtw7cLsJ&lu)<1f`V88dAxIjbzY_OOIL6cVlBeWBRwg3PC


From 6f11f45559c826cc26082eff8ee3c5cd8fb8435c Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Thu, 20 Mar 2025 13:16:27 +0100
Subject: [PATCH 143/455] cleaner jupyter

---
 modules/flowmldetection/flowmldetection.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 4a4d46e376..d8e9ada27c 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -345,6 +345,23 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
                 "ground_truth_label",  # todo now we can use them
                 "detailed_ground_truth_label",
             ]
+            # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes.
+            # Error
+            ''' [Flow ML Detection] Error in detect() while processing                                                                                                                                                                                                                         
+            dur proto  sport dport  state  pkts  spkts  dpkts  bytes  sbytes  dbytes  allbytes                                                                                                                                                                                    
+            0  63.822830     0  56119   981    0.0    15     15      0   8764    1887       0      1887                                                                                                                                                                                    
+            The feature names should match those that were passed during fit.                                                                                                                                                                                                              
+            Feature names unseen at fit time:                                                                                                                                                                                                                                              
+            - bytes     
+            '''
+
+            # IF we delete here the filed bytes the error is
+            # [Flow ML Detection] Error in detect() while processing 
+            # dur proto sport dport  state  pkts  spkts  dpkts  sbytes  dbytes allbytes                                                                                                                                                             
+            # 0  63.822830     0  56120   980    0.0    15     15      0    1887       0      1887                                                                                                                                                             
+            # The feature names should match those that were passed during fit.                                                                                                                                                                                
+            # Feature names must be in the same order as they were in fit.                                                                                                                                                                                     
+                                                                                                                                                                                                                                                 
             for field in fields_to_drop:
                 try:
                     x_flow = x_flow.drop(field, axis=1)

From 17ccb096b61ea71a780e182a2bb0626985e4c755 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Thu, 20 Mar 2025 22:26:27 +0100
Subject: [PATCH 144/455] New models after 3rd train

---
 modules/flowmldetection/model.bin  | Bin 1124 -> 1124 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index 0fac693b39f8e2f0e826471e72a52010709a2a4a..5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8 100644
GIT binary patch
delta 99
zcmaFD@q}YTFtfkevYaFSxkd-_3;rCozh!)2lYmpEbEdGvA?<Hh%FhcQcr$C8Y3pPC
z18<ZS-x*q|IJlZU`}#BRi@kOC+S)TqI1eyD!9BgtJUbI~Cof{2$owSaz+^KPJplbX
BD~|vG

delta 99
zcmV-p0G$8i2;>N`Qv**dmWkY#6EnbNU>V$pfHc5?(6z~90v14#>uEKZXcoY2J@~Q;
zm@vT916v-#h9y9THMMtpw*Eia3Y9($*ABq#7l@Rev)(_(4lO-NtuB+I1CayN#ekDG
F1TN2UERg^J

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
index 758909b289238ff282b2e056a9b3e83768b8472a..821344a0c69d116622b02e2a0daa1554cb5d308e 100644
GIT binary patch
delta 43
zcmV+`0M!5b2KolDfdU!c4zU>a=p#W80?-ZtN@qVmt(3Z<B4<G|lbQl%0g;md19^W6
B5HkP(

delta 43
zcmV+`0M!5b2KolDfdU!a4zU>a=p#W80?-ZtN@qVmt(3Z<B4<G}lbQl%0g#gc19^VR
B5HSD%


From 38b1c790c8d5cf9d1ab84e1b827cf8a00d740e92 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 26 Mar 2025 00:08:50 +0100
Subject: [PATCH 145/455] Models after 4th train

---
 modules/flowmldetection/model.bin  | Bin 1124 -> 1124 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index 5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8..3ab5a240bb45f88d026d1d9d1959cfa384e2473b 100644
GIT binary patch
delta 120
zcmV-;0EhqN2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv*
zM<l=vT(CiV7%o61L>A;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P
as{<<t5IC<!D(NIZlimYX0{noJHUuueR5UOE

delta 120
zcmaFD@q}YTFmr_2vYaFSxkd-_3;rCozh!)2lYmpEbEdGvA?<Hh%FhcQcr$C8Y3pPC
z18<ZS-x*q|IJlZU`}#BRi@kOC+S)TqI1eyD!9BgtJUbI~;rz*~nYB5XJEU)ltaLsw
P`5to^Q^<kIW-NLDlQb|V

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
index 821344a0c69d116622b02e2a0daa1554cb5d308e..29df65342047c5a499ee3f8e602d1f47cb7e9fca 100644
GIT binary patch
delta 290
zcmV+-0p0%k2KolDvjGB90h6@>PghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB
z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m
z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A
za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o
zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9
ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}Dxj<tAT~->ML6cVlBO2|D9RL6T

delta 290
zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG
zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa
zjm~^LK$BYnCRg4Lu^9H~BS8=X&<+7gXFor!l)9lJXF)RTPqWf%b3r!tJEWTtnm`Ac
zf^|*Gc0elJHxDoXo<X#jLG{%4&p`{bHVfI|sz2Ot$hvzr(m{gu&Jr}1sgu|OC|8kA
z&D_B~i9nx5TWFn3t3OFIt}1{vt3Z{LgvGX?u|Rx7|2|A?L_o(e@1J}aGeCBvuU^0C
o#Xz54;5X{~-$0+hirZ#X%|CDEfPfMb;Xu)C9icy+Op{jwBi>ty4FCWD


From b58ca823684868b27c4357999e55d97d2b75ad4a Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 26 Mar 2025 08:28:59 +0100
Subject: [PATCH 146/455] Models of ml flow with the first good performance in
 small tests

---
 modules/flowmldetection/model.bin  | Bin 1124 -> 1124 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index 3ab5a240bb45f88d026d1d9d1959cfa384e2473b..a6648cf72179520975b0e9ad1164f7d574e87140 100644
GIT binary patch
delta 121
zcmV-<0EYkM2;>N`Qv+C~&P*9hb`!w*mV~|wLl?l5mFM4w$NRqlOAcXmpyfaAG1();
zYA3+ulRv$a;~zkIU#E>ocI-ba#L|>%Hv~ZN`Y<G(@KnDch(0mn<s!fm-ctwS<Lp0^
bs{<<tAstXx>?b6^limYW1McwvlQsk{8#y@u

delta 121
zcmV-<0EYkM2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv*
zM<l=vT(CiV7%o61L>A;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P
bs{<<t5IC<!D(NIZlimYW1LXXGlQsk{_eeDq

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
index 29df65342047c5a499ee3f8e602d1f47cb7e9fca..17115724b9536f6093f9d72f3b58a5c22c562a9a 100644
GIT binary patch
delta 290
zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG
zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa
zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r-wJEWTtnm`Ac
zf^|*Gc0elJHxDoXo<X&kLG{%4&p`{bHVfI|sz2Uv$hvzr(m{gu&Jr}1sgu|OC|8h9
z&D_B~i9nx5TWFn3t3OFIt}1{vt3Z{LgvGX?u|R!8|2|A?L_o(e@1J}aGeCBvuU^0C
o#Xz85;5X{~-$0+hirZ#X%|CGFfPfMb;Xu)C9icy+Op{jwBjCh|6aWAK

delta 290
zcmV+-0p0%k2KolDvjGB90h6@>PghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB
z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m
z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A
za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o
zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9
ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}Dxj<tAT~->ML6cVlBO2|D9RL6T


From cdfd04f0667647f4d3d4a47bb56d7f6d7edc00d6 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 147/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 317 +++++++++++++--------
 1 file changed, 206 insertions(+), 111 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index d8e9ada27c..8917fef6a5 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -10,6 +10,7 @@
 import json
 import traceback
 import warnings
+import sys
 
 from slips_files.common.parsers.config_parser import ConfigParser
 from slips_files.common.slips_utils import utils
@@ -55,12 +56,8 @@ def init(self):
         # Set the output queue of our database instance
         # Read the configuration
         self.read_configuration()
-        # Minum amount of new labels needed to start the train
-        self.minimum_labels_to_start_train = 50
-        # Minum amount of new labels needed to retrain
-        self.minimum_labels_to_retrain = 50
-        # The number of flows when last trained
-        self.last_number_of_flows_when_trained = 0
+        # Minum amount of new lables needed to trigger the train
+        self.minimum_lables_to_retrain = 50
         # To plot the scores of training
         # self.scores = []
         # The scaler trained during training and to use during testing
@@ -71,25 +68,26 @@ def init(self):
     def read_configuration(self):
         conf = ConfigParser()
         self.mode = conf.get_ml_mode()
-        self.label = conf.label()
 
     def train(self):
         """
         Train a model based on the flows we receive and the labels
         """
         try:
-            # Get the flows from the DB
-            # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid)
-            # Convert to pandas df
-            # self.flows = pd.DataFrame(self.flows)
-            # Process the features
-            # X_flow = self.process_features(self.flows)
+            # Process the labels to have only Normal and Malware
+            self.flows.label = self.flows.label.str.replace(
+                r"(^.*ormal.*$)", "Normal", regex=True
+            )
+            self.flows.label = self.flows.label.str.replace(
+                r"(^.*alware.*$)", "Malware", regex=True
+            )
+            self.flows.label = self.flows.label.str.replace(
+                r"(^.*alicious.*$)", "Malware", regex=True
+            )
 
-            # Create X_flow with the current flows minus the label
+            # Separate
+            y_flow = self.flows["label"]
             X_flow = self.flows.drop("label", axis=1)
-            # Create y_flow with the label
-            y_flow = numpy.full(X_flow.shape[0], self.label)
-            # Drop the module_labels
             X_flow = X_flow.drop("module_labels", axis=1)
 
             # Normalize this batch of data so far. This can get progressivle slow
@@ -98,7 +96,7 @@ def train(self):
             # Train
             try:
                 self.clf.partial_fit(
-                    X_flow, y_flow, classes=["Malicious", "Benign"]
+                    X_flow, y_flow, classes=["Malware", "Normal"]
                 )
             except Exception:
                 self.print("Error while calling clf.train()")
@@ -121,7 +119,142 @@ def train(self):
             self.store_model()
 
         except Exception:
-            self.print("Error in train().", 0, 1)
+            self.print("Error in train()", 0, 1)
+            self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
             self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
@@ -135,11 +268,6 @@ def process_features(self, dataset):
             for proto in to_discard:
                 dataset = dataset[dataset.proto != proto]
 
-            # If te proto is in the list to delete and there is only one flow, then the dataset will be empty
-            if dataset.empty:
-                # DataFrame is empty now, so return empty
-                return dataset
-
             # For now, discard these
             to_drop = [
                 "appproto",
@@ -152,7 +280,9 @@ def process_features(self, dataset):
                 "history",
                 "uid",
                 "dir_",
+                "dbytes",
                 "endtime",
+                "bytes",
                 "flow_source",
             ]
             for field in to_drop:
@@ -161,16 +291,12 @@ def process_features(self, dataset):
                 except (ValueError, KeyError):
                     pass
 
-            # When flows are read from Slips sqlite,
-            # the state is not transformed to 'Established' or
-            # 'Not Established', it is still 'S0' and others
+            # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
             # So transform here
-            dataset["state"] = dataset.apply(
-                lambda row: self.db.get_final_state_from_flags(
-                    row["state"], (row["spkts"] + row["dpkts"])
-                ),
-                axis=1,
-            )
+            #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
+            dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
+
+            #dataset.state = new_state_column
 
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(
@@ -204,11 +330,7 @@ def process_features(self, dataset):
             dataset.proto = dataset.proto.str.replace(
                 r"(^.*arp.*$)", "4", regex=True
             )
-
-            dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"]
-            dataset["pkts"] = dataset["spkts"] + dataset["dpkts"]
-
-            fields_to_convert_to_float = [
+            fields_to_convert_to_flow = [
                 dataset.proto,
                 dataset.dport,
                 dataset.sport,
@@ -219,10 +341,10 @@ def process_features(self, dataset):
                 dataset.sbytes,
                 dataset.state,
             ]
-            for field in fields_to_convert_to_float:
+            for field in fields_to_convert_to_flow:
                 try:
                     field = field.astype("float64")
-                except (ValueError, AttributeError):
+                except ValueError:
                     pass
 
             return dataset
@@ -231,9 +353,9 @@ def process_features(self, dataset):
             self.print("Error in process_features()")
             self.print(traceback.format_exc(), 0, 1)
 
-    def process_training_flows(self):
+    def process_flows(self):
         """
-        Process all the flows in the DB
+        Process all the flwos in the DB
         Store the pandas df in self.flows
         """
         try:
@@ -249,48 +371,44 @@ def process_training_flows(self):
                 # that are fake but representative of a normal and malware flow
                 # they are only for the training process
                 # At least 1 flow of each label is required
-
-                # These flows should be in the same format as the ones in the DB. 
-                # Which means the satate is still SF, S0, etc.
+                # self.print(f'Amount of labeled flows: {labels}', 0, 1)
                 flows.append(
                     {
-                        "starttime": 1594417039.029793,
+                        "ts": 1594417039.029793,
                         "dur": "1.9424750804901123",
                         "saddr": "10.7.10.101",
                         "sport": "49733",
                         "daddr": "40.70.224.145",
                         "dport": "443",
                         "proto": "tcp",
-                        "state": "SF",
-                        "spkts": 17,
-                        "dpkts": 27,
+                        "state": "Established",
+                        "allbytes": 42764,
+                        "spkts": 37,
                         "sbytes": 25517,
-                        "dbytes": 17247,
                         "appproto": "ssl",
-                        "label": "Malicious",
+                        "label": "Malware",
                         "module_labels": {
-                            "flowalerts-long-connection": "Malicious"
+                            "flowalerts-long-connection": "Malware"
                         },
                     }
                 )
                 flows.append(
                     {
-                        "starttime": 1382355032.706468,
+                        "ts": 1382355032.706468,
                         "dur": "10.896695",
                         "saddr": "147.32.83.52",
                         "sport": "47956",
                         "daddr": "80.242.138.72",
                         "dport": "80",
                         "proto": "tcp",
-                        "state": "SF",
+                        "state": "Established",
+                        "allbytes": 67696,
                         "spkts": 1,
-                        "dpkts": 0,
                         "sbytes": 100,
-                        "dbytes": 67596,
                         "appproto": "http",
-                        "label": "Benign",
+                        "label": "Normal",
                         "module_labels": {
-                            "flowalerts-long-connection": "Benign"
+                            "flowalerts-long-connection": "Normal"
                         },
                     }
                 )
@@ -318,8 +436,6 @@ def process_flow(self, flow_to_process: dict):
             # Convert the flow to a pandas dataframe
             raw_flow = pd.DataFrame(flow_to_process, index=[0])
             dflow = self.process_features(raw_flow)
-            if dflow.empty:
-                return None
             # Update the flow to the processed version
             return dflow
         except Exception:
@@ -333,6 +449,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
         and returns the predection array
         """
         try:
+            given_x_flow = x_flow
             # clean the flow
             fields_to_drop = [
                 "label",
@@ -340,28 +457,12 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
                 "uid",
                 "history",
                 "dir_",
+                "dbytes",
+                "dpkts",
                 "endtime",
+                "bytes",
                 "flow_source",
-                "ground_truth_label",  # todo now we can use them
-                "detailed_ground_truth_label",
             ]
-            # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes.
-            # Error
-            ''' [Flow ML Detection] Error in detect() while processing                                                                                                                                                                                                                         
-            dur proto  sport dport  state  pkts  spkts  dpkts  bytes  sbytes  dbytes  allbytes                                                                                                                                                                                    
-            0  63.822830     0  56119   981    0.0    15     15      0   8764    1887       0      1887                                                                                                                                                                                    
-            The feature names should match those that were passed during fit.                                                                                                                                                                                                              
-            Feature names unseen at fit time:                                                                                                                                                                                                                                              
-            - bytes     
-            '''
-
-            # IF we delete here the filed bytes the error is
-            # [Flow ML Detection] Error in detect() while processing 
-            # dur proto sport dport  state  pkts  spkts  dpkts  sbytes  dbytes allbytes                                                                                                                                                             
-            # 0  63.822830     0  56120   980    0.0    15     15      0    1887       0      1887                                                                                                                                                             
-            # The feature names should match those that were passed during fit.                                                                                                                                                                                
-            # Feature names must be in the same order as they were in fit.                                                                                                                                                                                     
-                                                                                                                                                                                                                                                 
             for field in fields_to_drop:
                 try:
                     x_flow = x_flow.drop(field, axis=1)
@@ -373,7 +474,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
             return pred
         except Exception as e:
             self.print(
-                f"Error in detect() while processing " f"\n{x_flow}\n{e}"
+                f"Error in detect() while processing " f"\n{given_x_flow}\n{e}"
             )
             self.print(traceback.format_exc(), 0, 1)
 
@@ -465,16 +566,18 @@ def pre_main(self):
 
     def main(self):
         if msg := self.get_msg("new_flow"):
-            # When a new flow arrives
             msg = json.loads(msg["data"])
-            self.twid = msg["twid"]
-            self.profileid = msg["profileid"]
+            twid = msg["twid"]
             self.flow = msg["flow"]
-            # These following extra fields are expected in testing. update the original
+            # these fields are expected in testing. update the original
             # flow dict to have them
             self.flow.update(
                 {
+                    "allbytes": (self.flow["sbytes"] + self.flow["dbytes"]),
+                    # the flow["state"] is the origstate, we dont need that here
+                    # we need the interpreted state
                     "state": msg["interpreted_state"],
+                    "pkts": self.flow["spkts"] + self.flow["dpkts"],
                     "label": msg["label"],
                     "module_labels": msg["module_labels"],
                 }
@@ -487,31 +590,23 @@ def main(self):
                 # Use labeled flows
                 labels = self.db.get_labels()
                 sum_labeled_flows = sum(i[1] for i in labels)
-
-                # The min labels to retrain is the min number of flows 
-                # we should have seen so far in this capture to start training
-                # This is so we dont _start_ training with only 1 flow
-
-                # Once we are over the start minimum, the second condition is 
-                # to force to retrain every a minimum_labels_to_retrain number
-                # of flows. So we dont retrain every 1 flow.
                 if (
-                    sum_labeled_flows >= self.minimum_labels_to_start_train
+                    sum_labeled_flows >= self.minimum_lables_to_retrain
+                    and sum_labeled_flows % self.minimum_lables_to_retrain == 1
                 ):
-                    if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain):
-                        # So for example we retrain every 50 labels and only when
-                        # we have at least 50 labels
-                        self.print(
-                            f"Training the model with the last group of "
-                            f"flows and labels. Total flows: {sum_labeled_flows}."
-                        )
-                        # Process all flows in the DB and make them ready
-                        # for pandas
-                        self.process_training_flows()
-                        # Train an algorithm
-                        self.train()
-                        self.last_number_of_flows_when_trained = sum_labeled_flows
-
+                    # We get here every 'self.minimum_lables_to_retrain'
+                    # amount of labels
+                    # So for example we retrain every 100 labels and only when
+                    # we have at least 100 labels
+                    self.print(
+                        f"Training the model with the last group of "
+                        f"flows and labels. Total flows: {sum_labeled_flows}."
+                    )
+                    # Process all flows in the DB and make them ready
+                    # for pandas
+                    self.process_flows()
+                    # Train an algorithm
+                    self.train()
             elif self.mode == "test":
                 # We are testing, which means using the model to detect
                 processed_flow = self.process_flow(self.flow)
@@ -531,8 +626,8 @@ def main(self):
                         # and the label is diff from the prediction,
                         # print in debug mode
                         self.print(
-                            f"Predicted {pred[0]} for ground-truth label"
-                            f' {label}. Flow {self.flow["saddr"]}:'
+                            f"Report Prediction {pred[0]} for label"
+                            f' {label} flow {self.flow["saddr"]}:'
                             f'{self.flow["sport"]} ->'
                             f' {self.flow["daddr"]}:'
                             f'{self.flow["dport"]}/'
@@ -540,9 +635,9 @@ def main(self):
                             0,
                             3,
                         )
-                    if pred[0] == "Malicious":
+                    if pred[0] == "Malware":
                         # Generate an alert
-                        self.set_evidence_malicious_flow(self.flow, self.twid)
+                        self.set_evidence_malicious_flow(self.flow, twid)
                         self.print(
                             f"Prediction {pred[0]} for label {label}"
                             f' flow {self.flow["saddr"]}:'

From 6f548d14a61187b042083d8233a9d68f4dc9e525 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:35:21 +0300
Subject: [PATCH 148/455] state_handler: split get_final_state_from_flags()
 into smaller functions

---
 slips_files/common/state_handler.py | 179 ++++++++++++++++++++++++++++
 1 file changed, 179 insertions(+)
 create mode 100644 slips_files/common/state_handler.py

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
new file mode 100644
index 0000000000..b671a09a28
--- /dev/null
+++ b/slips_files/common/state_handler.py
@@ -0,0 +1,179 @@
+from typing import Optional
+import sys
+import traceback
+
+
+def check_suricata_states(state) -> Optional[str]:
+    """
+    There are different states in which a flow can be.
+    Suricata distinguishes three flow-states for TCP and two for
+     UDP. For TCP,
+    these are: New, Established and Closed,for UDP only new and
+    established.
+    For each of these states Suricata can employ different timeouts.
+    """
+    if "new" in state or "established" in state:
+        return "Established"
+    elif "closed" in state:
+        return "Not Established"
+
+
+def check_zeek_states(state) -> Optional[str]:
+    # We have varius type of states depending on the type of flow.
+    # For Zeek
+    if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+        return "Not Established"
+    elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+        return "Established"
+
+
+def check_argus_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    suf = state.split("_")[1]
+    if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+        """
+        Examples:
+        SA_SA
+        SR_SA
+        FSRA_SA
+        SPA_SPA
+        SRA_SPA
+        FSA_FSA
+        FSA_FSPA
+        SAEC_SPA
+        SRPA_SPA
+        FSPA_SPA
+        FSRPA_SPA
+        FSPA_FSPA
+        FSRA_FSPA
+        SRAEC_SPA
+        FSPA_FSRPA
+        FSAEC_FSPA
+        FSRPA_FSPA
+        SRPAEC_SPA
+        FSPAEC_FSPA
+        SRPAEC_FSRPA
+        """
+        return "Established"
+    elif "PA" in pre and "PA" in suf:
+        # Tipical flow that was reported in the middle
+        """
+        Examples:
+        PA_PA
+        FPA_FPA
+        """
+        return "Established"
+    elif "ECO" in pre:
+        return "ICMP Echo"
+    elif "ECR" in pre:
+        return "ICMP Reply"
+    elif "URH" in pre:
+        return "ICMP Host Unreachable"
+    elif "URP" in pre:
+        return "ICMP Port Unreachable"
+    else:
+        """
+        Examples:
+        S_RA
+        S_R
+        A_R
+        S_SA
+        SR_SA
+        FA_FA
+        SR_RA
+        SEC_RA
+        """
+        return "Not Established"
+
+
+def check_tcp_states(state, pkts) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "EST" in pre:
+        # TCP
+        return "Established"
+    elif "RST" in pre:
+        # TCP. When -z B is not used in argus, states are single words.
+        # Most connections are reseted when finished and therefore are
+        # established
+        # It can happen that is reseted being not established, but we
+        # can't tell without -z b.
+        # So we use as heuristic the amount of packets. If <=3, then is
+        # not established because the OS retries 3 times.
+        return "Not Established" if int(pkts) <= 3 else "Established"
+    elif "FIN" in pre:
+        # TCP. When -z B is not used in argus, states are single words.
+        # Most connections are finished with FIN when finished and
+        # therefore are established
+        # It can happen that is finished being not established, but we
+        # can't tell without -z b.
+        # So we use as heuristic the amount of packets. If <=3, then is
+        # not established because the OS retries 3 times.
+        return "Not Established" if int(pkts) <= 3 else "Established"
+    else:
+        """
+        Examples:
+        S_
+        FA_
+        PA_
+        FSA_
+        SEC_
+        SRPA_
+        """
+        return "Not Established"
+
+
+def check_udp_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "CON" in pre:
+        # UDP
+        return "Established"
+    elif "INT" in pre:
+        # UDP trying to connect, NOT preciselly not established but also
+        # NOT 'Established'. So we considered not established because there
+        # is no confirmation of what happened.
+        return "Not Established"
+
+
+def check_icmp_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "ECO" in pre:
+        # ICMP
+        return "Established"
+    elif "UNK" in pre:
+        # ICMP6 unknown upper layer
+        return "Established"
+
+
+def get_final_state_from_flags(self, state, pkts) -> str:
+    """
+    Analyze the flags given and return a summary of the state.
+    Should work with Argus and Bro flags
+    We receive the pakets to distinguish some Reset connections
+    """
+    try:
+        if state := check_suricata_states(state):
+            return state
+        if state := check_zeek_states(state):
+            return state
+        if state := check_argus_states(state):
+            return state
+    except IndexError:
+        # suf does not exist, which means that this is some ICMP or
+        # no response was sent for UDP or TCP
+        if state := check_icmp_states(state):
+            return state
+        if state := check_udp_states(state):
+            return state
+        if state := check_tcp_states(state, pkts):
+            return state
+
+        return "Not Established"
+
+    except Exception:
+        exception_line = sys.exc_info()[2].tb_lineno
+        self.print(
+            f"Error in get_final_state_from_flags() " f"line {exception_line}",
+            0,
+            1,
+        )
+        self.print(traceback.format_exc(), 0, 1)

From 63dc0bd420f3ad6a4390d17b5ee9ce34de8774f5 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:48:16 +0300
Subject: [PATCH 149/455] state_handler: refactor get_final_state_from_flags()

---
 slips_files/common/state_handler.py | 67 +++++++++++++----------------
 1 file changed, 29 insertions(+), 38 deletions(-)

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
index b671a09a28..d0a05115bd 100644
--- a/slips_files/common/state_handler.py
+++ b/slips_files/common/state_handler.py
@@ -1,9 +1,7 @@
 from typing import Optional
-import sys
-import traceback
 
 
-def check_suricata_states(state) -> Optional[str]:
+def interpret_suricata_states(state) -> Optional[str]:
     """
     There are different states in which a flow can be.
     Suricata distinguishes three flow-states for TCP and two for
@@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_zeek_states(state) -> Optional[str]:
+def interpret_zeek_states(state) -> Optional[str]:
     # We have varius type of states depending on the type of flow.
     # For Zeek
     if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
@@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]:
         return "Established"
 
 
-def check_argus_states(state) -> Optional[str]:
+def interpret_argus_states(state) -> Optional[str]:
     pre = state.split("_")[0]
-    suf = state.split("_")[1]
+    try:
+        suf = state.split("_")[1]
+    except IndexError:
+        return
+
     if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
         """
         Examples:
@@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_tcp_states(state, pkts) -> Optional[str]:
+def interpret_tcp_states(state, pkts) -> Optional[str]:
     pre = state.split("_")[0]
     if "EST" in pre:
         # TCP
@@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]:
         return "Not Established"
 
 
-def check_udp_states(state) -> Optional[str]:
+def interpret_udp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "CON" in pre:
         # UDP
@@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_icmp_states(state) -> Optional[str]:
+def interpret_icmp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "ECO" in pre:
         # ICMP
@@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]:
         return "Established"
 
 
-def get_final_state_from_flags(self, state, pkts) -> str:
+def get_final_state_from_flags(state, pkts) -> str:
     """
-    Analyze the flags given and return a summary of the state.
-    Should work with Argus and Bro flags
-    We receive the pakets to distinguish some Reset connections
+    Converts the original flags from the flow, to a state that slips
+    understands
+    Works with Argus, suricata, and Bro flags
+    We receive the packets to distinguish some Reset connections
     """
-    try:
-        if state := check_suricata_states(state):
-            return state
-        if state := check_zeek_states(state):
-            return state
-        if state := check_argus_states(state):
-            return state
-    except IndexError:
-        # suf does not exist, which means that this is some ICMP or
-        # no response was sent for UDP or TCP
-        if state := check_icmp_states(state):
-            return state
-        if state := check_udp_states(state):
-            return state
-        if state := check_tcp_states(state, pkts):
-            return state
 
-        return "Not Established"
+    for interpreter in (
+        interpret_suricata_states,
+        interpret_zeek_states,
+        interpret_argus_states,
+        interpret_icmp_states,
+        interpret_udp_states,
+    ):
+        if interpreted_state := interpreter(state):
+            return interpreted_state
+
+    if interpreted_state := interpret_tcp_states(state, pkts):
+        return interpreted_state
 
-    except Exception:
-        exception_line = sys.exc_info()[2].tb_lineno
-        self.print(
-            f"Error in get_final_state_from_flags() " f"line {exception_line}",
-            0,
-            1,
-        )
-        self.print(traceback.format_exc(), 0, 1)
+    return "Not Established"

From 606fc6713ea8a9973d59696e813c708c2cdd64d6 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 150/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py    | 150 ++----------------
 slips_files/core/database/database_manager.py |   3 -
 2 files changed, 10 insertions(+), 143 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 8917fef6a5..fb17b57f23 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """
@@ -291,12 +156,17 @@ def process_features(self, dataset):
                 except (ValueError, KeyError):
                     pass
 
-            # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
+            # When flows are read from Slips sqlite,
+            # the state is not transformed to 'Established' or
+            # 'Not Established', it is still 'S0' and others
             # So transform here
-            #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
-            dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
-
-            #dataset.state = new_state_column
+            dataset["state"] = dataset.apply(
+                lambda row: get_final_state_from_flags(
+                    row["state"], row["pkts"]
+                ),
+                axis=1,
+            )
+            # dataset.state = new_state_column
 
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(
diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py
index e8ca3aaf62..b4b2128d3d 100644
--- a/slips_files/core/database/database_manager.py
+++ b/slips_files/core/database/database_manager.py
@@ -613,9 +613,6 @@ def add_out_dns(self, *args, **kwargs):
     def add_port(self, *args, **kwargs):
         return self.rdb.add_port(*args, **kwargs)
 
-    def get_final_state_from_flags(self, *args, **kwargs):
-        return self.rdb.get_final_state_from_flags(*args, **kwargs)
-
     def add_ips(self, *args, **kwargs):
         return self.rdb.add_ips(*args, **kwargs)
 

From 3a13d07707eb85b773bcc61abd93d4d8294dc846 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 151/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index fb17b57f23..12c3589edc 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,6 +121,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From b09101cf20d63c822cd82269e40dd9edb17ee624 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 152/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 135 ---------------------
 1 file changed, 135 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 12c3589edc..fb17b57f23 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 7c50d01107a6bac1ad4e22e67d9a56c9e75af2ca Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 153/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index fb17b57f23..12c3589edc 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,6 +121,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From e7c5d824bac46fd7d95499f020a2183e981efdb1 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 154/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 135 ---------------------
 1 file changed, 135 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 12c3589edc..fb17b57f23 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From b350dcea25090b195f6befbf434f2b4506350b2e Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 155/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index fb17b57f23..12c3589edc 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,6 +121,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 49ddfddfd34f0754927332be8e7b61cfa23553f3 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 156/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 135 ---------------------
 1 file changed, 135 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 12c3589edc..fb17b57f23 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 456bd7208ababe3b0081b46380466f1301f02c2f Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 157/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 163 +++++++++++++++++++--
 1 file changed, 149 insertions(+), 14 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index fb17b57f23..c8226368c7 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -8,6 +8,7 @@
 import pickle
 import pandas as pd
 import json
+import datetime
 import traceback
 import warnings
 import sys
@@ -121,6 +122,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """
@@ -156,17 +292,12 @@ def process_features(self, dataset):
                 except (ValueError, KeyError):
                     pass
 
-            # When flows are read from Slips sqlite,
-            # the state is not transformed to 'Established' or
-            # 'Not Established', it is still 'S0' and others
+            # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
             # So transform here
-            dataset["state"] = dataset.apply(
-                lambda row: get_final_state_from_flags(
-                    row["state"], row["pkts"]
-                ),
-                axis=1,
-            )
-            # dataset.state = new_state_column
+            #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
+            dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
+
+            #dataset.state = new_state_column
 
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(
@@ -393,21 +524,25 @@ def read_model(self):
     def set_evidence_malicious_flow(self, flow: dict, twid: str):
         confidence: float = 0.1
         description = (
-            f"Flow with malicious characteristics by ML. Src IP"
+            f"Malicious flow by ML. Src IP"
             f" {flow['saddr']}:{flow['sport']} to "
             f"{flow['daddr']}:{flow['dport']}"
         )
+
+        timestamp = utils.convert_format(
+            datetime.datetime.now(), utils.alerts_format
+        )
         twid_number = int(twid.replace("timewindow", ""))
         evidence: Evidence = Evidence(
             evidence_type=EvidenceType.MALICIOUS_FLOW,
             attacker=Attacker(
                 direction=Direction.SRC,
-                ioc_type=IoCType.IP,
+                attacker_type=IoCType.IP,
                 value=flow["saddr"],
             ),
             victim=Victim(
                 direction=Direction.DST,
-                ioc_type=IoCType.IP,
+                victim_type=IoCType.IP,
                 value=flow["daddr"],
             ),
             threat_level=ThreatLevel.LOW,
@@ -416,7 +551,7 @@ def set_evidence_malicious_flow(self, flow: dict, twid: str):
             profile=ProfileID(ip=flow["saddr"]),
             timewindow=TimeWindow(twid_number),
             uid=[flow["uid"]],
-            timestamp=flow["starttime"],
+            timestamp=timestamp,
             method=Method.AI,
             src_port=flow["sport"],
             dst_port=flow["dport"],

From 592edafb650e53bd0d2bcbc5bf94e5488e2807f7 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:35:21 +0300
Subject: [PATCH 158/455] state_handler: split get_final_state_from_flags()
 into smaller functions

---
 slips_files/common/state_handler.py | 67 ++++++++++++++++-------------
 1 file changed, 38 insertions(+), 29 deletions(-)

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
index d0a05115bd..b671a09a28 100644
--- a/slips_files/common/state_handler.py
+++ b/slips_files/common/state_handler.py
@@ -1,7 +1,9 @@
 from typing import Optional
+import sys
+import traceback
 
 
-def interpret_suricata_states(state) -> Optional[str]:
+def check_suricata_states(state) -> Optional[str]:
     """
     There are different states in which a flow can be.
     Suricata distinguishes three flow-states for TCP and two for
@@ -16,7 +18,7 @@ def interpret_suricata_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def interpret_zeek_states(state) -> Optional[str]:
+def check_zeek_states(state) -> Optional[str]:
     # We have varius type of states depending on the type of flow.
     # For Zeek
     if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
@@ -25,13 +27,9 @@ def interpret_zeek_states(state) -> Optional[str]:
         return "Established"
 
 
-def interpret_argus_states(state) -> Optional[str]:
+def check_argus_states(state) -> Optional[str]:
     pre = state.split("_")[0]
-    try:
-        suf = state.split("_")[1]
-    except IndexError:
-        return
-
+    suf = state.split("_")[1]
     if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
         """
         Examples:
@@ -88,7 +86,7 @@ def interpret_argus_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def interpret_tcp_states(state, pkts) -> Optional[str]:
+def check_tcp_states(state, pkts) -> Optional[str]:
     pre = state.split("_")[0]
     if "EST" in pre:
         # TCP
@@ -124,7 +122,7 @@ def interpret_tcp_states(state, pkts) -> Optional[str]:
         return "Not Established"
 
 
-def interpret_udp_states(state) -> Optional[str]:
+def check_udp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "CON" in pre:
         # UDP
@@ -136,7 +134,7 @@ def interpret_udp_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def interpret_icmp_states(state) -> Optional[str]:
+def check_icmp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "ECO" in pre:
         # ICMP
@@ -146,25 +144,36 @@ def interpret_icmp_states(state) -> Optional[str]:
         return "Established"
 
 
-def get_final_state_from_flags(state, pkts) -> str:
+def get_final_state_from_flags(self, state, pkts) -> str:
     """
-    Converts the original flags from the flow, to a state that slips
-    understands
-    Works with Argus, suricata, and Bro flags
-    We receive the packets to distinguish some Reset connections
+    Analyze the flags given and return a summary of the state.
+    Should work with Argus and Bro flags
+    We receive the pakets to distinguish some Reset connections
     """
+    try:
+        if state := check_suricata_states(state):
+            return state
+        if state := check_zeek_states(state):
+            return state
+        if state := check_argus_states(state):
+            return state
+    except IndexError:
+        # suf does not exist, which means that this is some ICMP or
+        # no response was sent for UDP or TCP
+        if state := check_icmp_states(state):
+            return state
+        if state := check_udp_states(state):
+            return state
+        if state := check_tcp_states(state, pkts):
+            return state
 
-    for interpreter in (
-        interpret_suricata_states,
-        interpret_zeek_states,
-        interpret_argus_states,
-        interpret_icmp_states,
-        interpret_udp_states,
-    ):
-        if interpreted_state := interpreter(state):
-            return interpreted_state
-
-    if interpreted_state := interpret_tcp_states(state, pkts):
-        return interpreted_state
+        return "Not Established"
 
-    return "Not Established"
+    except Exception:
+        exception_line = sys.exc_info()[2].tb_lineno
+        self.print(
+            f"Error in get_final_state_from_flags() " f"line {exception_line}",
+            0,
+            1,
+        )
+        self.print(traceback.format_exc(), 0, 1)

From 759e8597228c569727eb85c9c40aa5130903602f Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:48:16 +0300
Subject: [PATCH 159/455] state_handler: refactor get_final_state_from_flags()

---
 slips_files/common/state_handler.py | 67 +++++++++++++----------------
 1 file changed, 29 insertions(+), 38 deletions(-)

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
index b671a09a28..d0a05115bd 100644
--- a/slips_files/common/state_handler.py
+++ b/slips_files/common/state_handler.py
@@ -1,9 +1,7 @@
 from typing import Optional
-import sys
-import traceback
 
 
-def check_suricata_states(state) -> Optional[str]:
+def interpret_suricata_states(state) -> Optional[str]:
     """
     There are different states in which a flow can be.
     Suricata distinguishes three flow-states for TCP and two for
@@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_zeek_states(state) -> Optional[str]:
+def interpret_zeek_states(state) -> Optional[str]:
     # We have varius type of states depending on the type of flow.
     # For Zeek
     if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
@@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]:
         return "Established"
 
 
-def check_argus_states(state) -> Optional[str]:
+def interpret_argus_states(state) -> Optional[str]:
     pre = state.split("_")[0]
-    suf = state.split("_")[1]
+    try:
+        suf = state.split("_")[1]
+    except IndexError:
+        return
+
     if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
         """
         Examples:
@@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_tcp_states(state, pkts) -> Optional[str]:
+def interpret_tcp_states(state, pkts) -> Optional[str]:
     pre = state.split("_")[0]
     if "EST" in pre:
         # TCP
@@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]:
         return "Not Established"
 
 
-def check_udp_states(state) -> Optional[str]:
+def interpret_udp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "CON" in pre:
         # UDP
@@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_icmp_states(state) -> Optional[str]:
+def interpret_icmp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "ECO" in pre:
         # ICMP
@@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]:
         return "Established"
 
 
-def get_final_state_from_flags(self, state, pkts) -> str:
+def get_final_state_from_flags(state, pkts) -> str:
     """
-    Analyze the flags given and return a summary of the state.
-    Should work with Argus and Bro flags
-    We receive the pakets to distinguish some Reset connections
+    Converts the original flags from the flow, to a state that slips
+    understands
+    Works with Argus, suricata, and Bro flags
+    We receive the packets to distinguish some Reset connections
     """
-    try:
-        if state := check_suricata_states(state):
-            return state
-        if state := check_zeek_states(state):
-            return state
-        if state := check_argus_states(state):
-            return state
-    except IndexError:
-        # suf does not exist, which means that this is some ICMP or
-        # no response was sent for UDP or TCP
-        if state := check_icmp_states(state):
-            return state
-        if state := check_udp_states(state):
-            return state
-        if state := check_tcp_states(state, pkts):
-            return state
 
-        return "Not Established"
+    for interpreter in (
+        interpret_suricata_states,
+        interpret_zeek_states,
+        interpret_argus_states,
+        interpret_icmp_states,
+        interpret_udp_states,
+    ):
+        if interpreted_state := interpreter(state):
+            return interpreted_state
+
+    if interpreted_state := interpret_tcp_states(state, pkts):
+        return interpreted_state
 
-    except Exception:
-        exception_line = sys.exc_info()[2].tb_lineno
-        self.print(
-            f"Error in get_final_state_from_flags() " f"line {exception_line}",
-            0,
-            1,
-        )
-        self.print(traceback.format_exc(), 0, 1)
+    return "Not Established"

From 07542a4d60f8828af3adb6b11de50356cd760dee Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 160/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 150 ++-------------------
 1 file changed, 10 insertions(+), 140 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index c8226368c7..9af514a709 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -122,141 +122,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """
@@ -292,12 +157,17 @@ def process_features(self, dataset):
                 except (ValueError, KeyError):
                     pass
 
-            # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
+            # When flows are read from Slips sqlite,
+            # the state is not transformed to 'Established' or
+            # 'Not Established', it is still 'S0' and others
             # So transform here
-            #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
-            dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
-
-            #dataset.state = new_state_column
+            dataset["state"] = dataset.apply(
+                lambda row: get_final_state_from_flags(
+                    row["state"], row["pkts"]
+                ),
+                axis=1,
+            )
+            # dataset.state = new_state_column
 
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(

From 748f2d35ebab0b22a0e993f7165c7fb6140d2749 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 161/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 9af514a709..94eb27afdf 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -122,6 +122,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 689dc79ef2926c581b2f0b9d7a4fd75a186f12ba Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 162/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 135 ---------------------
 1 file changed, 135 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 94eb27afdf..9af514a709 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -122,141 +122,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 34ca9a52592e632e6ea5d28dd486b84c0175fee1 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 163/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 150 +++++++++++++++++++--
 1 file changed, 140 insertions(+), 10 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 9af514a709..c8226368c7 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -122,6 +122,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """
@@ -157,17 +292,12 @@ def process_features(self, dataset):
                 except (ValueError, KeyError):
                     pass
 
-            # When flows are read from Slips sqlite,
-            # the state is not transformed to 'Established' or
-            # 'Not Established', it is still 'S0' and others
+            # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
             # So transform here
-            dataset["state"] = dataset.apply(
-                lambda row: get_final_state_from_flags(
-                    row["state"], row["pkts"]
-                ),
-                axis=1,
-            )
-            # dataset.state = new_state_column
+            #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
+            dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
+
+            #dataset.state = new_state_column
 
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(

From 4bd6701d5fbdc655bf1b08b34cbfd3089ea0b852 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:35:21 +0300
Subject: [PATCH 164/455] state_handler: split get_final_state_from_flags()
 into smaller functions

---
 slips_files/common/state_handler.py | 67 ++++++++++++++++-------------
 1 file changed, 38 insertions(+), 29 deletions(-)

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
index d0a05115bd..b671a09a28 100644
--- a/slips_files/common/state_handler.py
+++ b/slips_files/common/state_handler.py
@@ -1,7 +1,9 @@
 from typing import Optional
+import sys
+import traceback
 
 
-def interpret_suricata_states(state) -> Optional[str]:
+def check_suricata_states(state) -> Optional[str]:
     """
     There are different states in which a flow can be.
     Suricata distinguishes three flow-states for TCP and two for
@@ -16,7 +18,7 @@ def interpret_suricata_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def interpret_zeek_states(state) -> Optional[str]:
+def check_zeek_states(state) -> Optional[str]:
     # We have varius type of states depending on the type of flow.
     # For Zeek
     if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
@@ -25,13 +27,9 @@ def interpret_zeek_states(state) -> Optional[str]:
         return "Established"
 
 
-def interpret_argus_states(state) -> Optional[str]:
+def check_argus_states(state) -> Optional[str]:
     pre = state.split("_")[0]
-    try:
-        suf = state.split("_")[1]
-    except IndexError:
-        return
-
+    suf = state.split("_")[1]
     if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
         """
         Examples:
@@ -88,7 +86,7 @@ def interpret_argus_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def interpret_tcp_states(state, pkts) -> Optional[str]:
+def check_tcp_states(state, pkts) -> Optional[str]:
     pre = state.split("_")[0]
     if "EST" in pre:
         # TCP
@@ -124,7 +122,7 @@ def interpret_tcp_states(state, pkts) -> Optional[str]:
         return "Not Established"
 
 
-def interpret_udp_states(state) -> Optional[str]:
+def check_udp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "CON" in pre:
         # UDP
@@ -136,7 +134,7 @@ def interpret_udp_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def interpret_icmp_states(state) -> Optional[str]:
+def check_icmp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "ECO" in pre:
         # ICMP
@@ -146,25 +144,36 @@ def interpret_icmp_states(state) -> Optional[str]:
         return "Established"
 
 
-def get_final_state_from_flags(state, pkts) -> str:
+def get_final_state_from_flags(self, state, pkts) -> str:
     """
-    Converts the original flags from the flow, to a state that slips
-    understands
-    Works with Argus, suricata, and Bro flags
-    We receive the packets to distinguish some Reset connections
+    Analyze the flags given and return a summary of the state.
+    Should work with Argus and Bro flags
+    We receive the pakets to distinguish some Reset connections
     """
+    try:
+        if state := check_suricata_states(state):
+            return state
+        if state := check_zeek_states(state):
+            return state
+        if state := check_argus_states(state):
+            return state
+    except IndexError:
+        # suf does not exist, which means that this is some ICMP or
+        # no response was sent for UDP or TCP
+        if state := check_icmp_states(state):
+            return state
+        if state := check_udp_states(state):
+            return state
+        if state := check_tcp_states(state, pkts):
+            return state
 
-    for interpreter in (
-        interpret_suricata_states,
-        interpret_zeek_states,
-        interpret_argus_states,
-        interpret_icmp_states,
-        interpret_udp_states,
-    ):
-        if interpreted_state := interpreter(state):
-            return interpreted_state
-
-    if interpreted_state := interpret_tcp_states(state, pkts):
-        return interpreted_state
+        return "Not Established"
 
-    return "Not Established"
+    except Exception:
+        exception_line = sys.exc_info()[2].tb_lineno
+        self.print(
+            f"Error in get_final_state_from_flags() " f"line {exception_line}",
+            0,
+            1,
+        )
+        self.print(traceback.format_exc(), 0, 1)

From 0fa7bb66ea522aeaa6bc7ef6a128436cc38f61d9 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:48:16 +0300
Subject: [PATCH 165/455] state_handler: refactor get_final_state_from_flags()

---
 slips_files/common/state_handler.py | 67 +++++++++++++----------------
 1 file changed, 29 insertions(+), 38 deletions(-)

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
index b671a09a28..d0a05115bd 100644
--- a/slips_files/common/state_handler.py
+++ b/slips_files/common/state_handler.py
@@ -1,9 +1,7 @@
 from typing import Optional
-import sys
-import traceback
 
 
-def check_suricata_states(state) -> Optional[str]:
+def interpret_suricata_states(state) -> Optional[str]:
     """
     There are different states in which a flow can be.
     Suricata distinguishes three flow-states for TCP and two for
@@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_zeek_states(state) -> Optional[str]:
+def interpret_zeek_states(state) -> Optional[str]:
     # We have varius type of states depending on the type of flow.
     # For Zeek
     if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
@@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]:
         return "Established"
 
 
-def check_argus_states(state) -> Optional[str]:
+def interpret_argus_states(state) -> Optional[str]:
     pre = state.split("_")[0]
-    suf = state.split("_")[1]
+    try:
+        suf = state.split("_")[1]
+    except IndexError:
+        return
+
     if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
         """
         Examples:
@@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_tcp_states(state, pkts) -> Optional[str]:
+def interpret_tcp_states(state, pkts) -> Optional[str]:
     pre = state.split("_")[0]
     if "EST" in pre:
         # TCP
@@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]:
         return "Not Established"
 
 
-def check_udp_states(state) -> Optional[str]:
+def interpret_udp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "CON" in pre:
         # UDP
@@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_icmp_states(state) -> Optional[str]:
+def interpret_icmp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "ECO" in pre:
         # ICMP
@@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]:
         return "Established"
 
 
-def get_final_state_from_flags(self, state, pkts) -> str:
+def get_final_state_from_flags(state, pkts) -> str:
     """
-    Analyze the flags given and return a summary of the state.
-    Should work with Argus and Bro flags
-    We receive the pakets to distinguish some Reset connections
+    Converts the original flags from the flow, to a state that slips
+    understands
+    Works with Argus, suricata, and Bro flags
+    We receive the packets to distinguish some Reset connections
     """
-    try:
-        if state := check_suricata_states(state):
-            return state
-        if state := check_zeek_states(state):
-            return state
-        if state := check_argus_states(state):
-            return state
-    except IndexError:
-        # suf does not exist, which means that this is some ICMP or
-        # no response was sent for UDP or TCP
-        if state := check_icmp_states(state):
-            return state
-        if state := check_udp_states(state):
-            return state
-        if state := check_tcp_states(state, pkts):
-            return state
 
-        return "Not Established"
+    for interpreter in (
+        interpret_suricata_states,
+        interpret_zeek_states,
+        interpret_argus_states,
+        interpret_icmp_states,
+        interpret_udp_states,
+    ):
+        if interpreted_state := interpreter(state):
+            return interpreted_state
+
+    if interpreted_state := interpret_tcp_states(state, pkts):
+        return interpreted_state
 
-    except Exception:
-        exception_line = sys.exc_info()[2].tb_lineno
-        self.print(
-            f"Error in get_final_state_from_flags() " f"line {exception_line}",
-            0,
-            1,
-        )
-        self.print(traceback.format_exc(), 0, 1)
+    return "Not Established"

From 26ef89d64d54e0b89815867791b76e31164fc076 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 166/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 150 ++-------------------
 1 file changed, 10 insertions(+), 140 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index c8226368c7..9af514a709 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -122,141 +122,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """
@@ -292,12 +157,17 @@ def process_features(self, dataset):
                 except (ValueError, KeyError):
                     pass
 
-            # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
+            # When flows are read from Slips sqlite,
+            # the state is not transformed to 'Established' or
+            # 'Not Established', it is still 'S0' and others
             # So transform here
-            #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
-            dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
-
-            #dataset.state = new_state_column
+            dataset["state"] = dataset.apply(
+                lambda row: get_final_state_from_flags(
+                    row["state"], row["pkts"]
+                ),
+                axis=1,
+            )
+            # dataset.state = new_state_column
 
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(

From e5902bd3d82d7454fabf81106c7df10f5ca2472f Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 167/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 438 +++++++++++++--------
 1 file changed, 278 insertions(+), 160 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 9af514a709..124ec61f91 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -1,8 +1,3 @@
-# SPDX-FileCopyrightText: 2021 Sebastian Garcia <sebastian.garcia@agents.fel.cvut.cz>
-from typing import Optional
-
-# SPDX-License-Identifier: GPL-2.0-only
-import numpy
 from sklearn.linear_model import SGDClassifier
 from sklearn.preprocessing import StandardScaler
 import pickle
@@ -10,13 +5,10 @@
 import json
 import datetime
 import traceback
-import warnings
 import sys
 
-from slips_files.common.parsers.config_parser import ConfigParser
-from slips_files.common.slips_utils import utils
-from slips_files.common.abstracts.module import IModule
-from slips_files.core.structures.evidence import (
+from slips_files.common.imports import *
+from slips_files.core.evidence_structure.evidence import (
     Evidence,
     ProfileID,
     TimeWindow,
@@ -25,8 +17,7 @@
     EvidenceType,
     IoCType,
     Direction,
-    Victim,
-    Method,
+    IDEACategory,
 )
 
 # Only for debbuging
@@ -38,6 +29,8 @@ def warn(*args, **kwargs):
     pass
 
 
+import warnings
+
 warnings.warn = warn
 
 
@@ -63,8 +56,6 @@ def init(self):
         # self.scores = []
         # The scaler trained during training and to use during testing
         self.scaler = StandardScaler()
-        self.model_path = "./modules/flowmldetection/model.bin"
-        self.scaler_path = "./modules/flowmldetection/scaler.bin"
 
     def read_configuration(self):
         conf = ConfigParser()
@@ -122,6 +113,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """
@@ -130,7 +256,7 @@ def process_features(self, dataset):
         """
         try:
             # Discard some type of flows that dont have ports
-            to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""]
+            to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp"]
             for proto in to_discard:
                 dataset = dataset[dataset.proto != proto]
 
@@ -139,35 +265,28 @@ def process_features(self, dataset):
                 "appproto",
                 "daddr",
                 "saddr",
-                "starttime",
+                "ts",
+                "origstate",
                 "type_",
-                "smac",
-                "dmac",
-                "history",
-                "uid",
                 "dir_",
+                "history",
                 "dbytes",
-                "endtime",
-                "bytes",
-                "flow_source",
+                "dpkts",
+                "smac",
+                "dmac",
             ]
             for field in to_drop:
                 try:
                     dataset = dataset.drop(field, axis=1)
-                except (ValueError, KeyError):
+                except ValueError:
                     pass
 
-            # When flows are read from Slips sqlite,
-            # the state is not transformed to 'Established' or
-            # 'Not Established', it is still 'S0' and others
+            # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
             # So transform here
-            dataset["state"] = dataset.apply(
-                lambda row: get_final_state_from_flags(
-                    row["state"], row["pkts"]
-                ),
-                axis=1,
-            )
-            # dataset.state = new_state_column
+            #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
+            dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
+
+            #dataset.state = new_state_column
 
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(
@@ -201,23 +320,42 @@ def process_features(self, dataset):
             dataset.proto = dataset.proto.str.replace(
                 r"(^.*arp.*$)", "4", regex=True
             )
-            fields_to_convert_to_flow = [
-                dataset.proto,
-                dataset.dport,
-                dataset.sport,
-                dataset.dur,
-                dataset.pkts,
-                dataset.spkts,
-                dataset.allbytes,
-                dataset.sbytes,
-                dataset.state,
-            ]
-            for field in fields_to_convert_to_flow:
-                try:
-                    field = field.astype("float64")
-                except ValueError:
-                    pass
-
+            dataset.proto = dataset.proto.astype("float64")
+            try:
+                # Convert dport to float
+                dataset.dport = dataset.dport.astype("float")
+            except ValueError:
+                pass
+            try:
+                # Convert sport to float
+                dataset.sport = dataset.sport.astype("float")
+            except ValueError:
+                pass
+            try:
+                # Convert Dur to float
+                dataset.dur = dataset.dur.astype("float")
+            except ValueError:
+                pass
+            try:
+                # Convert TotPkts to float
+                dataset.pkts = dataset.pkts.astype("float")
+            except ValueError:
+                pass
+            try:
+                # Convert SrcPkts to float
+                dataset.spkts = dataset.spkts.astype("float")
+            except ValueError:
+                pass
+            try:
+                # Convert TotBytes to float
+                dataset.allbytes = dataset.allbytes.astype("float")
+            except ValueError:
+                pass
+            try:
+                # Convert SrcBytes to float
+                dataset.sbytes = dataset.sbytes.astype("float")
+            except ValueError:
+                pass
             return dataset
         except Exception:
             # Stop the timer
@@ -233,6 +371,7 @@ def process_flows(self):
             # We get all the flows so far
             # because this retraining happens in batches
             flows = self.db.get_all_flows()
+
             # Check how many different labels are in the DB
             # We need both normal and malware
             labels = self.db.get_labels()
@@ -252,7 +391,9 @@ def process_flows(self):
                         "daddr": "40.70.224.145",
                         "dport": "443",
                         "proto": "tcp",
+                        "origstate": "SRPA_SPA",
                         "state": "Established",
+                        "pkts": 84,
                         "allbytes": 42764,
                         "spkts": 37,
                         "sbytes": 25517,
@@ -272,7 +413,9 @@ def process_flows(self):
                         "daddr": "80.242.138.72",
                         "dport": "80",
                         "proto": "tcp",
+                        "origstate": "SRPA_SPA",
                         "state": "Established",
+                        "pkts": 67,
                         "allbytes": 67696,
                         "spkts": 1,
                         "sbytes": 100,
@@ -298,55 +441,42 @@ def process_flows(self):
             self.print("Error in process_flows()")
             self.print(traceback.format_exc(), 0, 1)
 
-    def process_flow(self, flow_to_process: dict):
+    def process_flow(self):
         """
         Process one flow. Only used during detection in testing
-        returns the pandas df with the processed flow
+        Store the pandas df in self.flow
         """
         try:
             # Convert the flow to a pandas dataframe
-            raw_flow = pd.DataFrame(flow_to_process, index=[0])
+            raw_flow = pd.DataFrame(self.flow_dict, index=[0])
+            # Process features
             dflow = self.process_features(raw_flow)
             # Update the flow to the processed version
-            return dflow
+            self.flow = dflow
         except Exception:
             # Stop the timer
             self.print("Error in process_flow()")
             self.print(traceback.format_exc(), 0, 1)
 
-    def detect(self, x_flow) -> Optional[numpy.ndarray]:
+    def detect(self):
         """
-        Detects the given flow with the current model stored
-        and returns the predection array
+        Detect this flow with the current model stored
         """
         try:
-            given_x_flow = x_flow
-            # clean the flow
-            fields_to_drop = [
-                "label",
-                "module_labels",
-                "uid",
-                "history",
-                "dir_",
-                "dbytes",
-                "dpkts",
-                "endtime",
-                "bytes",
-                "flow_source",
-            ]
-            for field in fields_to_drop:
-                try:
-                    x_flow = x_flow.drop(field, axis=1)
-                except (KeyError, ValueError):
-                    pass
+            # Store the real label if there is one
+            y_flow = self.flow["label"]
+            # remove the real label column
+            self.flow = self.flow.drop("label", axis=1)
+            # remove the label predictions column of the other modules
+            X_flow = self.flow.drop("module_labels", axis=1)
             # Scale the flow
-            x_flow: numpy.ndarray = self.scaler.transform(x_flow)
-            pred: numpy.ndarray = self.clf.predict(x_flow)
+            X_flow = self.scaler.transform(X_flow)
+            pred = self.clf.predict(X_flow)
             return pred
-        except Exception as e:
-            self.print(
-                f"Error in detect() while processing " f"\n{given_x_flow}\n{e}"
-            )
+        except Exception:
+            # Stop the timer
+            self.print("Error in detect() X_flow:")
+            self.print(X_flow)
             self.print(traceback.format_exc(), 0, 1)
 
     def store_model(self):
@@ -354,10 +484,10 @@ def store_model(self):
         Store the trained model on disk
         """
         self.print("Storing the trained model and scaler on disk.", 0, 2)
-        with open(self.model_path, "wb") as f:
+        with open("./modules/flowmldetection/model.bin", "wb") as f:
             data = pickle.dumps(self.clf)
             f.write(data)
-        with open(self.scaler_path, "wb") as g:
+        with open("./modules/flowmldetection/scaler.bin", "wb") as g:
             data = pickle.dumps(self.scaler)
             g.write(data)
 
@@ -367,23 +497,20 @@ def read_model(self):
         """
         try:
             self.print("Reading the trained model from disk.", 0, 2)
-            with open(self.model_path, "rb") as f:
+            with open("./modules/flowmldetection/model.bin", "rb") as f:
                 self.clf = pickle.load(f)
             self.print("Reading the trained scaler from disk.", 0, 2)
-            with open(self.scaler_path, "rb") as g:
+            with open("./modules/flowmldetection/scaler.bin", "rb") as g:
                 self.scaler = pickle.load(g)
         except FileNotFoundError:
             # If there is no model, create one empty
-            self.print(
-                "There was no model. " "Creating a new empty model.", 0, 2
-            )
+            self.print("There was no model. Creating a new empty model.", 0, 2)
             self.clf = SGDClassifier(
                 warm_start=True, loss="hinge", penalty="l1"
             )
         except EOFError:
             self.print(
-                "Error reading model from disk. "
-                "Creating a new empty model.",
+                "Error reading model from disk. Creating a new empty model.",
                 0,
                 2,
             )
@@ -391,40 +518,39 @@ def read_model(self):
                 warm_start=True, loss="hinge", penalty="l1"
             )
 
-    def set_evidence_malicious_flow(self, flow: dict, twid: str):
+    def set_evidence_malicious_flow(
+        self,
+        saddr: str,
+        sport: str,
+        daddr: str,
+        dport: str,
+        twid: str,
+        uid: str,
+    ):
         confidence: float = 0.1
+        ip_identification = self.db.get_ip_identification(daddr)
         description = (
-            f"Malicious flow by ML. Src IP"
-            f" {flow['saddr']}:{flow['sport']} to "
-            f"{flow['daddr']}:{flow['dport']}"
+            f"Malicious flow by ML. Src IP {saddr}:{sport} to "
+            f"{daddr}:{dport} {ip_identification}"
         )
 
         timestamp = utils.convert_format(
             datetime.datetime.now(), utils.alerts_format
         )
-        twid_number = int(twid.replace("timewindow", ""))
+
         evidence: Evidence = Evidence(
             evidence_type=EvidenceType.MALICIOUS_FLOW,
             attacker=Attacker(
-                direction=Direction.SRC,
-                attacker_type=IoCType.IP,
-                value=flow["saddr"],
-            ),
-            victim=Victim(
-                direction=Direction.DST,
-                victim_type=IoCType.IP,
-                value=flow["daddr"],
+                direction=Direction.SRC, attacker_type=IoCType.IP, value=saddr
             ),
             threat_level=ThreatLevel.LOW,
             confidence=confidence,
             description=description,
-            profile=ProfileID(ip=flow["saddr"]),
-            timewindow=TimeWindow(twid_number),
-            uid=[flow["uid"]],
+            profile=ProfileID(ip=saddr),
+            timewindow=TimeWindow(number=int(twid.replace("timewindow", ""))),
+            uid=[uid],
             timestamp=timestamp,
-            method=Method.AI,
-            src_port=flow["sport"],
-            dst_port=flow["dport"],
+            category=IDEACategory.ANOMALY_TRAFFIC,
         )
 
         self.db.set_evidence(evidence)
@@ -441,22 +567,20 @@ def pre_main(self):
 
     def main(self):
         if msg := self.get_msg("new_flow"):
-            msg = json.loads(msg["data"])
-            twid = msg["twid"]
-            self.flow = msg["flow"]
-            # these fields are expected in testing. update the original
-            # flow dict to have them
-            self.flow.update(
-                {
-                    "allbytes": (self.flow["sbytes"] + self.flow["dbytes"]),
-                    # the flow["state"] is the origstate, we dont need that here
-                    # we need the interpreted state
-                    "state": msg["interpreted_state"],
-                    "pkts": self.flow["spkts"] + self.flow["dpkts"],
-                    "label": msg["label"],
-                    "module_labels": msg["module_labels"],
-                }
-            )
+            data = msg["data"]
+            # Convert from json to dict
+            data = json.loads(data)
+            profileid = data["profileid"]
+            twid = data["twid"]
+            # Get flow that is now in json format
+            flow = data["flow"]
+            # Convert flow to a dict
+            flow = json.loads(flow)
+            # Convert the common fields to something that can
+            # be interpreted
+            # Get the uid which is the key
+            uid = next(iter(flow))
+            self.flow_dict = json.loads(flow[uid])
 
             if self.mode == "train":
                 # We are training
@@ -469,57 +593,51 @@ def main(self):
                     sum_labeled_flows >= self.minimum_lables_to_retrain
                     and sum_labeled_flows % self.minimum_lables_to_retrain == 1
                 ):
-                    # We get here every 'self.minimum_lables_to_retrain'
-                    # amount of labels
-                    # So for example we retrain every 100 labels and only when
-                    # we have at least 100 labels
+                    # We get here every 'self.minimum_lables_to_retrain' amount of labels
+                    # So for example we retrain every 100 labels and only when we have at least 100 labels
                     self.print(
-                        f"Training the model with the last group of "
-                        f"flows and labels. Total flows: {sum_labeled_flows}."
+                        f"Training the model with the last group of flows and labels. Total flows: {sum_labeled_flows}."
                     )
-                    # Process all flows in the DB and make them ready
-                    # for pandas
+                    # Process all flows in the DB and make them ready for pandas
                     self.process_flows()
                     # Train an algorithm
                     self.train()
             elif self.mode == "test":
                 # We are testing, which means using the model to detect
-                processed_flow = self.process_flow(self.flow)
+                self.process_flow()
 
-                # After processing the flow, it may happen that we
-                # delete icmp/arp/etc so the dataframe can be empty
-                if processed_flow is not None and not processed_flow.empty:
+                # After processing the flow, it may happen that we delete icmp/arp/etc
+                # so the dataframe can be empty
+                if self.flow is not None and not self.flow.empty:
                     # Predict
-                    pred: numpy.ndarray = self.detect(processed_flow)
-                    if not pred:
-                        # an error occurred
-                        return
+                    pred = self.detect()
+                    label = self.flow_dict["label"]
 
-                    label = self.flow["label"]
+                    # Report
                     if label and label != "unknown" and label != pred[0]:
-                        # If the user specified a label in test mode,
-                        # and the label is diff from the prediction,
-                        # print in debug mode
+                        # If the user specified a label in test mode, and the label
+                        # is diff from the prediction, print in debug mode
                         self.print(
-                            f"Report Prediction {pred[0]} for label"
-                            f' {label} flow {self.flow["saddr"]}:'
-                            f'{self.flow["sport"]} ->'
-                            f' {self.flow["daddr"]}:'
-                            f'{self.flow["dport"]}/'
-                            f'{self.flow["proto"]}',
+                            f'Report Prediction {pred[0]} for label {label} flow {self.flow_dict["saddr"]}:'
+                            f'{self.flow_dict["sport"]} -> {self.flow_dict["daddr"]}:'
+                            f'{self.flow_dict["dport"]}/{self.flow_dict["proto"]}',
                             0,
                             3,
                         )
                     if pred[0] == "Malware":
                         # Generate an alert
-                        self.set_evidence_malicious_flow(self.flow, twid)
+                        self.set_evidence_malicious_flow(
+                            self.flow_dict["saddr"],
+                            self.flow_dict["sport"],
+                            self.flow_dict["daddr"],
+                            self.flow_dict["dport"],
+                            twid,
+                            uid,
+                        )
                         self.print(
-                            f"Prediction {pred[0]} for label {label}"
-                            f' flow {self.flow["saddr"]}:'
-                            f'{self.flow["sport"]} -> '
-                            f'{self.flow["daddr"]}:'
-                            f'{self.flow["dport"]}/'
-                            f'{self.flow["proto"]}',
+                            f'Prediction {pred[0]} for label {label} flow {self.flow_dict["saddr"]}:'
+                            f'{self.flow_dict["sport"]} -> {self.flow_dict["daddr"]}:'
+                            f'{self.flow_dict["dport"]}/{self.flow_dict["proto"]}',
                             0,
                             2,
                         )

From e5ee4b746411b114c9a96fc98aa97d130a75faee Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:35:21 +0300
Subject: [PATCH 168/455] state_handler: split get_final_state_from_flags()
 into smaller functions

---
 slips_files/common/state_handler.py | 67 ++++++++++++++++-------------
 1 file changed, 38 insertions(+), 29 deletions(-)

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
index d0a05115bd..b671a09a28 100644
--- a/slips_files/common/state_handler.py
+++ b/slips_files/common/state_handler.py
@@ -1,7 +1,9 @@
 from typing import Optional
+import sys
+import traceback
 
 
-def interpret_suricata_states(state) -> Optional[str]:
+def check_suricata_states(state) -> Optional[str]:
     """
     There are different states in which a flow can be.
     Suricata distinguishes three flow-states for TCP and two for
@@ -16,7 +18,7 @@ def interpret_suricata_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def interpret_zeek_states(state) -> Optional[str]:
+def check_zeek_states(state) -> Optional[str]:
     # We have varius type of states depending on the type of flow.
     # For Zeek
     if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
@@ -25,13 +27,9 @@ def interpret_zeek_states(state) -> Optional[str]:
         return "Established"
 
 
-def interpret_argus_states(state) -> Optional[str]:
+def check_argus_states(state) -> Optional[str]:
     pre = state.split("_")[0]
-    try:
-        suf = state.split("_")[1]
-    except IndexError:
-        return
-
+    suf = state.split("_")[1]
     if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
         """
         Examples:
@@ -88,7 +86,7 @@ def interpret_argus_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def interpret_tcp_states(state, pkts) -> Optional[str]:
+def check_tcp_states(state, pkts) -> Optional[str]:
     pre = state.split("_")[0]
     if "EST" in pre:
         # TCP
@@ -124,7 +122,7 @@ def interpret_tcp_states(state, pkts) -> Optional[str]:
         return "Not Established"
 
 
-def interpret_udp_states(state) -> Optional[str]:
+def check_udp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "CON" in pre:
         # UDP
@@ -136,7 +134,7 @@ def interpret_udp_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def interpret_icmp_states(state) -> Optional[str]:
+def check_icmp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "ECO" in pre:
         # ICMP
@@ -146,25 +144,36 @@ def interpret_icmp_states(state) -> Optional[str]:
         return "Established"
 
 
-def get_final_state_from_flags(state, pkts) -> str:
+def get_final_state_from_flags(self, state, pkts) -> str:
     """
-    Converts the original flags from the flow, to a state that slips
-    understands
-    Works with Argus, suricata, and Bro flags
-    We receive the packets to distinguish some Reset connections
+    Analyze the flags given and return a summary of the state.
+    Should work with Argus and Bro flags
+    We receive the pakets to distinguish some Reset connections
     """
+    try:
+        if state := check_suricata_states(state):
+            return state
+        if state := check_zeek_states(state):
+            return state
+        if state := check_argus_states(state):
+            return state
+    except IndexError:
+        # suf does not exist, which means that this is some ICMP or
+        # no response was sent for UDP or TCP
+        if state := check_icmp_states(state):
+            return state
+        if state := check_udp_states(state):
+            return state
+        if state := check_tcp_states(state, pkts):
+            return state
 
-    for interpreter in (
-        interpret_suricata_states,
-        interpret_zeek_states,
-        interpret_argus_states,
-        interpret_icmp_states,
-        interpret_udp_states,
-    ):
-        if interpreted_state := interpreter(state):
-            return interpreted_state
-
-    if interpreted_state := interpret_tcp_states(state, pkts):
-        return interpreted_state
+        return "Not Established"
 
-    return "Not Established"
+    except Exception:
+        exception_line = sys.exc_info()[2].tb_lineno
+        self.print(
+            f"Error in get_final_state_from_flags() " f"line {exception_line}",
+            0,
+            1,
+        )
+        self.print(traceback.format_exc(), 0, 1)

From 840822e5c232b4b3fefa206b99b331759ff2877d Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:48:16 +0300
Subject: [PATCH 169/455] state_handler: refactor get_final_state_from_flags()

---
 slips_files/common/state_handler.py | 67 +++++++++++++----------------
 1 file changed, 29 insertions(+), 38 deletions(-)

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
index b671a09a28..d0a05115bd 100644
--- a/slips_files/common/state_handler.py
+++ b/slips_files/common/state_handler.py
@@ -1,9 +1,7 @@
 from typing import Optional
-import sys
-import traceback
 
 
-def check_suricata_states(state) -> Optional[str]:
+def interpret_suricata_states(state) -> Optional[str]:
     """
     There are different states in which a flow can be.
     Suricata distinguishes three flow-states for TCP and two for
@@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_zeek_states(state) -> Optional[str]:
+def interpret_zeek_states(state) -> Optional[str]:
     # We have varius type of states depending on the type of flow.
     # For Zeek
     if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
@@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]:
         return "Established"
 
 
-def check_argus_states(state) -> Optional[str]:
+def interpret_argus_states(state) -> Optional[str]:
     pre = state.split("_")[0]
-    suf = state.split("_")[1]
+    try:
+        suf = state.split("_")[1]
+    except IndexError:
+        return
+
     if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
         """
         Examples:
@@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_tcp_states(state, pkts) -> Optional[str]:
+def interpret_tcp_states(state, pkts) -> Optional[str]:
     pre = state.split("_")[0]
     if "EST" in pre:
         # TCP
@@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]:
         return "Not Established"
 
 
-def check_udp_states(state) -> Optional[str]:
+def interpret_udp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "CON" in pre:
         # UDP
@@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_icmp_states(state) -> Optional[str]:
+def interpret_icmp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "ECO" in pre:
         # ICMP
@@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]:
         return "Established"
 
 
-def get_final_state_from_flags(self, state, pkts) -> str:
+def get_final_state_from_flags(state, pkts) -> str:
     """
-    Analyze the flags given and return a summary of the state.
-    Should work with Argus and Bro flags
-    We receive the pakets to distinguish some Reset connections
+    Converts the original flags from the flow, to a state that slips
+    understands
+    Works with Argus, suricata, and Bro flags
+    We receive the packets to distinguish some Reset connections
     """
-    try:
-        if state := check_suricata_states(state):
-            return state
-        if state := check_zeek_states(state):
-            return state
-        if state := check_argus_states(state):
-            return state
-    except IndexError:
-        # suf does not exist, which means that this is some ICMP or
-        # no response was sent for UDP or TCP
-        if state := check_icmp_states(state):
-            return state
-        if state := check_udp_states(state):
-            return state
-        if state := check_tcp_states(state, pkts):
-            return state
 
-        return "Not Established"
+    for interpreter in (
+        interpret_suricata_states,
+        interpret_zeek_states,
+        interpret_argus_states,
+        interpret_icmp_states,
+        interpret_udp_states,
+    ):
+        if interpreted_state := interpreter(state):
+            return interpreted_state
+
+    if interpreted_state := interpret_tcp_states(state, pkts):
+        return interpreted_state
 
-    except Exception:
-        exception_line = sys.exc_info()[2].tb_lineno
-        self.print(
-            f"Error in get_final_state_from_flags() " f"line {exception_line}",
-            0,
-            1,
-        )
-        self.print(traceback.format_exc(), 0, 1)
+    return "Not Established"

From a30b45c3016d25e45c6038b66e25eb155c6a72c3 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 170/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 169 +++------------------
 1 file changed, 19 insertions(+), 150 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 124ec61f91..c57a7a3581 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -5,9 +5,13 @@
 import json
 import datetime
 import traceback
-import sys
+import warnings
+
 
-from slips_files.common.imports import *
+from slips_files.common.state_handler import get_final_state_from_flags
+from slips_files.common.parsers.config_parser import ConfigParser
+from slips_files.common.slips_utils import utils
+from slips_files.common.abstracts.module import IModule
 from slips_files.core.evidence_structure.evidence import (
     Evidence,
     ProfileID,
@@ -29,8 +33,6 @@ def warn(*args, **kwargs):
     pass
 
 
-import warnings
-
 warnings.warn = warn
 
 
@@ -113,141 +115,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """
@@ -281,12 +148,17 @@ def process_features(self, dataset):
                 except ValueError:
                     pass
 
-            # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
+            # When flows are read from Slips sqlite,
+            # the state is not transformed to 'Established' or
+            # 'Not Established', it is still 'S0' and others
             # So transform here
-            #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
-            dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
-
-            #dataset.state = new_state_column
+            dataset["state"] = dataset.apply(
+                lambda row: get_final_state_from_flags(
+                    row["state"], row["pkts"]
+                ),
+                axis=1,
+            )
+            # dataset.state = new_state_column
 
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(
@@ -370,7 +242,7 @@ def process_flows(self):
         try:
             # We get all the flows so far
             # because this retraining happens in batches
-            flows = self.db.get_all_flows()
+            flows: list = self.db.get_all_flows()
 
             # Check how many different labels are in the DB
             # We need both normal and malware
@@ -464,7 +336,7 @@ def detect(self):
         """
         try:
             # Store the real label if there is one
-            y_flow = self.flow["label"]
+            # y_flow = self.flow["label"]
             # remove the real label column
             self.flow = self.flow.drop("label", axis=1)
             # remove the label predictions column of the other modules
@@ -568,13 +440,10 @@ def pre_main(self):
     def main(self):
         if msg := self.get_msg("new_flow"):
             data = msg["data"]
-            # Convert from json to dict
             data = json.loads(data)
-            profileid = data["profileid"]
+            # profileid = data["profileid"]
             twid = data["twid"]
-            # Get flow that is now in json format
             flow = data["flow"]
-            # Convert flow to a dict
             flow = json.loads(flow)
             # Convert the common fields to something that can
             # be interpreted

From bf4c8cf95ca6cfa2d28ca270560e9001fd6f127c Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:36:55 +0200
Subject: [PATCH 171/455] mlflow. Ignore UID column

---
 modules/flowmldetection/flowmldetection.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index c57a7a3581..e2aa1e0ee3 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -141,6 +141,7 @@ def process_features(self, dataset):
                 "dpkts",
                 "smac",
                 "dmac",
+                "uid",
             ]
             for field in to_drop:
                 try:

From 59a109713f00126acf7633e9435156c49b5ec580 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Sat, 15 Mar 2025 19:23:29 +0100
Subject: [PATCH 172/455] Re add function that alya added

---
 slips_files/core/database/database_manager.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py
index b4b2128d3d..e8ca3aaf62 100644
--- a/slips_files/core/database/database_manager.py
+++ b/slips_files/core/database/database_manager.py
@@ -613,6 +613,9 @@ def add_out_dns(self, *args, **kwargs):
     def add_port(self, *args, **kwargs):
         return self.rdb.add_port(*args, **kwargs)
 
+    def get_final_state_from_flags(self, *args, **kwargs):
+        return self.rdb.get_final_state_from_flags(*args, **kwargs)
+
     def add_ips(self, *args, **kwargs):
         return self.rdb.add_ips(*args, **kwargs)
 

From bb582a55c2a8460bcf408204dc175207b2499682 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Sat, 15 Mar 2025 19:27:23 +0100
Subject: [PATCH 173/455] Delete file that was deleted from develop

---
 slips_files/common/state_handler.py | 170 ----------------------------
 1 file changed, 170 deletions(-)
 delete mode 100644 slips_files/common/state_handler.py

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
deleted file mode 100644
index d0a05115bd..0000000000
--- a/slips_files/common/state_handler.py
+++ /dev/null
@@ -1,170 +0,0 @@
-from typing import Optional
-
-
-def interpret_suricata_states(state) -> Optional[str]:
-    """
-    There are different states in which a flow can be.
-    Suricata distinguishes three flow-states for TCP and two for
-     UDP. For TCP,
-    these are: New, Established and Closed,for UDP only new and
-    established.
-    For each of these states Suricata can employ different timeouts.
-    """
-    if "new" in state or "established" in state:
-        return "Established"
-    elif "closed" in state:
-        return "Not Established"
-
-
-def interpret_zeek_states(state) -> Optional[str]:
-    # We have varius type of states depending on the type of flow.
-    # For Zeek
-    if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-        return "Not Established"
-    elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-        return "Established"
-
-
-def interpret_argus_states(state) -> Optional[str]:
-    pre = state.split("_")[0]
-    try:
-        suf = state.split("_")[1]
-    except IndexError:
-        return
-
-    if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-        """
-        Examples:
-        SA_SA
-        SR_SA
-        FSRA_SA
-        SPA_SPA
-        SRA_SPA
-        FSA_FSA
-        FSA_FSPA
-        SAEC_SPA
-        SRPA_SPA
-        FSPA_SPA
-        FSRPA_SPA
-        FSPA_FSPA
-        FSRA_FSPA
-        SRAEC_SPA
-        FSPA_FSRPA
-        FSAEC_FSPA
-        FSRPA_FSPA
-        SRPAEC_SPA
-        FSPAEC_FSPA
-        SRPAEC_FSRPA
-        """
-        return "Established"
-    elif "PA" in pre and "PA" in suf:
-        # Tipical flow that was reported in the middle
-        """
-        Examples:
-        PA_PA
-        FPA_FPA
-        """
-        return "Established"
-    elif "ECO" in pre:
-        return "ICMP Echo"
-    elif "ECR" in pre:
-        return "ICMP Reply"
-    elif "URH" in pre:
-        return "ICMP Host Unreachable"
-    elif "URP" in pre:
-        return "ICMP Port Unreachable"
-    else:
-        """
-        Examples:
-        S_RA
-        S_R
-        A_R
-        S_SA
-        SR_SA
-        FA_FA
-        SR_RA
-        SEC_RA
-        """
-        return "Not Established"
-
-
-def interpret_tcp_states(state, pkts) -> Optional[str]:
-    pre = state.split("_")[0]
-    if "EST" in pre:
-        # TCP
-        return "Established"
-    elif "RST" in pre:
-        # TCP. When -z B is not used in argus, states are single words.
-        # Most connections are reseted when finished and therefore are
-        # established
-        # It can happen that is reseted being not established, but we
-        # can't tell without -z b.
-        # So we use as heuristic the amount of packets. If <=3, then is
-        # not established because the OS retries 3 times.
-        return "Not Established" if int(pkts) <= 3 else "Established"
-    elif "FIN" in pre:
-        # TCP. When -z B is not used in argus, states are single words.
-        # Most connections are finished with FIN when finished and
-        # therefore are established
-        # It can happen that is finished being not established, but we
-        # can't tell without -z b.
-        # So we use as heuristic the amount of packets. If <=3, then is
-        # not established because the OS retries 3 times.
-        return "Not Established" if int(pkts) <= 3 else "Established"
-    else:
-        """
-        Examples:
-        S_
-        FA_
-        PA_
-        FSA_
-        SEC_
-        SRPA_
-        """
-        return "Not Established"
-
-
-def interpret_udp_states(state) -> Optional[str]:
-    pre = state.split("_")[0]
-    if "CON" in pre:
-        # UDP
-        return "Established"
-    elif "INT" in pre:
-        # UDP trying to connect, NOT preciselly not established but also
-        # NOT 'Established'. So we considered not established because there
-        # is no confirmation of what happened.
-        return "Not Established"
-
-
-def interpret_icmp_states(state) -> Optional[str]:
-    pre = state.split("_")[0]
-    if "ECO" in pre:
-        # ICMP
-        return "Established"
-    elif "UNK" in pre:
-        # ICMP6 unknown upper layer
-        return "Established"
-
-
-def get_final_state_from_flags(state, pkts) -> str:
-    """
-    Converts the original flags from the flow, to a state that slips
-    understands
-    Works with Argus, suricata, and Bro flags
-    We receive the packets to distinguish some Reset connections
-    """
-
-    for interpreter in (
-        interpret_suricata_states,
-        interpret_zeek_states,
-        interpret_argus_states,
-        interpret_icmp_states,
-        interpret_udp_states,
-    ):
-        if interpreted_state := interpreter(state):
-            return interpreted_state
-
-    if interpreted_state := interpret_tcp_states(state, pkts):
-        return interpreted_state
-
-    return "Not Established"

From b586ac78776b01465a9476771ccec69b3df635c3 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Sat, 15 Mar 2025 19:32:01 +0100
Subject: [PATCH 174/455] Flowmldetection. Fix missing db reference

---
 modules/flowmldetection/flowmldetection.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index e2aa1e0ee3..9269b67012 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -154,7 +154,7 @@ def process_features(self, dataset):
             # 'Not Established', it is still 'S0' and others
             # So transform here
             dataset["state"] = dataset.apply(
-                lambda row: get_final_state_from_flags(
+                lambda row: self.db.get_final_state_from_flags(
                     row["state"], row["pkts"]
                 ),
                 axis=1,

From 5ccc0dd3da3eb9f31c5b4a2ab5dbdf89e9b32898 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Tue, 18 Mar 2025 12:08:08 +0100
Subject: [PATCH 175/455] Fix the training of flows with ML in new version

---
 modules/flowmldetection/flowmldetection.py | 378 +++++++++++----------
 1 file changed, 197 insertions(+), 181 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 9269b67012..e6ea0b5171 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -1,18 +1,20 @@
+# SPDX-FileCopyrightText: 2021 Sebastian Garcia <sebastian.garcia@agents.fel.cvut.cz>
+from typing import Optional
+
+# SPDX-License-Identifier: GPL-2.0-only
+import numpy
 from sklearn.linear_model import SGDClassifier
 from sklearn.preprocessing import StandardScaler
 import pickle
 import pandas as pd
 import json
-import datetime
 import traceback
 import warnings
 
-
-from slips_files.common.state_handler import get_final_state_from_flags
 from slips_files.common.parsers.config_parser import ConfigParser
 from slips_files.common.slips_utils import utils
 from slips_files.common.abstracts.module import IModule
-from slips_files.core.evidence_structure.evidence import (
+from slips_files.core.structures.evidence import (
     Evidence,
     ProfileID,
     TimeWindow,
@@ -21,7 +23,8 @@
     EvidenceType,
     IoCType,
     Direction,
-    IDEACategory,
+    Victim,
+    Method,
 )
 
 # Only for debbuging
@@ -52,36 +55,41 @@ def init(self):
         # Set the output queue of our database instance
         # Read the configuration
         self.read_configuration()
-        # Minum amount of new lables needed to trigger the train
-        self.minimum_lables_to_retrain = 50
+        # Minimum number of new labels needed to start training
+        self.minimum_labels_to_start_train = 50
+        # Minimum number of new labels needed to retrain
+        self.minimum_labels_to_retrain = 50
+        # The number of flows when last trained
+        self.last_number_of_flows_when_trained = 0
         # To plot the scores of training
         # self.scores = []
         # The scaler trained during training and to use during testing
         self.scaler = StandardScaler()
+        self.model_path = "./modules/flowmldetection/model.bin"
+        self.scaler_path = "./modules/flowmldetection/scaler.bin"
 
     def read_configuration(self):
         conf = ConfigParser()
         self.mode = conf.get_ml_mode()
+        self.label = conf.label()
 
     def train(self):
         """
         Train a model based on the flows we receive and the labels
         """
         try:
-            # Process the labels to have only Normal and Malware
-            self.flows.label = self.flows.label.str.replace(
-                r"(^.*ormal.*$)", "Normal", regex=True
-            )
-            self.flows.label = self.flows.label.str.replace(
-                r"(^.*alware.*$)", "Malware", regex=True
-            )
-            self.flows.label = self.flows.label.str.replace(
-                r"(^.*alicious.*$)", "Malware", regex=True
-            )
+            # Get the flows from the DB
+            # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid)
+            # Convert to pandas df
+            # self.flows = pd.DataFrame(self.flows)
+            # Process the features
+            # X_flow = self.process_features(self.flows)
 
-            # Separate
-            y_flow = self.flows["label"]
+            # Create X_flow with the current flows minus the label
             X_flow = self.flows.drop("label", axis=1)
+            # Create y_flow with the label
+            y_flow = numpy.full(X_flow.shape[0], self.label)
+            # Drop the module_labels
             X_flow = X_flow.drop("module_labels", axis=1)
 
             # Normalize this batch of data so far. This can get progressivle slow
@@ -90,7 +98,7 @@ def train(self):
             # Train
             try:
                 self.clf.partial_fit(
-                    X_flow, y_flow, classes=["Malware", "Normal"]
+                    X_flow, y_flow, classes=["Malicious", "Benign"]
                 )
             except Exception:
                 self.print("Error while calling clf.train()")
@@ -113,7 +121,7 @@ def train(self):
             self.store_model()
 
         except Exception:
-            self.print("Error in train()", 0, 1)
+            self.print("Error in train().", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
@@ -123,7 +131,7 @@ def process_features(self, dataset):
         """
         try:
             # Discard some type of flows that dont have ports
-            to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp"]
+            to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""]
             for proto in to_discard:
                 dataset = dataset[dataset.proto != proto]
 
@@ -132,21 +140,20 @@ def process_features(self, dataset):
                 "appproto",
                 "daddr",
                 "saddr",
-                "ts",
-                "origstate",
+                "starttime",
                 "type_",
-                "dir_",
-                "history",
-                "dbytes",
-                "dpkts",
                 "smac",
                 "dmac",
+                "history",
                 "uid",
+                "dir_",
+                "endtime",
+                "flow_source",
             ]
             for field in to_drop:
                 try:
                     dataset = dataset.drop(field, axis=1)
-                except ValueError:
+                except (ValueError, KeyError):
                     pass
 
             # When flows are read from Slips sqlite,
@@ -155,11 +162,10 @@ def process_features(self, dataset):
             # So transform here
             dataset["state"] = dataset.apply(
                 lambda row: self.db.get_final_state_from_flags(
-                    row["state"], row["pkts"]
+                    row["state"], (row["spkts"] + row["dpkts"])
                 ),
                 axis=1,
             )
-            # dataset.state = new_state_column
 
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(
@@ -193,58 +199,42 @@ def process_features(self, dataset):
             dataset.proto = dataset.proto.str.replace(
                 r"(^.*arp.*$)", "4", regex=True
             )
-            dataset.proto = dataset.proto.astype("float64")
-            try:
-                # Convert dport to float
-                dataset.dport = dataset.dport.astype("float")
-            except ValueError:
-                pass
-            try:
-                # Convert sport to float
-                dataset.sport = dataset.sport.astype("float")
-            except ValueError:
-                pass
-            try:
-                # Convert Dur to float
-                dataset.dur = dataset.dur.astype("float")
-            except ValueError:
-                pass
-            try:
-                # Convert TotPkts to float
-                dataset.pkts = dataset.pkts.astype("float")
-            except ValueError:
-                pass
-            try:
-                # Convert SrcPkts to float
-                dataset.spkts = dataset.spkts.astype("float")
-            except ValueError:
-                pass
-            try:
-                # Convert TotBytes to float
-                dataset.allbytes = dataset.allbytes.astype("float")
-            except ValueError:
-                pass
-            try:
-                # Convert SrcBytes to float
-                dataset.sbytes = dataset.sbytes.astype("float")
-            except ValueError:
-                pass
+
+            dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"]
+            dataset["pkts"] = dataset["spkts"] + dataset["dpkts"]
+
+            fields_to_convert_to_float = [
+                dataset.proto,
+                dataset.dport,
+                dataset.sport,
+                dataset.dur,
+                dataset.pkts,
+                dataset.spkts,
+                dataset.allbytes,
+                dataset.sbytes,
+                dataset.state,
+            ]
+            for field in fields_to_convert_to_float:
+                try:
+                    field = field.astype("float64")
+                except (ValueError, AttributeError):
+                    pass
+
             return dataset
         except Exception:
             # Stop the timer
             self.print("Error in process_features()")
             self.print(traceback.format_exc(), 0, 1)
 
-    def process_flows(self):
+    def process_training_flows(self):
         """
-        Process all the flwos in the DB
+        Process all the flows in the DB
         Store the pandas df in self.flows
         """
         try:
             # We get all the flows so far
             # because this retraining happens in batches
-            flows: list = self.db.get_all_flows()
-
+            flows = self.db.get_all_flows()
             # Check how many different labels are in the DB
             # We need both normal and malware
             labels = self.db.get_labels()
@@ -254,48 +244,48 @@ def process_flows(self):
                 # that are fake but representative of a normal and malware flow
                 # they are only for the training process
                 # At least 1 flow of each label is required
-                # self.print(f'Amount of labeled flows: {labels}', 0, 1)
+
+                # These flows should be in the same format as the ones in the DB,
+                # which means the state is still SF, S0, etc.
                 flows.append(
                     {
-                        "ts": 1594417039.029793,
+                        "starttime": 1594417039.029793,
                         "dur": "1.9424750804901123",
                         "saddr": "10.7.10.101",
                         "sport": "49733",
                         "daddr": "40.70.224.145",
                         "dport": "443",
                         "proto": "tcp",
-                        "origstate": "SRPA_SPA",
-                        "state": "Established",
-                        "pkts": 84,
-                        "allbytes": 42764,
-                        "spkts": 37,
+                        "state": "SF",
+                        "spkts": 17,
+                        "dpkts": 27,
                         "sbytes": 25517,
+                        "dbytes": 17247,
                         "appproto": "ssl",
-                        "label": "Malware",
+                        "label": "Malicious",
                         "module_labels": {
-                            "flowalerts-long-connection": "Malware"
+                            "flowalerts-long-connection": "Malicious"
                         },
                     }
                 )
                 flows.append(
                     {
-                        "ts": 1382355032.706468,
+                        "starttime": 1382355032.706468,
                         "dur": "10.896695",
                         "saddr": "147.32.83.52",
                         "sport": "47956",
                         "daddr": "80.242.138.72",
                         "dport": "80",
                         "proto": "tcp",
-                        "origstate": "SRPA_SPA",
-                        "state": "Established",
-                        "pkts": 67,
-                        "allbytes": 67696,
+                        "state": "SF",
                         "spkts": 1,
+                        "dpkts": 0,
                         "sbytes": 100,
+                        "dbytes": 67596,
                         "appproto": "http",
-                        "label": "Normal",
+                        "label": "Benign",
                         "module_labels": {
-                            "flowalerts-long-connection": "Normal"
+                            "flowalerts-long-connection": "Benign"
                         },
                     }
                 )
@@ -314,42 +304,51 @@ def process_flows(self):
             self.print("Error in process_flows()")
             self.print(traceback.format_exc(), 0, 1)
 
-    def process_flow(self):
+    def process_flow(self, flow_to_process: dict):
         """
         Process one flow. Only used during detection in testing
-        Store the pandas df in self.flow
+        returns the pandas df with the processed flow
         """
         try:
             # Convert the flow to a pandas dataframe
-            raw_flow = pd.DataFrame(self.flow_dict, index=[0])
-            # Process features
+            raw_flow = pd.DataFrame(flow_to_process, index=[0])
             dflow = self.process_features(raw_flow)
             # Update the flow to the processed version
-            self.flow = dflow
+            return dflow
         except Exception:
             # Stop the timer
             self.print("Error in process_flow()")
             self.print(traceback.format_exc(), 0, 1)
 
-    def detect(self):
+    def detect(self, x_flow) -> Optional[numpy.ndarray]:
         """
-        Detect this flow with the current model stored
+        Detects the given flow with the current model stored
+        and returns the prediction array
         """
         try:
-            # Store the real label if there is one
-            # y_flow = self.flow["label"]
-            # remove the real label column
-            self.flow = self.flow.drop("label", axis=1)
-            # remove the label predictions column of the other modules
-            X_flow = self.flow.drop("module_labels", axis=1)
+            # clean the flow
+            fields_to_drop = [
+                "label",
+                "module_labels",
+                "uid",
+                "history",
+                "dir_",
+                "endtime",
+                "flow_source",
+            ]
+            for field in fields_to_drop:
+                try:
+                    x_flow = x_flow.drop(field, axis=1)
+                except (KeyError, ValueError):
+                    pass
             # Scale the flow
-            X_flow = self.scaler.transform(X_flow)
-            pred = self.clf.predict(X_flow)
+            x_flow: numpy.ndarray = self.scaler.transform(x_flow)
+            pred: numpy.ndarray = self.clf.predict(x_flow)
             return pred
-        except Exception:
-            # Stop the timer
-            self.print("Error in detect() X_flow:")
-            self.print(X_flow)
+        except Exception as e:
+            self.print(
+                f"Error in detect() while processing " f"\n{x_flow}\n{e}"
+            )
             self.print(traceback.format_exc(), 0, 1)
 
     def store_model(self):
@@ -357,10 +356,10 @@ def store_model(self):
         Store the trained model on disk
         """
         self.print("Storing the trained model and scaler on disk.", 0, 2)
-        with open("./modules/flowmldetection/model.bin", "wb") as f:
+        with open(self.model_path, "wb") as f:
             data = pickle.dumps(self.clf)
             f.write(data)
-        with open("./modules/flowmldetection/scaler.bin", "wb") as g:
+        with open(self.scaler_path, "wb") as g:
             data = pickle.dumps(self.scaler)
             g.write(data)
 
@@ -370,20 +369,23 @@ def read_model(self):
         """
         try:
             self.print("Reading the trained model from disk.", 0, 2)
-            with open("./modules/flowmldetection/model.bin", "rb") as f:
+            with open(self.model_path, "rb") as f:
                 self.clf = pickle.load(f)
             self.print("Reading the trained scaler from disk.", 0, 2)
-            with open("./modules/flowmldetection/scaler.bin", "rb") as g:
+            with open(self.scaler_path, "rb") as g:
                 self.scaler = pickle.load(g)
         except FileNotFoundError:
             # If there is no model, create one empty
-            self.print("There was no model. Creating a new empty model.", 0, 2)
+            self.print(
+                "There was no model. " "Creating a new empty model.", 0, 2
+            )
             self.clf = SGDClassifier(
                 warm_start=True, loss="hinge", penalty="l1"
             )
         except EOFError:
             self.print(
-                "Error reading model from disk. Creating a new empty model.",
+                "Error reading model from disk. "
+                "Creating a new empty model.",
                 0,
                 2,
             )
@@ -391,39 +393,36 @@ def read_model(self):
                 warm_start=True, loss="hinge", penalty="l1"
             )
 
-    def set_evidence_malicious_flow(
-        self,
-        saddr: str,
-        sport: str,
-        daddr: str,
-        dport: str,
-        twid: str,
-        uid: str,
-    ):
+    def set_evidence_malicious_flow(self, flow: dict, twid: str):
         confidence: float = 0.1
-        ip_identification = self.db.get_ip_identification(daddr)
         description = (
-            f"Malicious flow by ML. Src IP {saddr}:{sport} to "
-            f"{daddr}:{dport} {ip_identification}"
-        )
-
-        timestamp = utils.convert_format(
-            datetime.datetime.now(), utils.alerts_format
+            f"Flow with malicious characteristics by ML. Src IP"
+            f" {flow['saddr']}:{flow['sport']} to "
+            f"{flow['daddr']}:{flow['dport']}"
         )
-
+        twid_number = int(twid.replace("timewindow", ""))
         evidence: Evidence = Evidence(
             evidence_type=EvidenceType.MALICIOUS_FLOW,
             attacker=Attacker(
-                direction=Direction.SRC, attacker_type=IoCType.IP, value=saddr
+                direction=Direction.SRC,
+                ioc_type=IoCType.IP,
+                value=flow["saddr"],
+            ),
+            victim=Victim(
+                direction=Direction.DST,
+                ioc_type=IoCType.IP,
+                value=flow["daddr"],
             ),
             threat_level=ThreatLevel.LOW,
             confidence=confidence,
             description=description,
-            profile=ProfileID(ip=saddr),
-            timewindow=TimeWindow(number=int(twid.replace("timewindow", ""))),
-            uid=[uid],
-            timestamp=timestamp,
-            category=IDEACategory.ANOMALY_TRAFFIC,
+            profile=ProfileID(ip=flow["saddr"]),
+            timewindow=TimeWindow(twid_number),
+            uid=[flow["uid"]],
+            timestamp=flow["starttime"],
+            method=Method.AI,
+            src_port=flow["sport"],
+            dst_port=flow["dport"],
         )
 
         self.db.set_evidence(evidence)
@@ -440,17 +439,20 @@ def pre_main(self):
 
     def main(self):
         if msg := self.get_msg("new_flow"):
-            data = msg["data"]
-            data = json.loads(data)
-            # profileid = data["profileid"]
-            twid = data["twid"]
-            flow = data["flow"]
-            flow = json.loads(flow)
-            # Convert the common fields to something that can
-            # be interpreted
-            # Get the uid which is the key
-            uid = next(iter(flow))
-            self.flow_dict = json.loads(flow[uid])
+            # When a new flow arrives
+            msg = json.loads(msg["data"])
+            self.twid = msg["twid"]
+            self.profileid = msg["profileid"]
+            self.flow = msg["flow"]
+            # These following extra fields are expected in testing. update the original
+            # flow dict to have them
+            self.flow.update(
+                {
+                    "state": msg["interpreted_state"],
+                    "label": msg["label"],
+                    "module_labels": msg["module_labels"],
+                }
+            )
 
             if self.mode == "train":
                 # We are training
@@ -459,55 +461,69 @@ def main(self):
                 # Use labeled flows
                 labels = self.db.get_labels()
                 sum_labeled_flows = sum(i[1] for i in labels)
+
+                # The min labels to retrain is the min number of flows 
+                # we should have seen so far in this capture to start training
+                # This is so we dont _start_ training with only 1 flow
+
+                # Once we are over the start minimum, the second condition is 
+                # to force to retrain every a minimum_labels_to_retrain number
+                # of flows. So we dont retrain every 1 flow.
                 if (
-                    sum_labeled_flows >= self.minimum_lables_to_retrain
-                    and sum_labeled_flows % self.minimum_lables_to_retrain == 1
+                    sum_labeled_flows >= self.minimum_labels_to_start_train
                 ):
-                    # We get here every 'self.minimum_lables_to_retrain' amount of labels
-                    # So for example we retrain every 100 labels and only when we have at least 100 labels
-                    self.print(
-                        f"Training the model with the last group of flows and labels. Total flows: {sum_labeled_flows}."
-                    )
-                    # Process all flows in the DB and make them ready for pandas
-                    self.process_flows()
-                    # Train an algorithm
-                    self.train()
+                    if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain):
+                        # So for example we retrain every 50 labels and only when
+                        # we have at least 50 labels
+                        self.print(
+                            f"Training the model with the last group of "
+                            f"flows and labels. Total flows: {sum_labeled_flows}."
+                        )
+                        # Process all flows in the DB and make them ready
+                        # for pandas
+                        self.process_training_flows()
+                        # Train an algorithm
+                        self.train()
+                        self.last_number_of_flows_when_trained = sum_labeled_flows
+
             elif self.mode == "test":
                 # We are testing, which means using the model to detect
-                self.process_flow()
+                processed_flow = self.process_flow(self.flow)
 
-                # After processing the flow, it may happen that we delete icmp/arp/etc
-                # so the dataframe can be empty
-                if self.flow is not None and not self.flow.empty:
+                # After processing the flow, it may happen that we
+                # delete icmp/arp/etc so the dataframe can be empty
+                if processed_flow is not None and not processed_flow.empty:
                     # Predict
-                    pred = self.detect()
-                    label = self.flow_dict["label"]
+                    pred: numpy.ndarray = self.detect(processed_flow)
+                    if not pred:
+                        # an error occurred
+                        return
 
-                    # Report
+                    label = self.flow["label"]
                     if label and label != "unknown" and label != pred[0]:
-                        # If the user specified a label in test mode, and the label
-                        # is diff from the prediction, print in debug mode
+                        # If the user specified a label in test mode,
+                        # and the label is diff from the prediction,
+                        # print in debug mode
                         self.print(
-                            f'Report Prediction {pred[0]} for label {label} flow {self.flow_dict["saddr"]}:'
-                            f'{self.flow_dict["sport"]} -> {self.flow_dict["daddr"]}:'
-                            f'{self.flow_dict["dport"]}/{self.flow_dict["proto"]}',
+                            f"Predicted {pred[0]} for ground-truth label"
+                            f' {label}. Flow {self.flow["saddr"]}:'
+                            f'{self.flow["sport"]} ->'
+                            f' {self.flow["daddr"]}:'
+                            f'{self.flow["dport"]}/'
+                            f'{self.flow["proto"]}',
                             0,
                             3,
                         )
-                    if pred[0] == "Malware":
+                    if pred[0] == "Malicious":
                         # Generate an alert
-                        self.set_evidence_malicious_flow(
-                            self.flow_dict["saddr"],
-                            self.flow_dict["sport"],
-                            self.flow_dict["daddr"],
-                            self.flow_dict["dport"],
-                            twid,
-                            uid,
-                        )
+                        self.set_evidence_malicious_flow(self.flow, self.twid)
                         self.print(
-                            f'Prediction {pred[0]} for label {label} flow {self.flow_dict["saddr"]}:'
-                            f'{self.flow_dict["sport"]} -> {self.flow_dict["daddr"]}:'
-                            f'{self.flow_dict["dport"]}/{self.flow_dict["proto"]}',
+                            f"Prediction {pred[0]} for label {label}"
+                            f' flow {self.flow["saddr"]}:'
+                            f'{self.flow["sport"]} -> '
+                            f'{self.flow["daddr"]}:'
+                            f'{self.flow["dport"]}/'
+                            f'{self.flow["proto"]}',
                             0,
                             2,
                         )

From fe91a3c6a427b86f3957864dcdea67a52b7a861d Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 19 Mar 2025 14:22:38 +0100
Subject: [PATCH 176/455] flowml. If the dataset has one flow and that is
 deleted, then return empty fast.

---
 modules/flowmldetection/flowmldetection.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index e6ea0b5171..0fa1e4d767 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -135,6 +135,11 @@ def process_features(self, dataset):
             for proto in to_discard:
                 dataset = dataset[dataset.proto != proto]
 
+            # If te proto is in the list to delete and there is only one flow, then the dataset will be empty
+            if dataset.empty:
+                # DataFrame is empty now, so return empty
+                return dataset
+
             # For now, discard these
             to_drop = [
                 "appproto",

From 31d8b921d59719a665de7b0195eeac37e2ad7d81 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 19 Mar 2025 14:23:05 +0100
Subject: [PATCH 177/455] flowml. If the dataset is empty, return None

---
 modules/flowmldetection/flowmldetection.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 0fa1e4d767..5c5f9943f1 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -318,6 +318,8 @@ def process_flow(self, flow_to_process: dict):
             # Convert the flow to a pandas dataframe
             raw_flow = pd.DataFrame(flow_to_process, index=[0])
             dflow = self.process_features(raw_flow)
+            if dflow.empty:
+                return None
             # Update the flow to the processed version
             return dflow
         except Exception:

From 689b570abe330277d9af665e0d99b6ae2354d384 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 19 Mar 2025 14:27:16 +0100
Subject: [PATCH 178/455] First new version of the model and scaler. Not good
 yet, but working.

---
 modules/flowmldetection/model.bin  | Bin 1124 -> 1090 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index a6648cf72179520975b0e9ad1164f7d574e87140..7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f 100644
GIT binary patch
delta 130
zcmV-|0Db@D2*L;ifCQB{u>>gtTsw(FScJ$7xocP<K++%n9U}QQz-#d@sqf+^z>4|5
zn$$WZz^$}67u&zYKfDP4vYKo~K<PaAW)J{HKp5zr*LvX7zXT;Xi&zvCK#B|cdeNaM
kK-{i#qAdnGK$EKjD+pMWFI7TSSU{8B162Zxn3FXGFXLA@Pyhe`

delta 131
zcmV-}0DS+#2;>L^fCQCUu>>gtlhXnvSy-jcOc_je6TteGguV+y7r>O2=ii6N`@aE8
z4q<hm<v;E**(1biC&1;CKfROVA3%9ur;SK<>_05T(v)g91VHfmFeIMvRKFpJJ~89v
lBES;fQwQSX>_3x<11kt29Z*;7CnUg=yaQDN?(vfo1TSe6JFWl#

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
index 17115724b9536f6093f9d72f3b58a5c22c562a9a..bfba4d107224e5e6e5a1e8c8f4d463b48131d111 100644
GIT binary patch
delta 290
zcmV+-0p0%k2KolDvjGBX0h6@>Pgl=^Tj5WVAwXGOhP~3C<3D)c8d<%4*gy{6w?@|U
z%s@r+gj&Fu5J2_kY{!Qn^*`?T5Tx7$dq73=gj&E@c0d))6KVlt!ao`IN`KC-h(JOu
zU`*oR6_Z;6CRfo>BRz3ectB?@J_D&+&OhD+nN>A{w?PryCGc<cwm~#B-^|8$dO)gh
z=RSTUXFv*d?raebbV0J?)%52KY(b1c9N@Tpzdv9l8J2bKf<dkRFcK&rl9Sj1C|3#7
zOux9hGeC5AQV0+$-#-%10%-4K(?Bp(=SPMr(?B|Mf3GT(G(g2@E?by3D?owf0UW{U
ou|UJ?4^Yqbtw7cLsJ&lu)<1f`V88dAxIjbzY_OOIL6cVlBeWBRwg3PC

delta 290
zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG
zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa
zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r-wJEWTtnm`Ac
zf^|*Gc0elJHxDoXo<X&kLG{%4&p`{bHVfI|sz2Uv$hvzr(m{gu&Jr}1sgu|OC|8h9
z&D_B~i9nx5TWFn3t3OFIt}1{vt3Z{LgvGX?u|R!8|2|A?L_o(e@1J}aGeCBvuU^0C
o#Xz85;5X{~-$0+hirZ#X%|CGFfPfMb;Xu)C9icy+Op{jwBjCh|6aWAK


From b0dd41875bad025a20248672f9c040d8f3ec8d71 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Thu, 20 Mar 2025 13:16:06 +0100
Subject: [PATCH 179/455] model and scaler with 1 malicious and 1 benign

---
 modules/flowmldetection/model.bin  | Bin 1090 -> 1124 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index 7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f..0fac693b39f8e2f0e826471e72a52010709a2a4a 100644
GIT binary patch
delta 132
zcmX@a@q~k=fn{psMix!x$(NZ_BO`RCb>5jSY<wU!LGn&}gUNx$3tLYnGl@A&e4S}J
zBSY*!uI-OaTr&&~Tw#uuJJhD?&}O=|yn5R|`<q--?Rc;A9e69=F=gK7d-lipbZtG?
j>K$N!g2|oC+8oRs(l<p`Iv<$4hdGq_%F%|&A}o3UHQY43

delta 131
zcmV-}0DS-C2*L;ifCQB{u>>gtTsw(FScJ$7xocP<K++%n9U}QQz-#d@sqf+^z>4|5
zn$$WZz^$}67u&zYKfDP4vYKo~K<PaAW)J{HKp5zr*LvX7zXT;Xi&zvCK#B|cdeNaM
lK-{i#qAdnGK$EKjD+pMWFI7TSSU{8B162cym;jSD1TGrII8guq

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
index bfba4d107224e5e6e5a1e8c8f4d463b48131d111..758909b289238ff282b2e056a9b3e83768b8472a 100644
GIT binary patch
delta 290
zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG
zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa
zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r!tJEWTtnm`Ac
zf^|*Gc0elJHxDoXo<X#jLG{%4&p`{bHVfI|sz2Ot$hvzr(m{gu&Jr}1sgu|OC|8h9
z&D_B~i9nx5TWFn3t3OFIt}1{vt3Z{LgvGX?u|Rx7|2|A?L_o(e@1J}aGeCBvuU^0C
o#Xz54;5X{~-$0+hirZ#X%|CDEfPfMb;Xu)C9icy+Op{jwBi%8H3jhEB

delta 290
zcmV+-0p0%k2KolDvjGBX0h6@>Pgl=^Tj5WVAwXGOhP~3C<3D)c8d<%4*gy{6w?@|U
z%s@r+gj&Fu5J2_kY{!Qn^*`?T5Tx7$dq73=gj&E@c0d))6KVlt!ao`IN`KC-h(JOu
zU`*oR6_Z;6CRfo>BRz3ectB?@J_D&+&OhD+nN>A{w?PryCGc<cwm~#B-^|8$dO)gh
z=RSTUXFv*d?raebbV0J?)%52KY(b1c9N@Tpzdv9l8J2bKf<dkRFcK&rl9Sj1C|3#7
zOux9hGeC5AQV0+$-#-%10%-4K(?Bp(=SPMr(?B|Mf3GT(G(g2@E?by3D?owf0UW{U
ou|UJ?4^Yqbtw7cLsJ&lu)<1f`V88dAxIjbzY_OOIL6cVlBeWBRwg3PC


From 8f620155cdebb7d5795ef33c8051431504ccca39 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Thu, 20 Mar 2025 13:16:27 +0100
Subject: [PATCH 180/455] cleaner jupyter

---
 modules/flowmldetection/flowmldetection.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 5c5f9943f1..fe950ed4bb 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -343,6 +343,23 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
                 "endtime",
                 "flow_source",
             ]
+            # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes.
+            # Error
+            ''' [Flow ML Detection] Error in detect() while processing                                                                                                                                                                                                                         
+            dur proto  sport dport  state  pkts  spkts  dpkts  bytes  sbytes  dbytes  allbytes                                                                                                                                                                                    
+            0  63.822830     0  56119   981    0.0    15     15      0   8764    1887       0      1887                                                                                                                                                                                    
+            The feature names should match those that were passed during fit.                                                                                                                                                                                                              
+            Feature names unseen at fit time:                                                                                                                                                                                                                                              
+            - bytes     
+            '''
+
+            # IF we delete here the filed bytes the error is
+            # [Flow ML Detection] Error in detect() while processing 
+            # dur proto sport dport  state  pkts  spkts  dpkts  sbytes  dbytes allbytes                                                                                                                                                             
+            # 0  63.822830     0  56120   980    0.0    15     15      0    1887       0      1887                                                                                                                                                             
+            # The feature names should match those that were passed during fit.                                                                                                                                                                                
+            # Feature names must be in the same order as they were in fit.                                                                                                                                                                                     
+                                                                                                                                                                                                                                                 
             for field in fields_to_drop:
                 try:
                     x_flow = x_flow.drop(field, axis=1)

From fba965a9409ff15bbb4ed677fe658f85c1b1b02a Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Thu, 20 Mar 2025 22:26:27 +0100
Subject: [PATCH 181/455] New models after 3rd train

---
 modules/flowmldetection/model.bin  | Bin 1124 -> 1124 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index 0fac693b39f8e2f0e826471e72a52010709a2a4a..5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8 100644
GIT binary patch
delta 99
zcmaFD@q}YTFtfkevYaFSxkd-_3;rCozh!)2lYmpEbEdGvA?<Hh%FhcQcr$C8Y3pPC
z18<ZS-x*q|IJlZU`}#BRi@kOC+S)TqI1eyD!9BgtJUbI~Cof{2$owSaz+^KPJplbX
BD~|vG

delta 99
zcmV-p0G$8i2;>N`Qv**dmWkY#6EnbNU>V$pfHc5?(6z~90v14#>uEKZXcoY2J@~Q;
zm@vT916v-#h9y9THMMtpw*Eia3Y9($*ABq#7l@Rev)(_(4lO-NtuB+I1CayN#ekDG
F1TN2UERg^J

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
index 758909b289238ff282b2e056a9b3e83768b8472a..821344a0c69d116622b02e2a0daa1554cb5d308e 100644
GIT binary patch
delta 43
zcmV+`0M!5b2KolDfdU!c4zU>a=p#W80?-ZtN@qVmt(3Z<B4<G|lbQl%0g;md19^W6
B5HkP(

delta 43
zcmV+`0M!5b2KolDfdU!a4zU>a=p#W80?-ZtN@qVmt(3Z<B4<G}lbQl%0g#gc19^VR
B5HSD%


From 27fcc670e4f290b9eabfa110b7418e6fd564c2aa Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 26 Mar 2025 00:08:50 +0100
Subject: [PATCH 182/455] Models after 4th train

---
 modules/flowmldetection/model.bin  | Bin 1124 -> 1124 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index 5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8..3ab5a240bb45f88d026d1d9d1959cfa384e2473b 100644
GIT binary patch
delta 120
zcmV-;0EhqN2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv*
zM<l=vT(CiV7%o61L>A;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P
as{<<t5IC<!D(NIZlimYX0{noJHUuueR5UOE

delta 120
zcmaFD@q}YTFmr_2vYaFSxkd-_3;rCozh!)2lYmpEbEdGvA?<Hh%FhcQcr$C8Y3pPC
z18<ZS-x*q|IJlZU`}#BRi@kOC+S)TqI1eyD!9BgtJUbI~;rz*~nYB5XJEU)ltaLsw
P`5to^Q^<kIW-NLDlQb|V

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
index 821344a0c69d116622b02e2a0daa1554cb5d308e..29df65342047c5a499ee3f8e602d1f47cb7e9fca 100644
GIT binary patch
delta 290
zcmV+-0p0%k2KolDvjGB90h6@>PghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB
z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m
z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A
za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o
zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9
ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}Dxj<tAT~->ML6cVlBO2|D9RL6T

delta 290
zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG
zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa
zjm~^LK$BYnCRg4Lu^9H~BS8=X&<+7gXFor!l)9lJXF)RTPqWf%b3r!tJEWTtnm`Ac
zf^|*Gc0elJHxDoXo<X#jLG{%4&p`{bHVfI|sz2Ot$hvzr(m{gu&Jr}1sgu|OC|8kA
z&D_B~i9nx5TWFn3t3OFIt}1{vt3Z{LgvGX?u|Rx7|2|A?L_o(e@1J}aGeCBvuU^0C
o#Xz54;5X{~-$0+hirZ#X%|CDEfPfMb;Xu)C9icy+Op{jwBi>ty4FCWD


From 844a04314f76516c8ec2afaf8c3cc040955c62a2 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 26 Mar 2025 08:28:59 +0100
Subject: [PATCH 183/455] Models of ml flow with the first good performance in
 small tests

---
 modules/flowmldetection/model.bin  | Bin 1124 -> 1124 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index 3ab5a240bb45f88d026d1d9d1959cfa384e2473b..a6648cf72179520975b0e9ad1164f7d574e87140 100644
GIT binary patch
delta 121
zcmV-<0EYkM2;>N`Qv+C~&P*9hb`!w*mV~|wLl?l5mFM4w$NRqlOAcXmpyfaAG1();
zYA3+ulRv$a;~zkIU#E>ocI-ba#L|>%Hv~ZN`Y<G(@KnDch(0mn<s!fm-ctwS<Lp0^
bs{<<tAstXx>?b6^limYW1McwvlQsk{8#y@u

delta 121
zcmV-<0EYkM2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv*
zM<l=vT(CiV7%o61L>A;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P
bs{<<t5IC<!D(NIZlimYW1LXXGlQsk{_eeDq

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
index 29df65342047c5a499ee3f8e602d1f47cb7e9fca..17115724b9536f6093f9d72f3b58a5c22c562a9a 100644
GIT binary patch
delta 290
zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG
zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa
zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r-wJEWTtnm`Ac
zf^|*Gc0elJHxDoXo<X&kLG{%4&p`{bHVfI|sz2Uv$hvzr(m{gu&Jr}1sgu|OC|8h9
z&D_B~i9nx5TWFn3t3OFIt}1{vt3Z{LgvGX?u|R!8|2|A?L_o(e@1J}aGeCBvuU^0C
o#Xz85;5X{~-$0+hirZ#X%|CGFfPfMb;Xu)C9icy+Op{jwBjCh|6aWAK

delta 290
zcmV+-0p0%k2KolDvjGB90h6@>PghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB
z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m
z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A
za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o
zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9
ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}Dxj<tAT~->ML6cVlBO2|D9RL6T


From 70c222ea6b8661e903dfc4ae93855d8ee2614ca5 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:49:23 +0000
Subject: [PATCH 184/455] Add plot for flowml train scores

---
 modules/flowmldetection/plot_train_score.py | 56 +++++++++++++++++++++
 1 file changed, 56 insertions(+)
 create mode 100644 modules/flowmldetection/plot_train_score.py

diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py
new file mode 100644
index 0000000000..0b5b5b72ba
--- /dev/null
+++ b/modules/flowmldetection/plot_train_score.py
@@ -0,0 +1,56 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+import re
+import sys
+
+def plot_log_data(file_path):
+    # Read the log data from the file
+    with open(file_path, 'r') as file:
+        log_data = file.read()
+
+    # Define regex pattern to extract relevant data from each line
+    pattern = r"Background: (\d+). Benign: (\d+). Malicious: (\d+). Total labels: (\d+\.\d+). Score: (\d+\.\d+)"
+
+    # Parse the log file
+    data = re.findall(pattern, log_data)
+
+    # Convert data to a DataFrame
+    df = pd.DataFrame(data, columns=["Background", "Benign", "Malicious", "Total labels", "Score"])
+    df = df.astype({
+        "Background": int,
+        "Benign": int,
+        "Malicious": int,
+        "Total labels": float,
+        "Score": float
+    })
+
+    # Plotting the values
+    fig, ax1 = plt.subplots(figsize=(10, 6))
+
+    # Plotting Score on the left y-axis
+    ax1.plot(df.index, df["Score"], label="Score", color='tab:blue')
+    ax1.set_xlabel('Index')
+    ax1.set_ylabel('Score', color='tab:blue')
+    ax1.tick_params(axis='y', labelcolor='tab:blue')
+
+    # Create the second y-axis for the Total labels
+    ax2 = ax1.twinx()
+    ax2.plot(df.index, df["Total labels"], label="Total labels", color='tab:red')
+    ax2.set_ylabel('Total labels', color='tab:red')
+    ax2.tick_params(axis='y', labelcolor='tab:red')
+
+    # Adding title and legend
+    plt.title('Log Data Visualization')
+    fig.tight_layout()
+
+    # Save plot to a PNG file
+    plt.savefig('log_data_plot_with_two_scales.png')
+
+    # Display the plot
+    plt.show()
+
+# Make sure the file path is passed as an argument
+if len(sys.argv) < 2:
+    print("Please provide the path to the log file as a parameter.")
+else:
+    plot_log_data(sys.argv[1])

From a721639f4e90e0db5e9464b7fda27454e305ab5f Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:50:04 +0000
Subject: [PATCH 185/455] Add a log file to store the training data output

---
 modules/flowmldetection/flowmldetection.py | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index fe950ed4bb..60217ada28 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -68,12 +68,29 @@ def init(self):
         self.model_path = "./modules/flowmldetection/model.bin"
         self.scaler_path = "./modules/flowmldetection/scaler.bin"
 
+        # Initialize the training log file
+        self.training_log_path = "./modules/flowmldetection/training.log"
+        with open(self.training_log_path, "w") as log_file:
+            log_file.write("Training Log Initialized\n")
+
     def read_configuration(self):
         conf = ConfigParser()
         self.mode = conf.get_ml_mode()
+        # This is the global label in the configuration,
+        # in case the flows do not have a label themselves
         self.label = conf.label()
 
-    def train(self):
+    def write_to_training_log(self, message: str):
+        """
+        Write a message to the training log file.
+        """
+        try:
+            with open(self.training_log_path, "a") as log_file:
+                log_file.write(message + "\n")
+        except Exception as e:
+            self.print(f"Error writing to training log: {e}", 0, 1)
+
+    def train(self, sum_labeled_flows):
         """
         Train a model based on the flows we receive and the labels
         """

From 2d65486fa55caae847d9cfb709e8aedf57b2b7d6 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:50:32 +0000
Subject: [PATCH 186/455] Store data in the log file of training

---
 modules/flowmldetection/flowmldetection.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 60217ada28..6f732da636 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -137,9 +137,13 @@ def train(self, sum_labeled_flows):
             # Store the models on disk
             self.store_model()
 
+            # Log training information
+            self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}")
+            #self.write_to_training_log(f"Model parameters: {self.clf.coef_}")
         except Exception:
             self.print("Error in train().", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+            self.write_to_training_log("Error occurred during training.")
 
     def process_features(self, dataset):
         """

From b0324a55a34f5e2f5780bfb755863fbe6662dcc7 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:50:53 +0000
Subject: [PATCH 187/455] better comments

---
 modules/flowmldetection/flowmldetection.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 6f732da636..ed3aecf1b0 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -59,10 +59,9 @@ def init(self):
         self.minimum_labels_to_start_train = 50
         # Minum amount of new labels needed to retrain
         self.minimum_labels_to_retrain = 50
-        # The number of flows when last trained
+        # The number of flows when last trained. Used internally only to know
+        # when to retrain
         self.last_number_of_flows_when_trained = 0
-        # To plot the scores of training
-        # self.scores = []
         # The scaler trained during training and to use during testing
         self.scaler = StandardScaler()
         self.model_path = "./modules/flowmldetection/model.bin"

From 1e91a10fa051a06cb27ebf5e9e0c505fe4210f32 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:51:30 +0000
Subject: [PATCH 188/455] Fix issue not dropping detailed labels

---
 modules/flowmldetection/flowmldetection.py | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index ed3aecf1b0..25b30cf515 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -94,23 +94,19 @@ def train(self, sum_labeled_flows):
         Train a model based on the flows we receive and the labels
         """
         try:
-            # Get the flows from the DB
-            # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid)
-            # Convert to pandas df
-            # self.flows = pd.DataFrame(self.flows)
-            # Process the features
-            # X_flow = self.process_features(self.flows)
-
             # Create X_flow with the current flows minus the label
-            X_flow = self.flows.drop("label", axis=1)
-            # Create y_flow with the label
-            y_flow = numpy.full(X_flow.shape[0], self.label)
+            X_flow = self.flows.drop("ground_truth_label", axis=1)
+            # Drop the detailed labels
+            X_flow = X_flow.drop("detailed_ground_truth_label", axis=1)
             # Drop the module_labels
             X_flow = X_flow.drop("module_labels", axis=1)
+            # Create y_flow with the label
+            y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label)
 
             # Normalize this batch of data so far. This can get progressivle slow
             X_flow = self.scaler.fit_transform(X_flow)
 
+
             # Train
             try:
                 self.clf.partial_fit(

From d97a4ddb3e8af4bee1cbe98d980e55fe5b8f8139 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:51:53 +0000
Subject: [PATCH 189/455] Fix issue that not all labels were given to the
 partial fit

---
 modules/flowmldetection/flowmldetection.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 25b30cf515..b2d0db5e51 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -109,8 +109,9 @@ def train(self, sum_labeled_flows):
 
             # Train
             try:
+                # Online incremental learning
                 self.clf.partial_fit(
-                    X_flow, y_flow, classes=["Malicious", "Benign"]
+                    X_flow, y_flow, classes=["Background", "Malicious", "Benign"]
                 )
             except Exception:
                 self.print("Error while calling clf.train()")

From 10560192bfae39975002f518114f03ad2d56ed83 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:52:08 +0000
Subject: [PATCH 190/455] count partial labels in this epoch

---
 modules/flowmldetection/flowmldetection.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index b2d0db5e51..1146091a92 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -106,6 +106,12 @@ def train(self, sum_labeled_flows):
             # Normalize this batch of data so far. This can get progressivle slow
             X_flow = self.scaler.fit_transform(X_flow)
 
+            # Count the number of labels of each type in this epoc
+            epoch_label_counts = {
+                "Background": (y_flow == "Background").sum(),
+                "Malicious": (y_flow == "Malicious").sum(),
+                "Benign": (y_flow == "Benign").sum(),
+            }
 
             # Train
             try:

From 2a61b4608e234655f284cac29951f33c756bc7f9 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:55:09 +0000
Subject: [PATCH 191/455] Dont print training in screen

---
 modules/flowmldetection/flowmldetection.py | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 1146091a92..4bb2ad7dbf 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -126,15 +126,8 @@ def train(self, sum_labeled_flows):
             # See score so far in training
             score = self.clf.score(X_flow, y_flow)
 
-            # To debug the training score
-            # self.scores.append(score)
-
-            self.print(f"	Training Score: {score}", 0, 1)
-            # self.print(f'    Model Parameters: {self.clf.coef_}')
-
-            # Debug code to store a plot in a png of the scores
-            # plt.plot(self.scores)
-            # plt.savefig('train-scores.png')
+            #self.print(f"	Training Score: {score}", 1, 0)
+            #self.print(f'    Model Parameters: {self.clf.coef_}', 1, 0)
 
             # Store the models on disk
             self.store_model()

From eef7992b26c5e8ff0db0ec8c14ce9bd3064f7fd6 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:55:28 +0000
Subject: [PATCH 192/455] Add function to write to train log

---
 modules/flowmldetection/flowmldetection.py | 28 +++++++++++-----------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 4bb2ad7dbf..d4b2762f5f 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -247,28 +247,28 @@ def process_features(self, dataset):
             self.print("Error in process_features()")
             self.print(traceback.format_exc(), 0, 1)
 
-    def process_training_flows(self):
+    def process_training_flows(self, last_number_of_flows_when_trained):
         """
-        Process all the flows in the DB
+        Process only the new flows in the DB since the last training.
         Store the pandas df in self.flows
         """
         try:
+            # Ensure the index is an integer
+            if last_number_of_flows_when_trained is None:
+                last_number_of_flows_when_trained = 0
+            else:
+                last_number_of_flows_when_trained = int(last_number_of_flows_when_trained)
+
             # We get all the flows so far
-            # because this retraining happens in batches
             flows = self.db.get_all_flows()
-            # Check how many different labels are in the DB
-            # We need both normal and malware
+            # Only process new flows since last training
+            new_flows = flows[last_number_of_flows_when_trained:]
+
+            # Check how many **different** labels are in the DB
             labels = self.db.get_labels()
             if len(labels) == 1:
-                # Only 1 label has flows
-                # There are not enough different labels, so insert two flows
-                # that are fake but representative of a normal and malware flow
-                # they are only for the training process
-                # At least 1 flow of each label is required
-
-                # These flows should be in the same format as the ones in the DB. 
-                # Which means the satate is still SF, S0, etc.
-                flows.append(
+                # Insert fake flows for both classes if needed
+                new_flows.append(
                     {
                         "starttime": 1594417039.029793,
                         "dur": "1.9424750804901123",

From b253aecbdf6797bee21511fc6faa84f0dcf6dd08 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:57:27 +0000
Subject: [PATCH 193/455] Fix label in dummy flow

---
 modules/flowmldetection/flowmldetection.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index d4b2762f5f..6a44422cc2 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -283,13 +283,13 @@ def process_training_flows(self, last_number_of_flows_when_trained):
                         "sbytes": 25517,
                         "dbytes": 17247,
                         "appproto": "ssl",
-                        "label": "Malicious",
+                        "ground_truth_label": "Malicious",
                         "module_labels": {
                             "flowalerts-long-connection": "Malicious"
                         },
                     }
                 )
-                flows.append(
+                new_flows.append(
                     {
                         "starttime": 1382355032.706468,
                         "dur": "10.896695",

From 8b5dccc0afc99f5a2bd1c6175d034b890135178d Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:57:39 +0000
Subject: [PATCH 194/455] Fix dummy flow

---
 modules/flowmldetection/flowmldetection.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 6a44422cc2..20f1f8ca89 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -304,7 +304,7 @@ def process_training_flows(self, last_number_of_flows_when_trained):
                         "sbytes": 100,
                         "dbytes": 67596,
                         "appproto": "http",
-                        "label": "Benign",
+                        "ground_truth_label": "Benign",
                         "module_labels": {
                             "flowalerts-long-connection": "Benign"
                         },

From 11fb0096098f3ac57267593712f8b545b1ca84a2 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:58:28 +0000
Subject: [PATCH 195/455] Rename variable

---
 modules/flowmldetection/flowmldetection.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 20f1f8ca89..59064d61a5 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -310,10 +310,9 @@ def process_training_flows(self, last_number_of_flows_when_trained):
                         },
                     }
                 )
-                # If there are enough flows, we dont insert them anymore
 
             # Convert to pandas df
-            df_flows = pd.DataFrame(flows)
+            df_flows = pd.DataFrame(new_flows)
 
             # Process features
             df_flows = self.process_features(df_flows)
@@ -321,7 +320,6 @@ def process_training_flows(self, last_number_of_flows_when_trained):
             # Update the flow to the processed version
             self.flows = df_flows
         except Exception:
-            # Stop the timer
             self.print("Error in process_flows()")
             self.print(traceback.format_exc(), 0, 1)
 

From 1acb03086bc424093508484dfa70176c696f8777 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:00:32 +0000
Subject: [PATCH 196/455] Fix dummy flow label

---
 modules/flowmldetection/flowmldetection.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 59064d61a5..6b41b40298 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -356,6 +356,8 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
                 "dir_",
                 "endtime",
                 "flow_source",
+                "ground_truth_label",
+                "detailed_ground_truth_label",
             ]
             # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes.
             # Error

From 5f61978998876e7e30511a2e7a378bf914ec022a Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:00:47 +0000
Subject: [PATCH 197/455] Pass values to train function

---
 modules/flowmldetection/flowmldetection.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 6b41b40298..4d66aab855 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -521,9 +521,9 @@ def main(self):
                         )
                         # Process all flows in the DB and make them ready
                         # for pandas
-                        self.process_training_flows()
+                        self.process_training_flows(self.last_number_of_flows_when_trained)
                         # Train an algorithm
-                        self.train()
+                        self.train(sum_labeled_flows)
                         self.last_number_of_flows_when_trained = sum_labeled_flows
 
             elif self.mode == "test":

From 4a486284e59952de7c793ee55cd2e627fd7f2830 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:01:47 +0000
Subject: [PATCH 198/455] import os

---
 modules/flowmldetection/flowmldetection.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 4d66aab855..766178e127 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -10,6 +10,7 @@
 import json
 import traceback
 import warnings
+import os
 
 from slips_files.common.parsers.config_parser import ConfigParser
 from slips_files.common.slips_utils import utils

From 19b5bdde44678c80365f8c6aeda8b9d3b67f7a6f Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:02:15 +0000
Subject: [PATCH 199/455] Get issue of total flows zero

---
 slips_files/core/database/database_manager.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py
index e8ca3aaf62..892b923b4a 100644
--- a/slips_files/core/database/database_manager.py
+++ b/slips_files/core/database/database_manager.py
@@ -661,7 +661,8 @@ def add_software_to_profile(self, *args, **kwargs):
         return self.rdb.add_software_to_profile(*args, **kwargs)
 
     def get_total_flows(self, *args, **kwargs):
-        return int(self.rdb.get_total_flows(*args, **kwargs))
+        total_flows = self.rdb.get_total_flows(*args, **kwargs)
+        return int(total_flows) if total_flows is not None else 0
 
     def increment_processed_flows(self, *args, **kwargs):
         return self.rdb.increment_processed_flows(*args, **kwargs)

From cf87d4260a971d8e81d1474b0d0968dba12e68b7 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:02:32 +0000
Subject: [PATCH 200/455] Add comments

---
 slips_files/core/database/database_manager.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py
index 892b923b4a..6dd1d9952e 100644
--- a/slips_files/core/database/database_manager.py
+++ b/slips_files/core/database/database_manager.py
@@ -879,7 +879,10 @@ def get_flow(self, *args, **kwargs):
         """returns the raw flow as read from the log file"""
         return self.sqlite.get_flow(*args, **kwargs)
 
-    def add_flow(self, flow, profileid: str, twid: str, label="benign"):
+    def add_flow(self, flow, profileid: str, twid: str, label="Benign"):
+        """
+        Just in case, by default if there are no labels in the flow, we consider it Benign
+        """
         # stores it in the db
         self.sqlite.add_flow(flow, profileid, twid, label=label)
         # handles the channels and labels etc.

From 5a7c0ded0fcf0c46666839a155556f09409687cc Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:02:51 +0000
Subject: [PATCH 201/455] Rename var name to be more clear

---
 slips_files/core/profiler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/slips_files/core/profiler.py b/slips_files/core/profiler.py
index 3c4d59db27..3dd478dcf2 100644
--- a/slips_files/core/profiler.py
+++ b/slips_files/core/profiler.py
@@ -119,7 +119,7 @@ def read_configuration(self):
         self.local_whitelist_path = conf.local_whitelist_path()
         self.timeformat = conf.ts_format()
         self.analysis_direction = conf.analysis_direction()
-        self.label = conf.label()
+        self.configuration_label = conf.label()
         self.width = conf.get_tw_width_as_float()
         self.client_ips: List[
             Union[IPv4Network, IPv6Network, IPv4Address, IPv6Address]

From 24e638bdba4dedacff0e2af93b701f3d1b75403e Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:03:10 +0000
Subject: [PATCH 202/455] Rename var name

---
 slips_files/core/profiler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/slips_files/core/profiler.py b/slips_files/core/profiler.py
index 3dd478dcf2..429faae5c3 100644
--- a/slips_files/core/profiler.py
+++ b/slips_files/core/profiler.py
@@ -377,7 +377,7 @@ def store_features_going_in(self, profileid: str, twid: str, flow):
             flow,
             profileid=profileid,
             twid=twid,
-            label=self.label,
+            label=self.configuration_label,
         )
         self.db.mark_profile_tw_as_modified(profileid, twid, "")
 

From f872498d1f7848c293a1c71e03b21f35b0eba1d3 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:03:31 +0000
Subject: [PATCH 203/455] Fix processed flows being zero

---
 slips/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/slips/main.py b/slips/main.py
index df49ffb97e..39e8b2a673 100644
--- a/slips/main.py
+++ b/slips/main.py
@@ -414,7 +414,7 @@ def get_analyzed_flows_percentage(self) -> str:
             self.total_flows = self.db.get_total_flows()
 
         flows_percentage = int(
-            (self.db.get_processed_flows_so_far() / self.total_flows) * 100
+            (self.db.get_processed_flows_so_far() / self.total_flows) * 100 if self.total_flows != 0 else 0
         )
         return f"Analyzed Flows: {green(flows_percentage)}{green('%')}. "
 

From 19c3116d79ae35e0138b623dd05d0994dcabd679 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:03:53 +0000
Subject: [PATCH 204/455] Delete old comments

---
 modules/flowmldetection/flowmldetection.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 766178e127..6c3bfc1275 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -28,10 +28,6 @@
     Method,
 )
 
-# Only for debbuging
-# from matplotlib import pyplot as plt
-
-
 # This horrible hack is only to stop sklearn from printing those warnings
 def warn(*args, **kwargs):
     pass

From 0d6d1da5f8494e912ceb600fcc14c93c7dd36204 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:13:22 +0000
Subject: [PATCH 205/455] Fix plots

---
 modules/flowmldetection/plot_train_score.py | 48 ++++++++++++++++-----
 1 file changed, 38 insertions(+), 10 deletions(-)

diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py
index 0b5b5b72ba..359df04eff 100644
--- a/modules/flowmldetection/plot_train_score.py
+++ b/modules/flowmldetection/plot_train_score.py
@@ -2,6 +2,8 @@
 import matplotlib.pyplot as plt
 import re
 import sys
+import argparse
+import os
 
 def plot_log_data(file_path):
     # Read the log data from the file
@@ -24,33 +26,59 @@ def plot_log_data(file_path):
         "Score": float
     })
 
+    # Get the directory of the log file to store the plot in the same folder
+    dir_name = os.path.dirname(file_path)
+    plot_file = os.path.join(dir_name, 'log_data_plot_with_two_scales.png')
+
     # Plotting the values
     fig, ax1 = plt.subplots(figsize=(10, 6))
 
-    # Plotting Score on the left y-axis
+    # Plotting Score on the left y-axis (with proper scaling from 0 to 1)
     ax1.plot(df.index, df["Score"], label="Score", color='tab:blue')
     ax1.set_xlabel('Index')
     ax1.set_ylabel('Score', color='tab:blue')
+    ax1.set_ylim(0, 1)  # Set y-axis for Score from 0 to 1
     ax1.tick_params(axis='y', labelcolor='tab:blue')
 
-    # Create the second y-axis for the Total labels
+    # Create the second y-axis for the Background, Benign, Malicious, Total labels
     ax2 = ax1.twinx()
+    ax2.plot(df.index, df["Background"], label="Background", color='tab:green', linestyle='--')
+    ax2.plot(df.index, df["Benign"], label="Benign", color='tab:orange', linestyle='--')
+    ax2.plot(df.index, df["Malicious"], label="Malicious", color='tab:pink', linestyle='--')
     ax2.plot(df.index, df["Total labels"], label="Total labels", color='tab:red')
-    ax2.set_ylabel('Total labels', color='tab:red')
+    ax2.set_ylabel('Background, Benign, Malicious, Total labels', color='tab:red')
+    
+    # Set appropriate scale for right y-axis based on the data
+    ax2.set_ylim(0, df[["Background", "Benign", "Malicious", "Total labels"]].max().max())
     ax2.tick_params(axis='y', labelcolor='tab:red')
 
     # Adding title and legend
     plt.title('Log Data Visualization')
     fig.tight_layout()
 
-    # Save plot to a PNG file
-    plt.savefig('log_data_plot_with_two_scales.png')
+    # Adding the legend with increased space for readability
+    ax1.legend(loc='upper left', bbox_to_anchor=(1, 1), fontsize='small')
+    ax2.legend(loc='upper left', bbox_to_anchor=(1, 0.7), fontsize='small')
+
+    # Increase right margin for better readability of legend
+    plt.subplots_adjust(right=0.75)
+
+    # Save plot to the same folder as the log file
+    plt.savefig(plot_file)
 
     # Display the plot
     plt.show()
 
-# Make sure the file path is passed as an argument
-if len(sys.argv) < 2:
-    print("Please provide the path to the log file as a parameter.")
-else:
-    plot_log_data(sys.argv[1])
+def main():
+    # Parse command-line arguments
+    parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.")
+    parser.add_argument('log_file', metavar='log_file', type=str, help="Path to the log file")
+    
+    # Handle -h / --help
+    args = parser.parse_args()
+
+    # Call the function to process the log file
+    plot_log_data(args.log_file)
+
+if __name__ == "__main__":
+    main()

From da5d1875a5f4ce9ec016e5cfa8f41e31ed5862b5 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:14:58 +0000
Subject: [PATCH 206/455] Fix plot

---
 modules/flowmldetection/plot_train_score.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py
index 359df04eff..c7f374a7fe 100644
--- a/modules/flowmldetection/plot_train_score.py
+++ b/modules/flowmldetection/plot_train_score.py
@@ -40,18 +40,21 @@ def plot_log_data(file_path):
     ax1.set_ylim(0, 1)  # Set y-axis for Score from 0 to 1
     ax1.tick_params(axis='y', labelcolor='tab:blue')
 
-    # Create the second y-axis for the Background, Benign, Malicious, Total labels
+    # Create the second y-axis for the Background, Benign, Malicious
     ax2 = ax1.twinx()
     ax2.plot(df.index, df["Background"], label="Background", color='tab:green', linestyle='--')
     ax2.plot(df.index, df["Benign"], label="Benign", color='tab:orange', linestyle='--')
     ax2.plot(df.index, df["Malicious"], label="Malicious", color='tab:pink', linestyle='--')
-    ax2.plot(df.index, df["Total labels"], label="Total labels", color='tab:red')
-    ax2.set_ylabel('Background, Benign, Malicious, Total labels', color='tab:red')
+    ax2.set_ylabel('Background, Benign, Malicious', color='tab:red')
     
     # Set appropriate scale for right y-axis based on the data
-    ax2.set_ylim(0, df[["Background", "Benign", "Malicious", "Total labels"]].max().max())
+    ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max())
     ax2.tick_params(axis='y', labelcolor='tab:red')
 
+    # Annotating Total labels as text on the plot
+    for i, value in enumerate(df["Total labels"]):
+        ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom')
+
     # Adding title and legend
     plt.title('Log Data Visualization')
     fig.tight_layout()

From 0f3d1f5b26d0a8c25cfdfc9b758e249fa48fface Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:16:23 +0000
Subject: [PATCH 207/455] Fix plot

---
 modules/flowmldetection/plot_train_score.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py
index c7f374a7fe..4099c47c1e 100644
--- a/modules/flowmldetection/plot_train_score.py
+++ b/modules/flowmldetection/plot_train_score.py
@@ -42,10 +42,10 @@ def plot_log_data(file_path):
 
     # Create the second y-axis for the Background, Benign, Malicious
     ax2 = ax1.twinx()
-    ax2.plot(df.index, df["Background"], label="Background", color='tab:green', linestyle='--')
-    ax2.plot(df.index, df["Benign"], label="Benign", color='tab:orange', linestyle='--')
-    ax2.plot(df.index, df["Malicious"], label="Malicious", color='tab:pink', linestyle='--')
-    ax2.set_ylabel('Background, Benign, Malicious', color='tab:red')
+    ax2.plot(df.index, df["Background"], label="Background Labels", color='tab:green', linestyle='--')
+    ax2.plot(df.index, df["Benign"], label="Benign Labels", color='tab:orange', linestyle='--')
+    ax2.plot(df.index, df["Malicious"], label="Malicious Labels", color='tab:pink', linestyle='--')
+    ax2.set_ylabel('Background, Benign, Malicious Labels', color='tab:red')
     
     # Set appropriate scale for right y-axis based on the data
     ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max())
@@ -56,7 +56,7 @@ def plot_log_data(file_path):
         ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom')
 
     # Adding title and legend
-    plt.title('Log Data Visualization')
+    plt.title('Training performance')
     fig.tight_layout()
 
     # Adding the legend with increased space for readability

From b000f176f8278d4fa86a2f4fb2d994da9813aaca Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:24:43 +0000
Subject: [PATCH 208/455] Fix plot

---
 modules/flowmldetection/plot_train_score.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py
index 4099c47c1e..8437e968ac 100644
--- a/modules/flowmldetection/plot_train_score.py
+++ b/modules/flowmldetection/plot_train_score.py
@@ -59,12 +59,12 @@ def plot_log_data(file_path):
     plt.title('Training performance')
     fig.tight_layout()
 
-    # Adding the legend with increased space for readability
-    ax1.legend(loc='upper left', bbox_to_anchor=(1, 1), fontsize='small')
-    ax2.legend(loc='upper left', bbox_to_anchor=(1, 0.7), fontsize='small')
+    # Move both legends further to the right
+    ax1.legend(loc='upper right', bbox_to_anchor=(1.26, 1), fontsize='small', ncol=1)
+    ax2.legend(loc='upper right', bbox_to_anchor=(1.4, 0.95), fontsize='small', ncol=1)
 
     # Increase right margin for better readability of legend
-    plt.subplots_adjust(right=0.75)
+    plt.subplots_adjust(right=0.7)
 
     # Save plot to the same folder as the log file
     plt.savefig(plot_file)

From bd1f21b2101ae36b11bc5e3a866de745a8c3e2e8 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:02:34 +0000
Subject: [PATCH 209/455] Plot testing performance from a log

---
 .../plot_testing_performance.py               | 89 +++++++++++++++++++
 1 file changed, 89 insertions(+)
 create mode 100644 modules/flowmldetection/plot_testing_performance.py

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
new file mode 100644
index 0000000000..a38c7f0598
--- /dev/null
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -0,0 +1,89 @@
+import matplotlib.pyplot as plt
+import sys
+import numpy as np
+
+def process_file(file_path):
+    # Initialize the counters for the values
+    FPR_values = []
+    FNR_values = []
+    TNR_values = []
+    TPR_values = []
+    F1_values = []
+    accuracy_values = []
+    precision_values = []
+    MCC_values = []
+    recall_values = []
+    
+    # Read the file and extract the data
+    with open(file_path, 'r') as file:
+        for line in file:
+            if "TP:" in line:
+                # Extract the values from the line
+                parts = line.split(',')
+                TP = int(parts[0].split(':')[1].strip())
+                TN = int(parts[1].split(':')[1].strip())
+                FP = int(parts[2].split(':')[1].strip())
+                FN = int(parts[3].split(':')[1].strip())
+
+                # Calculate metrics
+                FPR = FP / (FP + TN) if (FP + TN) != 0 else 0
+                FNR = FN / (FN + TP) if (FN + TP) != 0 else 0
+                TNR = TN / (TN + FP) if (TN + FP) != 0 else 0
+                TPR = TP / (TP + FN) if (TP + FN) != 0 else 0
+                Precision = TP / (TP + FP) if (TP + FP) != 0 else 0
+                Recall = TPR  # Recall is the same as TPR
+                F1 = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) != 0 else 0
+                Accuracy = (TP + TN) / (TP + TN + FP + FN)
+                MCC = ((TP * TN) - (FP * FN)) / np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) if ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) != 0 else 0
+                
+                # Append the values to the respective lists
+                FPR_values.append(FPR)
+                FNR_values.append(FNR)
+                TNR_values.append(TNR)
+                TPR_values.append(TPR)
+                F1_values.append(F1)
+                accuracy_values.append(Accuracy)
+                precision_values.append(Precision)
+                MCC_values.append(MCC)
+                recall_values.append(Recall)
+    
+    return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values
+
+def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values):
+    # Create the plot
+    plt.figure(figsize=(12, 8))
+    
+    # Plot each metric
+    plt.plot(FPR_values, label='False Positive Rate (FPR)', marker='o')
+    plt.plot(FNR_values, label='False Negative Rate (FNR)', marker='o')
+    plt.plot(TNR_values, label='True Negative Rate (TNR)', marker='o')
+    plt.plot(TPR_values, label='True Positive Rate (TPR)', marker='o')
+    plt.plot(F1_values, label='F1 Score', marker='o')
+    plt.plot(accuracy_values, label='Accuracy', marker='o')
+    plt.plot(precision_values, label='Precision', marker='o')
+    plt.plot(MCC_values, label='Matthews Correlation Coefficient (MCC)', marker='o')
+    plt.plot(recall_values, label='Recall (TPR)', marker='o')
+    
+    # Add labels and title
+    plt.xlabel('Index')
+    plt.ylabel('Metric Value')
+    plt.title('Evaluation Metrics Over Time')
+    
+    # Add a legend
+    plt.legend()
+    
+    # Save the plot as a PNG file
+    plt.savefig('metrics_plot.png')
+    plt.close()
+
+def main():
+    if len(sys.argv) != 2:
+        print("Usage: python script.py <file_path>")
+        sys.exit(1)
+    
+    file_path = sys.argv[1]
+    FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path)
+    plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values)
+
+if __name__ == "__main__":
+    main()

From fd21630441d02796cd0aae52b5e13492a2d731d0 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:04:32 +0000
Subject: [PATCH 210/455] Fix the plot

---
 modules/flowmldetection/plot_testing_performance.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index a38c7f0598..fac0acd64a 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -64,16 +64,19 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu
     plt.plot(MCC_values, label='Matthews Correlation Coefficient (MCC)', marker='o')
     plt.plot(recall_values, label='Recall (TPR)', marker='o')
     
+    # Set logarithmic scale on the y-axis
+    plt.yscale('log')
+    
     # Add labels and title
     plt.xlabel('Index')
-    plt.ylabel('Metric Value')
-    plt.title('Evaluation Metrics Over Time')
+    plt.ylabel('Metric Value (Log Scale)')
+    plt.title('Evaluation Metrics Over Time (Log Scale)')
     
     # Add a legend
     plt.legend()
     
     # Save the plot as a PNG file
-    plt.savefig('metrics_plot.png')
+    plt.savefig('metrics_plot_log_scale.png')
     plt.close()
 
 def main():

From ee0deaf2a3229c26a5c734a314878b9b0a393c01 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:12:40 +0000
Subject: [PATCH 211/455] Fix the plots

---
 .../plot_testing_performance.py               | 76 ++++++++++++++-----
 1 file changed, 55 insertions(+), 21 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index fac0acd64a..5581c72cd4 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -50,33 +50,66 @@ def process_file(file_path):
     return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values
 
 def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values):
-    # Create the plot
-    plt.figure(figsize=(12, 8))
+    # Separate the values into two groups based on their proximity to 0 or 1
+    close_to_0 = {
+        'FPR': [], 'FNR': []
+    }
+    close_to_1 = {
+        'TNR': [], 'TPR': [], 'F1': [], 'accuracy': [], 'precision': [], 'MCC': [], 'recall': []
+    }
     
-    # Plot each metric
-    plt.plot(FPR_values, label='False Positive Rate (FPR)', marker='o')
-    plt.plot(FNR_values, label='False Negative Rate (FNR)', marker='o')
-    plt.plot(TNR_values, label='True Negative Rate (TNR)', marker='o')
-    plt.plot(TPR_values, label='True Positive Rate (TPR)', marker='o')
-    plt.plot(F1_values, label='F1 Score', marker='o')
-    plt.plot(accuracy_values, label='Accuracy', marker='o')
-    plt.plot(precision_values, label='Precision', marker='o')
-    plt.plot(MCC_values, label='Matthews Correlation Coefficient (MCC)', marker='o')
-    plt.plot(recall_values, label='Recall (TPR)', marker='o')
+    # Categorize the metrics into two groups
+    for i in range(len(FPR_values)):
+        close_to_0['FPR'].append(FPR_values[i])
+        close_to_0['FNR'].append(FNR_values[i])
+        
+        close_to_1['TNR'].append(TNR_values[i])
+        close_to_1['TPR'].append(TPR_values[i])
+        close_to_1['F1'].append(F1_values[i])
+        close_to_1['accuracy'].append(accuracy_values[i])
+        close_to_1['precision'].append(precision_values[i])
+        close_to_1['MCC'].append(MCC_values[i])
+        close_to_1['recall'].append(recall_values[i])
+
+    # Plot metrics for values close to 0
+    plot_single_group(close_to_0, 'metrics_plot_close_to_0.png')
     
-    # Set logarithmic scale on the y-axis
-    plt.yscale('log')
+    # Plot metrics for values close to 1
+    plot_single_group(close_to_1, 'metrics_plot_close_to_1.png')
+
+def plot_single_group(metrics_dict, output_filename):
+    plt.figure(figsize=(12, 8))
     
-    # Add labels and title
+    # Only plot the metrics that exist in the dictionary
+    if 'FPR' in metrics_dict:
+        plt.plot(metrics_dict['FPR'], label='False Positive Rate (FPR)', marker='o')
+    if 'FNR' in metrics_dict:
+        plt.plot(metrics_dict['FNR'], label='False Negative Rate (FNR)', marker='o')
+    if 'TNR' in metrics_dict:
+        plt.plot(metrics_dict['TNR'], label='True Negative Rate (TNR)', marker='o')
+    if 'TPR' in metrics_dict:
+        plt.plot(metrics_dict['TPR'], label='True Positive Rate (TPR)', marker='o')
+    if 'F1' in metrics_dict:
+        plt.plot(metrics_dict['F1'], label='F1 Score', marker='o')
+    if 'accuracy' in metrics_dict:
+        plt.plot(metrics_dict['accuracy'], label='Accuracy', marker='o')
+    if 'precision' in metrics_dict:
+        plt.plot(metrics_dict['precision'], label='Precision', marker='o')
+    if 'MCC' in metrics_dict:
+        plt.plot(metrics_dict['MCC'], label='Matthews Correlation Coefficient (MCC)', marker='o')
+    if 'recall' in metrics_dict:
+        plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o')
+
+    # Apply log scale by default
+    plt.yscale('log')
+
     plt.xlabel('Index')
-    plt.ylabel('Metric Value (Log Scale)')
-    plt.title('Evaluation Metrics Over Time (Log Scale)')
-    
-    # Add a legend
+    plt.ylabel('Metric Value')
+    plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})')
     plt.legend()
     
-    # Save the plot as a PNG file
-    plt.savefig('metrics_plot_log_scale.png')
+    # Save the plot
+    plt.savefig(output_filename)
     plt.close()
 
 def main():
@@ -85,6 +118,7 @@ def main():
         sys.exit(1)
     
     file_path = sys.argv[1]
+    
     FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path)
     plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values)
 

From f9d8806d2c2035b3cb57e69a70b462cec05e5f57 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:16:50 +0000
Subject: [PATCH 212/455] Fix plot: print final metric values and set y-limits/ticks for the close-to-0 plot

---
 .../plot_testing_performance.py               | 22 +++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index 5581c72cd4..8f9e12cd86 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -72,12 +72,24 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu
         close_to_1['recall'].append(recall_values[i])
 
     # Plot metrics for values close to 0
-    plot_single_group(close_to_0, 'metrics_plot_close_to_0.png')
+    plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True)
     
     # Plot metrics for values close to 1
     plot_single_group(close_to_1, 'metrics_plot_close_to_1.png')
 
-def plot_single_group(metrics_dict, output_filename):
+    # Print the final values
+    print("\nFinal Metric Values:")
+    print(f"Final FPR: {FPR_values[-1]:.4f}")
+    print(f"Final FNR: {FNR_values[-1]:.4f}")
+    print(f"Final TNR: {TNR_values[-1]:.4f}")
+    print(f"Final TPR: {TPR_values[-1]:.4f}")
+    print(f"Final F1 Score: {F1_values[-1]:.4f}")
+    print(f"Final Accuracy: {accuracy_values[-1]:.4f}")
+    print(f"Final Precision: {precision_values[-1]:.4f}")
+    print(f"Final MCC: {MCC_values[-1]:.4f}")
+    print(f"Final Recall: {recall_values[-1]:.4f}")
+
+def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):
     plt.figure(figsize=(12, 8))
     
     # Only plot the metrics that exist in the dictionary
@@ -103,6 +115,12 @@ def plot_single_group(metrics_dict, output_filename):
     # Apply log scale by default
     plt.yscale('log')
 
+    # If the plot is close to 0, set custom ticks
+    if is_close_to_0:
+        # Manually set more Y-ticks for better visibility
+        plt.ylim(0.0001, 1)  # Set Y-axis limits between 0.0001 and 1
+        plt.yticks([0.0001, 0.001, 0.01, 0.1, 1], ['0.0001', '0.001', '0.01', '0.1', '1'])  # Adjust Y-ticks
+
     plt.xlabel('Index')
     plt.ylabel('Metric Value')
     plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})')

From 15e37d2d67dc27f0aaabb5cb40dbc3fe397d64ec Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:20:22 +0000
Subject: [PATCH 213/455] Fix plots: replace flowmldetection.py contents with the metrics plotting script

---
 modules/flowmldetection/flowmldetection.py | 709 +++++----------------
 1 file changed, 143 insertions(+), 566 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 6c3bfc1275..37f0761109 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -1,566 +1,143 @@
-# SPDX-FileCopyrightText: 2021 Sebastian Garcia <sebastian.garcia@agents.fel.cvut.cz>
-from typing import Optional
-
-# SPDX-License-Identifier: GPL-2.0-only
-import numpy
-from sklearn.linear_model import SGDClassifier
-from sklearn.preprocessing import StandardScaler
-import pickle
-import pandas as pd
-import json
-import traceback
-import warnings
-import os
-
-from slips_files.common.parsers.config_parser import ConfigParser
-from slips_files.common.slips_utils import utils
-from slips_files.common.abstracts.module import IModule
-from slips_files.core.structures.evidence import (
-    Evidence,
-    ProfileID,
-    TimeWindow,
-    Attacker,
-    ThreatLevel,
-    EvidenceType,
-    IoCType,
-    Direction,
-    Victim,
-    Method,
-)
-
-# This horrible hack is only to stop sklearn from printing those warnings
-def warn(*args, **kwargs):
-    pass
-
-
-warnings.warn = warn
-
-
-class FlowMLDetection(IModule):
-    # Name: short name of the module. Do not use spaces
-    name = "Flow ML Detection"
-    description = (
-        "Train or test a Machine Learning model to detect malicious flows"
-    )
-    authors = ["Sebastian Garcia"]
-
-    def init(self):
-        # Subscribe to the channel
-        self.c1 = self.db.subscribe("new_flow")
-        self.channels = {"new_flow": self.c1}
-        self.fieldseparator = self.db.get_field_separator()
-        # Set the output queue of our database instance
-        # Read the configuration
-        self.read_configuration()
-        # Minum amount of new labels needed to start the train
-        self.minimum_labels_to_start_train = 50
-        # Minum amount of new labels needed to retrain
-        self.minimum_labels_to_retrain = 50
-        # The number of flows when last trained. Used internally only to know
-        # when to retrain
-        self.last_number_of_flows_when_trained = 0
-        # The scaler trained during training and to use during testing
-        self.scaler = StandardScaler()
-        self.model_path = "./modules/flowmldetection/model.bin"
-        self.scaler_path = "./modules/flowmldetection/scaler.bin"
-
-        # Initialize the training log file
-        self.training_log_path = "./modules/flowmldetection/training.log"
-        with open(self.training_log_path, "w") as log_file:
-            log_file.write("Training Log Initialized\n")
-
-    def read_configuration(self):
-        conf = ConfigParser()
-        self.mode = conf.get_ml_mode()
-        # This is the global label in the configuration,
-        # in case the flows do not have a label themselves
-        self.label = conf.label()
-
-    def write_to_training_log(self, message: str):
-        """
-        Write a message to the training log file.
-        """
-        try:
-            with open(self.training_log_path, "a") as log_file:
-                log_file.write(message + "\n")
-        except Exception as e:
-            self.print(f"Error writing to training log: {e}", 0, 1)
-
-    def train(self, sum_labeled_flows):
-        """
-        Train a model based on the flows we receive and the labels
-        """
-        try:
-            # Create X_flow with the current flows minus the label
-            X_flow = self.flows.drop("ground_truth_label", axis=1)
-            # Drop the detailed labels
-            X_flow = X_flow.drop("detailed_ground_truth_label", axis=1)
-            # Drop the module_labels
-            X_flow = X_flow.drop("module_labels", axis=1)
-            # Create y_flow with the label
-            y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label)
-
-            # Normalize this batch of data so far. This can get progressivle slow
-            X_flow = self.scaler.fit_transform(X_flow)
-
-            # Count the number of labels of each type in this epoc
-            epoch_label_counts = {
-                "Background": (y_flow == "Background").sum(),
-                "Malicious": (y_flow == "Malicious").sum(),
-                "Benign": (y_flow == "Benign").sum(),
-            }
-
-            # Train
-            try:
-                # Online incremental learning
-                self.clf.partial_fit(
-                    X_flow, y_flow, classes=["Background", "Malicious", "Benign"]
-                )
-            except Exception:
-                self.print("Error while calling clf.train()")
-                self.print(traceback.format_exc(), 0, 1)
-
-            # See score so far in training
-            score = self.clf.score(X_flow, y_flow)
-
-            #self.print(f"	Training Score: {score}", 1, 0)
-            #self.print(f'    Model Parameters: {self.clf.coef_}', 1, 0)
-
-            # Store the models on disk
-            self.store_model()
-
-            # Log training information
-            self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}")
-            #self.write_to_training_log(f"Model parameters: {self.clf.coef_}")
-        except Exception:
-            self.print("Error in train().", 0, 1)
-            self.print(traceback.format_exc(), 0, 1)
-            self.write_to_training_log("Error occurred during training.")
-
-    def process_features(self, dataset):
-        """
-        Discards some features of the dataset and can create new.
-        Clean the dataset
-        """
-        try:
-            # Discard some type of flows that dont have ports
-            to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""]
-            for proto in to_discard:
-                dataset = dataset[dataset.proto != proto]
-
-            # If te proto is in the list to delete and there is only one flow, then the dataset will be empty
-            if dataset.empty:
-                # DataFrame is empty now, so return empty
-                return dataset
-
-            # For now, discard these
-            to_drop = [
-                "appproto",
-                "daddr",
-                "saddr",
-                "starttime",
-                "type_",
-                "smac",
-                "dmac",
-                "history",
-                "uid",
-                "dir_",
-                "endtime",
-                "flow_source",
-            ]
-            for field in to_drop:
-                try:
-                    dataset = dataset.drop(field, axis=1)
-                except (ValueError, KeyError):
-                    pass
-
-            # When flows are read from Slips sqlite,
-            # the state is not transformed to 'Established' or
-            # 'Not Established', it is still 'S0' and others
-            # So transform here
-            dataset["state"] = dataset.apply(
-                lambda row: self.db.get_final_state_from_flags(
-                    row["state"], (row["spkts"] + row["dpkts"])
-                ),
-                axis=1,
-            )
-
-            # Convert state to categorical
-            dataset.state = dataset.state.str.replace(
-                r"(^.*Not Established.*$)", "0", regex=True
-            )
-            dataset.state = dataset.state.str.replace(
-                r"(^.*Established.*$)", "1", regex=True
-            )
-
-            # Convert categories to floats
-            dataset.state = dataset.state.astype("float64")
-
-            # Convert proto to categorical. For now we only have few states, so we can hardcode...
-            # We dont use the data to create categories because in testing mode
-            # we dont see all the protocols
-            # Also we dont store the Categorizer because the user can retrain
-            # with its own data.
-            dataset.proto = dataset.proto.str.lower()
-            dataset.proto = dataset.proto.str.replace(
-                r"(^.*tcp.*$)", "0", regex=True
-            )
-            dataset.proto = dataset.proto.str.replace(
-                r"(^.*udp.*$)", "1", regex=True
-            )
-            dataset.proto = dataset.proto.str.replace(
-                r"(^.*icmp.*$)", "2", regex=True
-            )
-            dataset.proto = dataset.proto.str.replace(
-                r"(^.*icmp-ipv6.*$)", "3", regex=True
-            )
-            dataset.proto = dataset.proto.str.replace(
-                r"(^.*arp.*$)", "4", regex=True
-            )
-
-            dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"]
-            dataset["pkts"] = dataset["spkts"] + dataset["dpkts"]
-
-            fields_to_convert_to_float = [
-                dataset.proto,
-                dataset.dport,
-                dataset.sport,
-                dataset.dur,
-                dataset.pkts,
-                dataset.spkts,
-                dataset.allbytes,
-                dataset.sbytes,
-                dataset.state,
-            ]
-            for field in fields_to_convert_to_float:
-                try:
-                    field = field.astype("float64")
-                except (ValueError, AttributeError):
-                    pass
-
-            return dataset
-        except Exception:
-            # Stop the timer
-            self.print("Error in process_features()")
-            self.print(traceback.format_exc(), 0, 1)
-
-    def process_training_flows(self, last_number_of_flows_when_trained):
-        """
-        Process only the new flows in the DB since the last training.
-        Store the pandas df in self.flows
-        """
-        try:
-            # Ensure the index is an integer
-            if last_number_of_flows_when_trained is None:
-                last_number_of_flows_when_trained = 0
-            else:
-                last_number_of_flows_when_trained = int(last_number_of_flows_when_trained)
-
-            # We get all the flows so far
-            flows = self.db.get_all_flows()
-            # Only process new flows since last training
-            new_flows = flows[last_number_of_flows_when_trained:]
-
-            # Check how many **different** labels are in the DB
-            labels = self.db.get_labels()
-            if len(labels) == 1:
-                # Insert fake flows for both classes if needed
-                new_flows.append(
-                    {
-                        "starttime": 1594417039.029793,
-                        "dur": "1.9424750804901123",
-                        "saddr": "10.7.10.101",
-                        "sport": "49733",
-                        "daddr": "40.70.224.145",
-                        "dport": "443",
-                        "proto": "tcp",
-                        "state": "SF",
-                        "spkts": 17,
-                        "dpkts": 27,
-                        "sbytes": 25517,
-                        "dbytes": 17247,
-                        "appproto": "ssl",
-                        "ground_truth_label": "Malicious",
-                        "module_labels": {
-                            "flowalerts-long-connection": "Malicious"
-                        },
-                    }
-                )
-                new_flows.append(
-                    {
-                        "starttime": 1382355032.706468,
-                        "dur": "10.896695",
-                        "saddr": "147.32.83.52",
-                        "sport": "47956",
-                        "daddr": "80.242.138.72",
-                        "dport": "80",
-                        "proto": "tcp",
-                        "state": "SF",
-                        "spkts": 1,
-                        "dpkts": 0,
-                        "sbytes": 100,
-                        "dbytes": 67596,
-                        "appproto": "http",
-                        "ground_truth_label": "Benign",
-                        "module_labels": {
-                            "flowalerts-long-connection": "Benign"
-                        },
-                    }
-                )
-
-            # Convert to pandas df
-            df_flows = pd.DataFrame(new_flows)
-
-            # Process features
-            df_flows = self.process_features(df_flows)
-
-            # Update the flow to the processed version
-            self.flows = df_flows
-        except Exception:
-            self.print("Error in process_flows()")
-            self.print(traceback.format_exc(), 0, 1)
-
-    def process_flow(self, flow_to_process: dict):
-        """
-        Process one flow. Only used during detection in testing
-        returns the pandas df with the processed flow
-        """
-        try:
-            # Convert the flow to a pandas dataframe
-            raw_flow = pd.DataFrame(flow_to_process, index=[0])
-            dflow = self.process_features(raw_flow)
-            if dflow.empty:
-                return None
-            # Update the flow to the processed version
-            return dflow
-        except Exception:
-            # Stop the timer
-            self.print("Error in process_flow()")
-            self.print(traceback.format_exc(), 0, 1)
-
-    def detect(self, x_flow) -> Optional[numpy.ndarray]:
-        """
-        Detects the given flow with the current model stored
-        and returns the predection array
-        """
-        try:
-            # clean the flow
-            fields_to_drop = [
-                "label",
-                "module_labels",
-                "uid",
-                "history",
-                "dir_",
-                "endtime",
-                "flow_source",
-                "ground_truth_label",
-                "detailed_ground_truth_label",
-            ]
-            # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes.
-            # Error
-            ''' [Flow ML Detection] Error in detect() while processing                                                                                                                                                                                                                         
-            dur proto  sport dport  state  pkts  spkts  dpkts  bytes  sbytes  dbytes  allbytes                                                                                                                                                                                    
-            0  63.822830     0  56119   981    0.0    15     15      0   8764    1887       0      1887                                                                                                                                                                                    
-            The feature names should match those that were passed during fit.                                                                                                                                                                                                              
-            Feature names unseen at fit time:                                                                                                                                                                                                                                              
-            - bytes     
-            '''
-
-            # IF we delete here the filed bytes the error is
-            # [Flow ML Detection] Error in detect() while processing 
-            # dur proto sport dport  state  pkts  spkts  dpkts  sbytes  dbytes allbytes                                                                                                                                                             
-            # 0  63.822830     0  56120   980    0.0    15     15      0    1887       0      1887                                                                                                                                                             
-            # The feature names should match those that were passed during fit.                                                                                                                                                                                
-            # Feature names must be in the same order as they were in fit.                                                                                                                                                                                     
-                                                                                                                                                                                                                                                 
-            for field in fields_to_drop:
-                try:
-                    x_flow = x_flow.drop(field, axis=1)
-                except (KeyError, ValueError):
-                    pass
-            # Scale the flow
-            x_flow: numpy.ndarray = self.scaler.transform(x_flow)
-            pred: numpy.ndarray = self.clf.predict(x_flow)
-            return pred
-        except Exception as e:
-            self.print(
-                f"Error in detect() while processing " f"\n{x_flow}\n{e}"
-            )
-            self.print(traceback.format_exc(), 0, 1)
-
-    def store_model(self):
-        """
-        Store the trained model on disk
-        """
-        self.print("Storing the trained model and scaler on disk.", 0, 2)
-        with open(self.model_path, "wb") as f:
-            data = pickle.dumps(self.clf)
-            f.write(data)
-        with open(self.scaler_path, "wb") as g:
-            data = pickle.dumps(self.scaler)
-            g.write(data)
-
-    def read_model(self):
-        """
-        Read the trained model from disk
-        """
-        try:
-            self.print("Reading the trained model from disk.", 0, 2)
-            with open(self.model_path, "rb") as f:
-                self.clf = pickle.load(f)
-            self.print("Reading the trained scaler from disk.", 0, 2)
-            with open(self.scaler_path, "rb") as g:
-                self.scaler = pickle.load(g)
-        except FileNotFoundError:
-            # If there is no model, create one empty
-            self.print(
-                "There was no model. " "Creating a new empty model.", 0, 2
-            )
-            self.clf = SGDClassifier(
-                warm_start=True, loss="hinge", penalty="l1"
-            )
-        except EOFError:
-            self.print(
-                "Error reading model from disk. "
-                "Creating a new empty model.",
-                0,
-                2,
-            )
-            self.clf = SGDClassifier(
-                warm_start=True, loss="hinge", penalty="l1"
-            )
-
-    def set_evidence_malicious_flow(self, flow: dict, twid: str):
-        confidence: float = 0.1
-        description = (
-            f"Flow with malicious characteristics by ML. Src IP"
-            f" {flow['saddr']}:{flow['sport']} to "
-            f"{flow['daddr']}:{flow['dport']}"
-        )
-        twid_number = int(twid.replace("timewindow", ""))
-        evidence: Evidence = Evidence(
-            evidence_type=EvidenceType.MALICIOUS_FLOW,
-            attacker=Attacker(
-                direction=Direction.SRC,
-                ioc_type=IoCType.IP,
-                value=flow["saddr"],
-            ),
-            victim=Victim(
-                direction=Direction.DST,
-                ioc_type=IoCType.IP,
-                value=flow["daddr"],
-            ),
-            threat_level=ThreatLevel.LOW,
-            confidence=confidence,
-            description=description,
-            profile=ProfileID(ip=flow["saddr"]),
-            timewindow=TimeWindow(twid_number),
-            uid=[flow["uid"]],
-            timestamp=flow["starttime"],
-            method=Method.AI,
-            src_port=flow["sport"],
-            dst_port=flow["dport"],
-        )
-
-        self.db.set_evidence(evidence)
-
-    def shutdown_gracefully(self):
-        # Confirm that the module is done processing
-        if self.mode == "train":
-            self.store_model()
-
-    def pre_main(self):
-        utils.drop_root_privs()
-        # Load the model
-        self.read_model()
-
-    def main(self):
-        if msg := self.get_msg("new_flow"):
-            # When a new flow arrives
-            msg = json.loads(msg["data"])
-            self.twid = msg["twid"]
-            self.profileid = msg["profileid"]
-            self.flow = msg["flow"]
-            # These following extra fields are expected in testing. update the original
-            # flow dict to have them
-            self.flow.update(
-                {
-                    "state": msg["interpreted_state"],
-                    "label": msg["label"],
-                    "module_labels": msg["module_labels"],
-                }
-            )
-
-            if self.mode == "train":
-                # We are training
-
-                # Is the amount in the DB of labels enough to retrain?
-                # Use labeled flows
-                labels = self.db.get_labels()
-                sum_labeled_flows = sum(i[1] for i in labels)
-
-                # The min labels to retrain is the min number of flows 
-                # we should have seen so far in this capture to start training
-                # This is so we dont _start_ training with only 1 flow
-
-                # Once we are over the start minimum, the second condition is 
-                # to force to retrain every a minimum_labels_to_retrain number
-                # of flows. So we dont retrain every 1 flow.
-                if (
-                    sum_labeled_flows >= self.minimum_labels_to_start_train
-                ):
-                    if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain):
-                        # So for example we retrain every 50 labels and only when
-                        # we have at least 50 labels
-                        self.print(
-                            f"Training the model with the last group of "
-                            f"flows and labels. Total flows: {sum_labeled_flows}."
-                        )
-                        # Process all flows in the DB and make them ready
-                        # for pandas
-                        self.process_training_flows(self.last_number_of_flows_when_trained)
-                        # Train an algorithm
-                        self.train(sum_labeled_flows)
-                        self.last_number_of_flows_when_trained = sum_labeled_flows
-
-            elif self.mode == "test":
-                # We are testing, which means using the model to detect
-                processed_flow = self.process_flow(self.flow)
-
-                # After processing the flow, it may happen that we
-                # delete icmp/arp/etc so the dataframe can be empty
-                if processed_flow is not None and not processed_flow.empty:
-                    # Predict
-                    pred: numpy.ndarray = self.detect(processed_flow)
-                    if not pred:
-                        # an error occurred
-                        return
-
-                    label = self.flow["label"]
-                    if label and label != "unknown" and label != pred[0]:
-                        # If the user specified a label in test mode,
-                        # and the label is diff from the prediction,
-                        # print in debug mode
-                        self.print(
-                            f"Predicted {pred[0]} for ground-truth label"
-                            f' {label}. Flow {self.flow["saddr"]}:'
-                            f'{self.flow["sport"]} ->'
-                            f' {self.flow["daddr"]}:'
-                            f'{self.flow["dport"]}/'
-                            f'{self.flow["proto"]}',
-                            0,
-                            3,
-                        )
-                    if pred[0] == "Malicious":
-                        # Generate an alert
-                        self.set_evidence_malicious_flow(self.flow, self.twid)
-                        self.print(
-                            f"Prediction {pred[0]} for label {label}"
-                            f' flow {self.flow["saddr"]}:'
-                            f'{self.flow["sport"]} -> '
-                            f'{self.flow["daddr"]}:'
-                            f'{self.flow["dport"]}/'
-                            f'{self.flow["proto"]}',
-                            0,
-                            2,
-                        )
+import matplotlib.pyplot as plt
+import sys
+import numpy as np
+
+def process_file(file_path):
+    # Initialize the counters for the values
+    FPR_values = []
+    FNR_values = []
+    TNR_values = []
+    TPR_values = []
+    F1_values = []
+    accuracy_values = []
+    precision_values = []
+    MCC_values = []
+    recall_values = []
+    
+    # Read the file and extract the data
+    with open(file_path, 'r') as file:
+        for line in file:
+            if "TP:" in line:
+                # Extract the values from the line
+                parts = line.split(',')
+                TP = int(parts[0].split(':')[1].strip())
+                TN = int(parts[1].split(':')[1].strip())
+                FP = int(parts[2].split(':')[1].strip())
+                FN = int(parts[3].split(':')[1].strip())
+
+                # Calculate metrics
+                FPR = FP / (FP + TN) if (FP + TN) != 0 else 0
+                FNR = FN / (FN + TP) if (FN + TP) != 0 else 0
+                TNR = TN / (TN + FP) if (TN + FP) != 0 else 0
+                TPR = TP / (TP + FN) if (TP + FN) != 0 else 0
+                Precision = TP / (TP + FP) if (TP + FP) != 0 else 0
+                Recall = TPR  # Recall is the same as TPR
+                F1 = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) != 0 else 0
+                Accuracy = (TP + TN) / (TP + TN + FP + FN)
+                MCC = ((TP * TN) - (FP * FN)) / np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) if ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) != 0 else 0
+                
+                # Append the values to the respective lists
+                FPR_values.append(FPR)
+                FNR_values.append(FNR)
+                TNR_values.append(TNR)
+                TPR_values.append(TPR)
+                F1_values.append(F1)
+                accuracy_values.append(Accuracy)
+                precision_values.append(Precision)
+                MCC_values.append(MCC)
+                recall_values.append(Recall)
+    
+    return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values
+
+def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values):
+    # Separate the values into two groups based on their proximity to 0 or 1
+    close_to_0 = {
+        'FPR': [], 'FNR': []
+    }
+    close_to_1 = {
+        'TNR': [], 'TPR': [], 'F1': [], 'accuracy': [], 'precision': [], 'MCC': [], 'recall': []
+    }
+    
+    # Categorize the metrics into two groups
+    for i in range(len(FPR_values)):
+        close_to_0['FPR'].append(FPR_values[i])
+        close_to_0['FNR'].append(FNR_values[i])
+        
+        close_to_1['TNR'].append(TNR_values[i])
+        close_to_1['TPR'].append(TPR_values[i])
+        close_to_1['F1'].append(F1_values[i])
+        close_to_1['accuracy'].append(accuracy_values[i])
+        close_to_1['precision'].append(precision_values[i])
+        close_to_1['MCC'].append(MCC_values[i])
+        close_to_1['recall'].append(recall_values[i])
+
+    # Plot metrics for values close to 0
+    plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True)
+    
+    # Plot metrics for values close to 1
+    plot_single_group(close_to_1, 'metrics_plot_close_to_1.png')
+
+    # Print the final values
+    print("\nFinal Metric Values:")
+    print(f"Final FPR: {FPR_values[-1]:.4f}")
+    print(f"Final FNR: {FNR_values[-1]:.4f}")
+    print(f"Final TNR: {TNR_values[-1]:.4f}")
+    print(f"Final TPR: {TPR_values[-1]:.4f}")
+    print(f"Final F1 Score: {F1_values[-1]:.4f}")
+    print(f"Final Accuracy: {accuracy_values[-1]:.4f}")
+    print(f"Final Precision: {precision_values[-1]:.4f}")
+    print(f"Final MCC: {MCC_values[-1]:.4f}")
+    print(f"Final Recall: {recall_values[-1]:.4f}")
+
+def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):
+    plt.figure(figsize=(12, 8))
+    
+    # Only plot the metrics that exist in the dictionary
+    if 'FPR' in metrics_dict:
+        plt.plot(metrics_dict['FPR'], label='False Positive Rate (FPR)', marker='o')
+    if 'FNR' in metrics_dict:
+        plt.plot(metrics_dict['FNR'], label='False Negative Rate (FNR)', marker='o')
+    if 'TNR' in metrics_dict:
+        plt.plot(metrics_dict['TNR'], label='True Negative Rate (TNR)', marker='o')
+    if 'TPR' in metrics_dict:
+        plt.plot(metrics_dict['TPR'], label='True Positive Rate (TPR)', marker='o')
+    if 'F1' in metrics_dict:
+        plt.plot(metrics_dict['F1'], label='F1 Score', marker='o')
+    if 'accuracy' in metrics_dict:
+        plt.plot(metrics_dict['accuracy'], label='Accuracy', marker='o')
+    if 'precision' in metrics_dict:
+        plt.plot(metrics_dict['precision'], label='Precision', marker='o')
+    if 'MCC' in metrics_dict:
+        plt.plot(metrics_dict['MCC'], label='Matthews Correlation Coefficient (MCC)', marker='o')
+    if 'recall' in metrics_dict:
+        plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o')
+
+    # Apply log scale by default
+    plt.yscale('log')
+
+    # If the plot is close to 0, set custom ticks
+    if is_close_to_0:
+        # Add more ticks between 0 and 1 (using a logarithmic scale)
+        plt.yticks([0.01, 0.1, 1, 10, 100], ['0.01', '0.1', '1', '10', '100'])
+
+    plt.xlabel('Index')
+    plt.ylabel('Metric Value')
+    plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})')
+    plt.legend()
+    
+    # Save the plot
+    plt.savefig(output_filename)
+    plt.close()
+
+def main():
+    if len(sys.argv) != 2:
+        print("Usage: python script.py <file_path>")
+        sys.exit(1)
+    
+    file_path = sys.argv[1]
+    
+    FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path)
+    plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values)
+
+if __name__ == "__main__":
+    main()

From 9ddaf31f83a34962af33188b0f88176dc8ec33fd Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:20:52 +0000
Subject: [PATCH 214/455] Fix plots

---
 .../plot_testing_performance.py               | 25 ++++++++++++-------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index 8f9e12cd86..69b8c96a8c 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -71,11 +71,11 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu
         close_to_1['MCC'].append(MCC_values[i])
         close_to_1['recall'].append(recall_values[i])
 
-    # Plot metrics for values close to 0
+    # Plot metrics for values close to 0 (linear scale)
     plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True)
     
-    # Plot metrics for values close to 1
-    plot_single_group(close_to_1, 'metrics_plot_close_to_1.png')
+    # Plot metrics for values close to 1 (log scale)
+    plot_single_group(close_to_1, 'metrics_plot_close_to_1.png', is_close_to_0=False)
 
     # Print the final values
     print("\nFinal Metric Values:")
@@ -112,14 +112,21 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):
     if 'recall' in metrics_dict:
         plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o')
 
-    # Apply log scale by default
-    plt.yscale('log')
+    # If the plot is close to 1, apply log scale
+    if not is_close_to_0:
+        plt.yscale('log')
 
-    # If the plot is close to 0, set custom ticks
+    # If the plot is close to 0, set dynamic Y-ticks based on the min/max values of the series
     if is_close_to_0:
-        # Manually set more Y-ticks for better visibility
-        plt.ylim(0.0001, 1)  # Set Y-axis limits between 0.0001 and 1
-        plt.yticks([0.0001, 0.001, 0.01, 0.1, 1], ['0.0001', '0.001', '0.01', '0.1', '1'])  # Adjust Y-ticks
+        min_val = min(min(metrics_dict['FPR']), min(metrics_dict['FNR']))
+        max_val = max(max(metrics_dict['FPR']), max(metrics_dict['FNR']))
+        
+        # Avoid log(0), so set the minimum limit a little higher than zero
+        if min_val == 0:
+            min_val = 1e-4  # Avoid zero values on the logarithmic scale
+
+        plt.ylim(min_val, max_val)  # Set Y-axis limits based on the data range
+        plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6))  # Set ticks logarithmically
 
     plt.xlabel('Index')
     plt.ylabel('Metric Value')

From 878812adb8ffbdb24c82525a2b45580dd2aad4d5 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:23:02 +0000
Subject: [PATCH 215/455] Fix plots

---
 modules/flowmldetection/plot_testing_performance.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index 69b8c96a8c..de4ada38b3 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -123,10 +123,10 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):
         
         # Avoid log(0), so set the minimum limit a little higher than zero
         if min_val == 0:
-            min_val = 1e-4  # Avoid zero values on the logarithmic scale
+            min_val = 1e-8  # Avoid zero values on the logarithmic scale
 
         plt.ylim(min_val, max_val)  # Set Y-axis limits based on the data range
-        plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6))  # Set ticks logarithmically
+        plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=60))  # Set ticks logarithmically
 
     plt.xlabel('Index')
     plt.ylabel('Metric Value')

From b1909a50ed00fe86cebd6b037556ee7f5a419403 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:25:58 +0000
Subject: [PATCH 216/455] Change plot names

---
 modules/flowmldetection/plot_testing_performance.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index de4ada38b3..1b4152c6eb 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -72,10 +72,10 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu
         close_to_1['recall'].append(recall_values[i])
 
     # Plot metrics for values close to 0 (linear scale)
-    plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True)
+    plot_single_group(close_to_0, 'performance_metrics_testing_close_to_0.png', is_close_to_0=True)
     
     # Plot metrics for values close to 1 (log scale)
-    plot_single_group(close_to_1, 'metrics_plot_close_to_1.png', is_close_to_0=False)
+    plot_single_group(close_to_1, 'performnace_metrics_teting_close_to_1.png', is_close_to_0=False)
 
     # Print the final values
     print("\nFinal Metric Values:")

From 213b6a5b6597b8b568ee45755d44b5e334c668b7 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:26:09 +0000
Subject: [PATCH 217/455] Rename file

---
 .../{plot_train_score.py => plot_train_performance.py}          | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 rename modules/flowmldetection/{plot_train_score.py => plot_train_performance.py} (97%)

diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_performance.py
similarity index 97%
rename from modules/flowmldetection/plot_train_score.py
rename to modules/flowmldetection/plot_train_performance.py
index 8437e968ac..80e13e9515 100644
--- a/modules/flowmldetection/plot_train_score.py
+++ b/modules/flowmldetection/plot_train_performance.py
@@ -28,7 +28,7 @@ def plot_log_data(file_path):
 
     # Get the directory of the log file to store the plot in the same folder
     dir_name = os.path.dirname(file_path)
-    plot_file = os.path.join(dir_name, 'log_data_plot_with_two_scales.png')
+    plot_file = os.path.join(dir_name, 'performance_metrics_training.png')
 
     # Plotting the values
     fig, ax1 = plt.subplots(figsize=(10, 6))

From 20db5dbd1db02d06af5a6a9d7b6bb27e0e40a66f Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:31:32 +0000
Subject: [PATCH 218/455] Recover good flowmldetection deleted by mistake

---
 modules/flowmldetection/flowmldetection.py | 709 ++++++++++++++++-----
 1 file changed, 566 insertions(+), 143 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 37f0761109..5e4e9aa462 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -1,143 +1,566 @@
-import matplotlib.pyplot as plt
-import sys
-import numpy as np
-
-def process_file(file_path):
-    # Initialize the counters for the values
-    FPR_values = []
-    FNR_values = []
-    TNR_values = []
-    TPR_values = []
-    F1_values = []
-    accuracy_values = []
-    precision_values = []
-    MCC_values = []
-    recall_values = []
-    
-    # Read the file and extract the data
-    with open(file_path, 'r') as file:
-        for line in file:
-            if "TP:" in line:
-                # Extract the values from the line
-                parts = line.split(',')
-                TP = int(parts[0].split(':')[1].strip())
-                TN = int(parts[1].split(':')[1].strip())
-                FP = int(parts[2].split(':')[1].strip())
-                FN = int(parts[3].split(':')[1].strip())
-
-                # Calculate metrics
-                FPR = FP / (FP + TN) if (FP + TN) != 0 else 0
-                FNR = FN / (FN + TP) if (FN + TP) != 0 else 0
-                TNR = TN / (TN + FP) if (TN + FP) != 0 else 0
-                TPR = TP / (TP + FN) if (TP + FN) != 0 else 0
-                Precision = TP / (TP + FP) if (TP + FP) != 0 else 0
-                Recall = TPR  # Recall is the same as TPR
-                F1 = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) != 0 else 0
-                Accuracy = (TP + TN) / (TP + TN + FP + FN)
-                MCC = ((TP * TN) - (FP * FN)) / np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) if ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) != 0 else 0
-                
-                # Append the values to the respective lists
-                FPR_values.append(FPR)
-                FNR_values.append(FNR)
-                TNR_values.append(TNR)
-                TPR_values.append(TPR)
-                F1_values.append(F1)
-                accuracy_values.append(Accuracy)
-                precision_values.append(Precision)
-                MCC_values.append(MCC)
-                recall_values.append(Recall)
-    
-    return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values
-
-def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values):
-    # Separate the values into two groups based on their proximity to 0 or 1
-    close_to_0 = {
-        'FPR': [], 'FNR': []
-    }
-    close_to_1 = {
-        'TNR': [], 'TPR': [], 'F1': [], 'accuracy': [], 'precision': [], 'MCC': [], 'recall': []
-    }
-    
-    # Categorize the metrics into two groups
-    for i in range(len(FPR_values)):
-        close_to_0['FPR'].append(FPR_values[i])
-        close_to_0['FNR'].append(FNR_values[i])
-        
-        close_to_1['TNR'].append(TNR_values[i])
-        close_to_1['TPR'].append(TPR_values[i])
-        close_to_1['F1'].append(F1_values[i])
-        close_to_1['accuracy'].append(accuracy_values[i])
-        close_to_1['precision'].append(precision_values[i])
-        close_to_1['MCC'].append(MCC_values[i])
-        close_to_1['recall'].append(recall_values[i])
-
-    # Plot metrics for values close to 0
-    plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True)
-    
-    # Plot metrics for values close to 1
-    plot_single_group(close_to_1, 'metrics_plot_close_to_1.png')
-
-    # Print the final values
-    print("\nFinal Metric Values:")
-    print(f"Final FPR: {FPR_values[-1]:.4f}")
-    print(f"Final FNR: {FNR_values[-1]:.4f}")
-    print(f"Final TNR: {TNR_values[-1]:.4f}")
-    print(f"Final TPR: {TPR_values[-1]:.4f}")
-    print(f"Final F1 Score: {F1_values[-1]:.4f}")
-    print(f"Final Accuracy: {accuracy_values[-1]:.4f}")
-    print(f"Final Precision: {precision_values[-1]:.4f}")
-    print(f"Final MCC: {MCC_values[-1]:.4f}")
-    print(f"Final Recall: {recall_values[-1]:.4f}")
-
-def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):
-    plt.figure(figsize=(12, 8))
-    
-    # Only plot the metrics that exist in the dictionary
-    if 'FPR' in metrics_dict:
-        plt.plot(metrics_dict['FPR'], label='False Positive Rate (FPR)', marker='o')
-    if 'FNR' in metrics_dict:
-        plt.plot(metrics_dict['FNR'], label='False Negative Rate (FNR)', marker='o')
-    if 'TNR' in metrics_dict:
-        plt.plot(metrics_dict['TNR'], label='True Negative Rate (TNR)', marker='o')
-    if 'TPR' in metrics_dict:
-        plt.plot(metrics_dict['TPR'], label='True Positive Rate (TPR)', marker='o')
-    if 'F1' in metrics_dict:
-        plt.plot(metrics_dict['F1'], label='F1 Score', marker='o')
-    if 'accuracy' in metrics_dict:
-        plt.plot(metrics_dict['accuracy'], label='Accuracy', marker='o')
-    if 'precision' in metrics_dict:
-        plt.plot(metrics_dict['precision'], label='Precision', marker='o')
-    if 'MCC' in metrics_dict:
-        plt.plot(metrics_dict['MCC'], label='Matthews Correlation Coefficient (MCC)', marker='o')
-    if 'recall' in metrics_dict:
-        plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o')
-
-    # Apply log scale by default
-    plt.yscale('log')
-
-    # If the plot is close to 0, set custom ticks
-    if is_close_to_0:
-        # Add more ticks between 0 and 1 (using a logarithmic scale)
-        plt.yticks([0.01, 0.1, 1, 10, 100], ['0.01', '0.1', '1', '10', '100'])
-
-    plt.xlabel('Index')
-    plt.ylabel('Metric Value')
-    plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})')
-    plt.legend()
-    
-    # Save the plot
-    plt.savefig(output_filename)
-    plt.close()
-
-def main():
-    if len(sys.argv) != 2:
-        print("Usage: python script.py <file_path>")
-        sys.exit(1)
-    
-    file_path = sys.argv[1]
-    
-    FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path)
-    plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values)
-
-if __name__ == "__main__":
-    main()
+# SPDX-FileCopyrightText: 2021 Sebastian Garcia <sebastian.garcia@agents.fel.cvut.cz>
+from typing import Optional
+
+# SPDX-License-Identifier: GPL-2.0-only
+import numpy
+from sklearn.linear_model import SGDClassifier
+from sklearn.preprocessing import StandardScaler
+import pickle
+import pandas as pd
+import json
+import traceback
+import warnings
+import os
+
+from slips_files.common.parsers.config_parser import ConfigParser
+from slips_files.common.slips_utils import utils
+from slips_files.common.abstracts.module import IModule
+from slips_files.core.structures.evidence import (
+    Evidence,
+    ProfileID,
+    TimeWindow,
+    Attacker,
+    ThreatLevel,
+    EvidenceType,
+    IoCType,
+    Direction,
+    Victim,
+    Method,
+)
+
+# This horrible hack is only to stop sklearn from printing those warnings
+def warn(*args, **kwargs):
+    pass
+
+
+warnings.warn = warn
+
+
+class FlowMLDetection(IModule):
+    # Name: short name of the module. Do not use spaces
+    name = "Flow ML Detection"
+    description = (
+        "Train or test a Machine Learning model to detect malicious flows"
+    )
+    authors = ["Sebastian Garcia"]
+
+    def init(self):
+        # Subscribe to the channel
+        self.c1 = self.db.subscribe("new_flow")
+        self.channels = {"new_flow": self.c1}
+        self.fieldseparator = self.db.get_field_separator()
+        # Set the output queue of our database instance
+        # Read the configuration
+        self.read_configuration()
+        # Minum amount of new labels needed to start the train
+        self.minimum_labels_to_start_train = 50
+        # Minum amount of new labels needed to retrain
+        self.minimum_labels_to_retrain = 50
+        # The number of flows when last trained. Used internally only to know
+        # when to retrain
+        self.last_number_of_flows_when_trained = 0
+        # The scaler trained during training and to use during testing
+        self.scaler = StandardScaler()
+        self.model_path = "./modules/flowmldetection/model.bin"
+        self.scaler_path = "./modules/flowmldetection/scaler.bin"
+
+        # Initialize the training log file
+        self.training_log_path = "./modules/flowmldetection/training.log"
+        with open(self.training_log_path, "w") as log_file:
+            log_file.write("Training Log Initialized\n")
+
+    def read_configuration(self):
+        conf = ConfigParser()
+        self.mode = conf.get_ml_mode()
+        # This is the global label in the configuration,
+        # in case the flows do not have a label themselves
+        self.label = conf.label()
+
+    def write_to_training_log(self, message: str):
+        """
+        Write a message to the training log file.
+        """
+        try:
+            with open(self.training_log_path, "a") as log_file:
+                log_file.write(message + "\n")
+        except Exception as e:
+            self.print(f"Error writing to training log: {e}", 0, 1)
+
+    def train(self, sum_labeled_flows):
+        """
+        Train a model based on the flows we receive and the labels
+        """
+        try:
+            # Create X_flow with the current flows minus the label
+            X_flow = self.flows.drop("ground_truth_label", axis=1)
+            # Drop the detailed labels
+            X_flow = X_flow.drop("detailed_ground_truth_label", axis=1)
+            # Drop the module_labels
+            X_flow = X_flow.drop("module_labels", axis=1)
+            # Create y_flow with the label
+            y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label)
+
+            # Normalize this batch of data so far. This can get progressivle slow
+            X_flow = self.scaler.fit_transform(X_flow)
+
+            # Count the number of labels of each type in this epoc
+            epoch_label_counts = {
+                "Background": (y_flow == "Background").sum(),
+                "Malicious": (y_flow == "Malicious").sum(),
+                "Benign": (y_flow == "Benign").sum(),
+            }
+
+            # Train
+            try:
+                # Online incremental learning
+                self.clf.partial_fit(
+                    X_flow, y_flow, classes=["Background", "Malicious", "Benign"]
+                )
+            except Exception:
+                self.print("Error while calling clf.train()")
+                self.print(traceback.format_exc(), 0, 1)
+
+            # See score so far in training
+            score = self.clf.score(X_flow, y_flow)
+
+            #self.print(f"	Training Score: {score}", 1, 0)
+            #self.print(f'    Model Parameters: {self.clf.coef_}', 1, 0)
+
+            # Store the models on disk
+            self.store_model()
+
+            # Log training information
+            self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}")
+            #self.write_to_training_log(f"Model parameters: {self.clf.coef_}")
+        except Exception:
+            self.print("Error in train().", 0, 1)
+            self.print(traceback.format_exc(), 0, 1)
+            self.write_to_training_log("Error occurred during training.")
+
+    def process_features(self, dataset):
+        """
+        Discards some features of the dataset and can create new.
+        Clean the dataset
+        """
+        try:
+            # Discard some type of flows that dont have ports
+            to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""]
+            for proto in to_discard:
+                dataset = dataset[dataset.proto != proto]
+
+            # If te proto is in the list to delete and there is only one flow, then the dataset will be empty
+            if dataset.empty:
+                # DataFrame is empty now, so return empty
+                return dataset
+
+            # For now, discard these
+            to_drop = [
+                "appproto",
+                "daddr",
+                "saddr",
+                "starttime",
+                "type_",
+                "smac",
+                "dmac",
+                "history",
+                "uid",
+                "dir_",
+                "endtime",
+                "flow_source",
+            ]
+            for field in to_drop:
+                try:
+                    dataset = dataset.drop(field, axis=1)
+                except (ValueError, KeyError):
+                    pass
+
+            # When flows are read from Slips sqlite,
+            # the state is not transformed to 'Established' or
+            # 'Not Established', it is still 'S0' and others
+            # So transform here
+            dataset["state"] = dataset.apply(
+                lambda row: self.db.get_final_state_from_flags(
+                    row["state"], (row["spkts"] + row["dpkts"])
+                ),
+                axis=1,
+            )
+
+            # Convert state to categorical
+            dataset.state = dataset.state.str.replace(
+                r"(^.*Not Established.*$)", "0", regex=True
+            )
+            dataset.state = dataset.state.str.replace(
+                r"(^.*Established.*$)", "1", regex=True
+            )
+
+            # Convert categories to floats
+            dataset.state = dataset.state.astype("float64")
+
+            # Convert proto to categorical. For now we only have few states, so we can hardcode...
+            # We dont use the data to create categories because in testing mode
+            # we dont see all the protocols
+            # Also we dont store the Categorizer because the user can retrain
+            # with its own data.
+            dataset.proto = dataset.proto.str.lower()
+            dataset.proto = dataset.proto.str.replace(
+                r"(^.*tcp.*$)", "0", regex=True
+            )
+            dataset.proto = dataset.proto.str.replace(
+                r"(^.*udp.*$)", "1", regex=True
+            )
+            dataset.proto = dataset.proto.str.replace(
+                r"(^.*icmp.*$)", "2", regex=True
+            )
+            dataset.proto = dataset.proto.str.replace(
+                r"(^.*icmp-ipv6.*$)", "3", regex=True
+            )
+            dataset.proto = dataset.proto.str.replace(
+                r"(^.*arp.*$)", "4", regex=True
+            )
+
+            dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"]
+            dataset["pkts"] = dataset["spkts"] + dataset["dpkts"]
+
+            fields_to_convert_to_float = [
+                dataset.proto,
+                dataset.dport,
+                dataset.sport,
+                dataset.dur,
+                dataset.pkts,
+                dataset.spkts,
+                dataset.allbytes,
+                dataset.sbytes,
+                dataset.state,
+            ]
+            for field in fields_to_convert_to_float:
+                try:
+                    field = field.astype("float64")
+                except (ValueError, AttributeError):
+                    pass
+
+            return dataset
+        except Exception:
+            # Stop the timer
+            self.print("Error in process_features()")
+            self.print(traceback.format_exc(), 0, 1)
+
+    def process_training_flows(self, last_number_of_flows_when_trained):
+        """
+        Process only the new flows in the DB since the last training.
+        Store the pandas df in self.flows
+        """
+        try:
+            # Ensure the index is an integer
+            if last_number_of_flows_when_trained is None:
+                last_number_of_flows_when_trained = 0
+            else:
+                last_number_of_flows_when_trained = int(last_number_of_flows_when_trained)
+
+            # We get all the flows so far
+            flows = self.db.get_all_flows()
+            # Only process new flows since last training
+            new_flows = flows[last_number_of_flows_when_trained:]
+
+            # Check how many **different** labels are in the DB
+            labels = self.db.get_labels()
+            if len(labels) == 1:
+                # Insert fake flows for both classes if needed
+                new_flows.append(
+                    {
+                        "starttime": 1594417039.029793,
+                        "dur": "1.9424750804901123",
+                        "saddr": "10.7.10.101",
+                        "sport": "49733",
+                        "daddr": "40.70.224.145",
+                        "dport": "443",
+                        "proto": "tcp",
+                        "state": "SF",
+                        "spkts": 17,
+                        "dpkts": 27,
+                        "sbytes": 25517,
+                        "dbytes": 17247,
+                        "appproto": "ssl",
+                        "ground_truth_label": "Malicious",
+                        "module_labels": {
+                            "flowalerts-long-connection": "Malicious"
+                        },
+                    }
+                )
+                new_flows.append(
+                    {
+                        "starttime": 1382355032.706468,
+                        "dur": "10.896695",
+                        "saddr": "147.32.83.52",
+                        "sport": "47956",
+                        "daddr": "80.242.138.72",
+                        "dport": "80",
+                        "proto": "tcp",
+                        "state": "SF",
+                        "spkts": 1,
+                        "dpkts": 0,
+                        "sbytes": 100,
+                        "dbytes": 67596,
+                        "appproto": "http",
+                        "ground_truth_label": "Benign",
+                        "module_labels": {
+                            "flowalerts-long-connection": "Benign"
+                        },
+                    }
+                )
+
+            # Convert to pandas df
+            df_flows = pd.DataFrame(new_flows)
+
+            # Process features
+            df_flows = self.process_features(df_flows)
+
+            # Update the flow to the processed version
+            self.flows = df_flows
+        except Exception:
+            self.print("Error in process_flows()")
+            self.print(traceback.format_exc(), 0, 1)
+
+    def process_flow(self, flow_to_process: dict):
+        """
+        Process one flow. Only used during detection in testing
+        returns the pandas df with the processed flow
+        """
+        try:
+            # Convert the flow to a pandas dataframe
+            raw_flow = pd.DataFrame(flow_to_process, index=[0])
+            dflow = self.process_features(raw_flow)
+            if dflow.empty:
+                return None
+            # Update the flow to the processed version
+            return dflow
+        except Exception:
+            # Stop the timer
+            self.print("Error in process_flow()")
+            self.print(traceback.format_exc(), 0, 1)
+
+    def detect(self, x_flow) -> Optional[numpy.ndarray]:
+        """
+        Detects the given flow with the current model stored
+        and returns the predection array
+        """
+        try:
+            # clean the flow
+            fields_to_drop = [
+                "label",
+                "module_labels",
+                "uid",
+                "history",
+                "dir_",
+                "endtime",
+                "flow_source",
+                "ground_truth_label",
+                "detailed_ground_truth_label",
+            ]
+            # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes.
+            # Error
+            ''' [Flow ML Detection] Error in detect() while processing                                                                                                                                                                                                                         
+            dur proto  sport dport  state  pkts  spkts  dpkts  bytes  sbytes  dbytes  allbytes                                                                                                                                                                                    
+            0  63.822830     0  56119   981    0.0    15     15      0   8764    1887       0      1887                                                                                                                                                                                    
+            The feature names should match those that were passed during fit.                                                                                                                                                                                                              
+            Feature names unseen at fit time:                                                                                                                                                                                                                                              
+            - bytes     
+            '''
+
+            # IF we delete here the filed bytes the error is
+            # [Flow ML Detection] Error in detect() while processing 
+            # dur proto sport dport  state  pkts  spkts  dpkts  sbytes  dbytes allbytes                                                                                                                                                             
+            # 0  63.822830     0  56120   980    0.0    15     15      0    1887       0      1887                                                                                                                                                             
+            # The feature names should match those that were passed during fit.                                                                                                                                                                                
+            # Feature names must be in the same order as they were in fit.                                                                                                                                                                                     
+                                                                                                                                                                                                                                                 
+            for field in fields_to_drop:
+                try:
+                    x_flow = x_flow.drop(field, axis=1)
+                except (KeyError, ValueError):
+                    pass
+            # Scale the flow
+            x_flow: numpy.ndarray = self.scaler.transform(x_flow)
+            pred: numpy.ndarray = self.clf.predict(x_flow)
+            return pred
+        except Exception as e:
+            self.print(
+                f"Error in detect() while processing " f"\n{x_flow}\n{e}"
+            )
+            self.print(traceback.format_exc(), 0, 1)
+
+    def store_model(self):
+        """
+        Store the trained model on disk
+        """
+        self.print("Storing the trained model and scaler on disk.", 0, 2)
+        with open(self.model_path, "wb") as f:
+            data = pickle.dumps(self.clf)
+            f.write(data)
+        with open(self.scaler_path, "wb") as g:
+            data = pickle.dumps(self.scaler)
+            g.write(data)
+
+    def read_model(self):
+        """
+        Read the trained model from disk
+        """
+        try:
+            self.print("Reading the trained model from disk.", 0, 2)
+            with open(self.model_path, "rb") as f:
+                self.clf = pickle.load(f)
+            self.print("Reading the trained scaler from disk.", 0, 2)
+            with open(self.scaler_path, "rb") as g:
+                self.scaler = pickle.load(g)
+        except FileNotFoundError:
+            # If there is no model, create one empty
+            self.print(
+                "There was no model. " "Creating a new empty model.", 0, 2
+            )
+            self.clf = SGDClassifier(
+                warm_start=True, loss="hinge", penalty="l1"
+            )
+        except EOFError:
+            self.print(
+                "Error reading model from disk. "
+                "Creating a new empty model.",
+                0,
+                2,
+            )
+            self.clf = SGDClassifier(
+                warm_start=True, loss="hinge", penalty="l1"
+            )
+
+    def set_evidence_malicious_flow(self, flow: dict, twid: str):
+        confidence: float = 0.1
+        description = (
+            f"Flow with malicious characteristics by ML. Src IP"
+            f" {flow['saddr']}:{flow['sport']} to "
+            f"{flow['daddr']}:{flow['dport']}"
+        )
+        twid_number = int(twid.replace("timewindow", ""))
+        evidence: Evidence = Evidence(
+            evidence_type=EvidenceType.MALICIOUS_FLOW,
+            attacker=Attacker(
+                direction=Direction.SRC,
+                ioc_type=IoCType.IP,
+                value=flow["saddr"],
+            ),
+            victim=Victim(
+                direction=Direction.DST,
+                ioc_type=IoCType.IP,
+                value=flow["daddr"],
+            ),
+            threat_level=ThreatLevel.LOW,
+            confidence=confidence,
+            description=description,
+            profile=ProfileID(ip=flow["saddr"]),
+            timewindow=TimeWindow(twid_number),
+            uid=[flow["uid"]],
+            timestamp=flow["starttime"],
+            method=Method.AI,
+            src_port=flow["sport"],
+            dst_port=flow["dport"],
+        )
+
+        self.db.set_evidence(evidence)
+
+    def shutdown_gracefully(self):
+        # Confirm that the module is done processing
+        if self.mode == "train":
+            self.store_model()
+
+    def pre_main(self):
+        utils.drop_root_privs()
+        # Load the model
+        self.read_model()
+
+    def main(self):
+        if msg := self.get_msg("new_flow"):
+            # When a new flow arrives
+            msg = json.loads(msg["data"])
+            self.twid = msg["twid"]
+            self.profileid = msg["profileid"]
+            self.flow = msg["flow"]
+            # These following extra fields are expected in testing. update the original
+            # flow dict to have them
+            self.flow.update(
+                {
+                    "state": msg["interpreted_state"],
+                    "label": msg["label"],
+                    "module_labels": msg["module_labels"],
+                }
+            )
+
+            if self.mode == "train":
+                # We are training
+
+                # Is the amount in the DB of labels enough to retrain?
+                # Use labeled flows
+                labels = self.db.get_labels()
+                sum_labeled_flows = sum(i[1] for i in labels)
+
+                # The min labels to retrain is the min number of flows 
+                # we should have seen so far in this capture to start training
+                # This is so we dont _start_ training with only 1 flow
+
+                # Once we are over the start minimum, the second condition is 
+                # to force to retrain every a minimum_labels_to_retrain number
+                # of flows. So we dont retrain every 1 flow.
+                if (
+                    sum_labeled_flows >= self.minimum_labels_to_start_train
+                ):
+                    if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain):
+                        # So for example we retrain every 50 labels and only when
+                        # we have at least 50 labels
+                        self.print(
+                            f"Training the model with the last group of "
+                            f"flows and labels. Total flows: {sum_labeled_flows}."
+                        )
+                        # Process all flows in the DB and make them ready
+                        # for pandas
+                        self.process_training_flows(self.last_number_of_flows_when_trained)
+                        # Train an algorithm
+                        self.train(sum_labeled_flows)
+                        self.last_number_of_flows_when_trained = sum_labeled_flows
+
+            elif self.mode == "test":
+                # We are testing, which means using the model to detect
+                processed_flow = self.process_flow(self.flow)
+
+                # After processing the flow, it may happen that we
+                # delete icmp/arp/etc so the dataframe can be empty
+                if processed_flow is not None and not processed_flow.empty:
+                    # Predict
+                    pred: numpy.ndarray = self.detect(processed_flow)
+                    if not pred:
+                        # an error occurred
+                        return
+
+                    label = self.flow["label"]
+                    if label and label != "unknown" and label != pred[0]:
+                        # If the user specified a label in test mode,
+                        # and the label is diff from the prediction,
+                        # print in debug mode
+                        self.print(
+                            f"Predicted {pred[0]} for ground-truth label"
+                            f' {label}. Flow {self.flow["saddr"]}:'
+                            f'{self.flow["sport"]} ->'
+                            f' {self.flow["daddr"]}:'
+                            f'{self.flow["dport"]}/'
+                            f'{self.flow["proto"]}',
+                            0,
+                            3,
+                        )
+                    if pred[0] == "Malicious":
+                        # Generate an alert
+                        self.set_evidence_malicious_flow(self.flow, self.twid)
+                        self.print(
+                            f"Prediction {pred[0]} for label {label}"
+                            f' flow {self.flow["saddr"]}:'
+                            f'{self.flow["sport"]} -> '
+                            f'{self.flow["daddr"]}:'
+                            f'{self.flow["dport"]}/'
+                            f'{self.flow["proto"]}',
+                            0,
+                            2,
+                        )
\ No newline at end of file

From 01a1a6156e0d0626e327d683cb828d44475e9eab Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:31:43 +0000
Subject: [PATCH 219/455] Fix plot test

---
 modules/flowmldetection/plot_testing_performance.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index 1b4152c6eb..977a68b2d5 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -75,7 +75,7 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu
     plot_single_group(close_to_0, 'performance_metrics_testing_close_to_0.png', is_close_to_0=True)
     
     # Plot metrics for values close to 1 (log scale)
-    plot_single_group(close_to_1, 'performnace_metrics_teting_close_to_1.png', is_close_to_0=False)
+    plot_single_group(close_to_1, 'performance_metrics_testing_close_to_1.png', is_close_to_0=False)
 
     # Print the final values
     print("\nFinal Metric Values:")
@@ -123,10 +123,10 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):
         
         # Avoid log(0), so set the minimum limit a little higher than zero
         if min_val == 0:
-            min_val = 1e-8  # Avoid zero values on the logarithmic scale
+            min_val = 1e-4  # Avoid zero values on the logarithmic scale
 
         plt.ylim(min_val, max_val)  # Set Y-axis limits based on the data range
-        plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=60))  # Set ticks logarithmically
+        plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6))  # Set ticks logarithmically
 
     plt.xlabel('Index')
     plt.ylabel('Metric Value')

From 0b51f71948efe37e361836cb04bfcedba58dad66 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:50:33 +0000
Subject: [PATCH 220/455] Add testing code to evaluate performance. It is
 optional with a variable

---
 modules/flowmldetection/flowmldetection.py | 60 +++++++++++++++-------
 1 file changed, 42 insertions(+), 18 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 5e4e9aa462..b17a1baaf0 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -526,36 +526,21 @@ def main(self):
             elif self.mode == "test":
                 # We are testing, which means using the model to detect
                 processed_flow = self.process_flow(self.flow)
-
                 # After processing the flow, it may happen that we
                 # delete icmp/arp/etc so the dataframe can be empty
                 if processed_flow is not None and not processed_flow.empty:
+                    original_label = processed_flow["ground_truth_label"].iloc[0]
                     # Predict
                     pred: numpy.ndarray = self.detect(processed_flow)
                     if not pred:
                         # an error occurred
                         return
 
-                    label = self.flow["label"]
-                    if label and label != "unknown" and label != pred[0]:
-                        # If the user specified a label in test mode,
-                        # and the label is diff from the prediction,
-                        # print in debug mode
-                        self.print(
-                            f"Predicted {pred[0]} for ground-truth label"
-                            f' {label}. Flow {self.flow["saddr"]}:'
-                            f'{self.flow["sport"]} ->'
-                            f' {self.flow["daddr"]}:'
-                            f'{self.flow["dport"]}/'
-                            f'{self.flow["proto"]}',
-                            0,
-                            3,
-                        )
                     if pred[0] == "Malicious":
                         # Generate an alert
                         self.set_evidence_malicious_flow(self.flow, self.twid)
                         self.print(
-                            f"Prediction {pred[0]} for label {label}"
+                            f"Prediction {pred[0]} for label {original_label}"
                             f' flow {self.flow["saddr"]}:'
                             f'{self.flow["sport"]} -> '
                             f'{self.flow["daddr"]}:'
@@ -563,4 +548,43 @@ def main(self):
                             f'{self.flow["proto"]}',
                             0,
                             2,
-                        )
\ No newline at end of file
+                        )
+
+                    # So you can disable this code easily. Since it is used only for evaluating a testing
+                    log_testing_data = True
+                    if log_testing_data:
+                        # Initialize counters if not already done
+                        if not hasattr(self, 'tp'):
+                            self.tp = 0
+                        if not hasattr(self, 'tn'):
+                            self.tn = 0
+                        if not hasattr(self, 'fp'):
+                            self.fp = 0
+                        if not hasattr(self, 'fn'):
+                            self.fn = 0
+
+
+                        # Update counters based on predictions and labels
+                        if pred[0] == "Malicious" and original_label == "Malicious":
+                            self.tp += 1
+                        elif pred[0] == "Benign" and original_label == "Benign":
+                            self.tn += 1
+                        elif pred[0] == "Malicious" and original_label == "Benign":
+                            self.fp += 1
+                        elif pred[0] == "Benign" and original_label == "Malicious":
+                            self.fn += 1
+
+                        testing_log_path = "./modules/flowmldetection/testing_performance.log"
+                        try:
+                            with open(testing_log_path, "a") as log_file:
+                                log_file.write("Testing Performance Log Initialized\n")
+                                # Log the testing performance metrics
+                                log_file.write(f"TP: {self.tp}, TN: {self.tn}, FP: {self.fp}, FN: {self.fn}\n")
+
+                                # Log the original flow for false positives and false negatives
+                                if pred[0] == "Malicious" and original_label == "Benign":
+                                    log_file.write(f"False Positive Flow: {self.flow}\n")
+                                elif pred[0] == "Benign" and original_label == "Malicious":
+                                    log_file.write(f"False Negative Flow: {self.flow}\n")
+                        except Exception as e:
+                            self.print(f"Error initializing testing performance log: {e}", 0, 1)
\ No newline at end of file

From e2da4cbde7d3b54ce2e90749bcd9e4c7bdbb8be2 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 19:04:00 +0000
Subject: [PATCH 221/455] Fix plots

---
 .../plot_testing_performance.py               | 30 +++++++++++--------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index 977a68b2d5..6865415cdf 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -1,6 +1,7 @@
 import matplotlib.pyplot as plt
 import sys
 import numpy as np
+import argparse
 
 def process_file(file_path):
     # Initialize the counters for the values
@@ -49,7 +50,7 @@ def process_file(file_path):
     
     return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values
 
-def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values):
+def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, experiment_number):
     # Separate the values into two groups based on their proximity to 0 or 1
     close_to_0 = {
         'FPR': [], 'FNR': []
@@ -72,13 +73,13 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu
         close_to_1['recall'].append(recall_values[i])
 
     # Plot metrics for values close to 0 (linear scale)
-    plot_single_group(close_to_0, 'performance_metrics_testing_close_to_0.png', is_close_to_0=True)
+    plot_single_group(close_to_0, f'performance_metrics_testing_close_to_0_experiment_{experiment_number}.png', experiment_number, is_close_to_0=True)
     
     # Plot metrics for values close to 1 (log scale)
-    plot_single_group(close_to_1, 'performance_metrics_testing_close_to_1.png', is_close_to_0=False)
+    plot_single_group(close_to_1, f'performance_metrics_testing_close_to_1_experiment_{experiment_number}.png', experiment_number, is_close_to_0=False)
 
     # Print the final values
-    print("\nFinal Metric Values:")
+    print("\nFinal Metric Values for Experiment", experiment_number)
     print(f"Final FPR: {FPR_values[-1]:.4f}")
     print(f"Final FNR: {FNR_values[-1]:.4f}")
     print(f"Final TNR: {TNR_values[-1]:.4f}")
@@ -89,7 +90,7 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu
     print(f"Final MCC: {MCC_values[-1]:.4f}")
     print(f"Final Recall: {recall_values[-1]:.4f}")
 
-def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):
+def plot_single_group(metrics_dict, output_filename, experiment_number, is_close_to_0=False):
     plt.figure(figsize=(12, 8))
     
     # Only plot the metrics that exist in the dictionary
@@ -126,11 +127,12 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):
             min_val = 1e-4  # Avoid zero values on the logarithmic scale
 
         plt.ylim(min_val, max_val)  # Set Y-axis limits based on the data range
-        plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6))  # Set ticks logarithmically
+        plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=60))  # Set ticks logarithmically
 
+    # Add the experiment number to the plot title
     plt.xlabel('Index')
     plt.ylabel('Metric Value')
-    plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})')
+    plt.title(f'Experiment {experiment_number} - Evaluation Metrics Over Time')
     plt.legend()
     
     # Save the plot
@@ -138,14 +140,18 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):
     plt.close()
 
 def main():
-    if len(sys.argv) != 2:
-        print("Usage: python script.py <file_path>")
-        sys.exit(1)
+    # Set up argument parsing
+    parser = argparse.ArgumentParser(description='Plot testing performance metrics.')
+    parser.add_argument('-f', '--file', type=str, required=True, help='Path to the testing performance log file')
+    parser.add_argument('-e', '--experiment', type=str, required=True, help='Experiment number')
+
+    args = parser.parse_args()
     
-    file_path = sys.argv[1]
+    file_path = args.file
+    experiment_number = args.experiment
     
     FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path)
-    plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values)
+    plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, experiment_number)
 
 if __name__ == "__main__":
     main()

From e174fc4574b68e1aa2dedfdab223d3b42c60f282 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 19:14:51 +0000
Subject: [PATCH 222/455] Fix train plot

---
 .../flowmldetection/plot_train_performance.py | 22 ++++++++++---------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/modules/flowmldetection/plot_train_performance.py b/modules/flowmldetection/plot_train_performance.py
index 80e13e9515..244df13d28 100644
--- a/modules/flowmldetection/plot_train_performance.py
+++ b/modules/flowmldetection/plot_train_performance.py
@@ -5,7 +5,7 @@
 import argparse
 import os
 
-def plot_log_data(file_path):
+def plot_log_data(file_path, experiment_number):
     # Read the log data from the file
     with open(file_path, 'r') as file:
         log_data = file.read()
@@ -28,7 +28,8 @@ def plot_log_data(file_path):
 
     # Get the directory of the log file to store the plot in the same folder
     dir_name = os.path.dirname(file_path)
-    plot_file = os.path.join(dir_name, 'performance_metrics_training.png')
+    # Append experiment number to the filename
+    plot_file = os.path.join(dir_name, f'performance_metrics_training_{experiment_number}.png')
 
     # Plotting the values
     fig, ax1 = plt.subplots(figsize=(10, 6))
@@ -55,18 +56,18 @@ def plot_log_data(file_path):
     for i, value in enumerate(df["Total labels"]):
         ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom')
 
-    # Adding title and legend
-    plt.title('Training performance')
+    # Adding title and legend with experiment number in title
+    plt.title(f'Training performance - Experiment {experiment_number}')
     fig.tight_layout()
 
     # Move both legends further to the right
-    ax1.legend(loc='upper right', bbox_to_anchor=(1.26, 1), fontsize='small', ncol=1)
-    ax2.legend(loc='upper right', bbox_to_anchor=(1.4, 0.95), fontsize='small', ncol=1)
+    ax1.legend(loc='upper right', bbox_to_anchor=(1.3, 1), fontsize='small', ncol=1)
+    ax2.legend(loc='upper right', bbox_to_anchor=(1.3, 0.85), fontsize='small', ncol=1)
 
     # Increase right margin for better readability of legend
-    plt.subplots_adjust(right=0.7)
+    plt.subplots_adjust(right=0.75)
 
-    # Save plot to the same folder as the log file
+    # Save plot to the same folder as the log file with experiment number in filename
     plt.savefig(plot_file)
 
     # Display the plot
@@ -75,13 +76,14 @@ def plot_log_data(file_path):
 def main():
     # Parse command-line arguments
     parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.")
-    parser.add_argument('log_file', metavar='log_file', type=str, help="Path to the log file")
+    parser.add_argument('-f', '--file', metavar='log_file', type=str, required=True, help="Path to the log file")
+    parser.add_argument('-e', '--experiment', metavar='experiment_number', type=str, required=True, help="Experiment number to add to the filename")
     
     # Handle -h / --help
     args = parser.parse_args()
 
     # Call the function to process the log file
-    plot_log_data(args.log_file)
+    plot_log_data(args.file, args.experiment)
 
 if __name__ == "__main__":
     main()

From e7fdbfdbd1b5c3de8bb60227c4e02454abe5c993 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 21:14:48 +0000
Subject: [PATCH 223/455] Fix plots

---
 .../flowmldetection/plot_train_performance.py | 122 ++++++++++--------
 1 file changed, 71 insertions(+), 51 deletions(-)

diff --git a/modules/flowmldetection/plot_train_performance.py b/modules/flowmldetection/plot_train_performance.py
index 244df13d28..5212dfeeaf 100644
--- a/modules/flowmldetection/plot_train_performance.py
+++ b/modules/flowmldetection/plot_train_performance.py
@@ -4,85 +4,105 @@
 import sys
 import argparse
 import os
+import matplotlib.ticker as ticker
 
 def plot_log_data(file_path, experiment_number):
     # Read the log data from the file
     with open(file_path, 'r') as file:
         log_data = file.read()
 
-    # Define regex pattern to extract relevant data from each line
-    pattern = r"Background: (\d+). Benign: (\d+). Malicious: (\d+). Total labels: (\d+\.\d+). Score: (\d+\.\d+)"
+    # Regex pattern for the new log format
+    pattern = (
+        r"Total labels: ([\d\.]+), Background: (\d+). Benign: (\d+). Malicious: (\d+). Metrics: "
+        r"FPR=([\d\.]+), TNR=([\d\.]+), TPR=([\d\.]+), FNR=([\d\.]+), "
+        r"F1=([\d\.]+), Precision=([\d\.]+), Accuracy=([\d\.]+), MCC=([\d\.]+), Recall=([\d\.]+)\."
+    )
 
     # Parse the log file
     data = re.findall(pattern, log_data)
 
     # Convert data to a DataFrame
-    df = pd.DataFrame(data, columns=["Background", "Benign", "Malicious", "Total labels", "Score"])
+    columns = [
+        "Total labels", "Background", "Benign", "Malicious",
+        "FPR", "TNR", "TPR", "FNR", "F1", "Precision", "Accuracy", "MCC", "Recall"
+    ]
+    df = pd.DataFrame(data, columns=columns)
     df = df.astype({
+        "Total labels": float,
         "Background": int,
         "Benign": int,
         "Malicious": int,
-        "Total labels": float,
-        "Score": float
+        "FPR": float,
+        "TNR": float,
+        "TPR": float,
+        "FNR": float,
+        "F1": float,
+        "Precision": float,
+        "Accuracy": float,
+        "MCC": float,
+        "Recall": float,
     })
 
-    # Get the directory of the log file to store the plot in the same folder
     dir_name = os.path.dirname(file_path)
-    # Append experiment number to the filename
-    plot_file = os.path.join(dir_name, f'performance_metrics_training_{experiment_number}.png')
-
-    # Plotting the values
-    fig, ax1 = plt.subplots(figsize=(10, 6))
 
-    # Plotting Score on the left y-axis (with proper scaling from 0 to 1)
-    ax1.plot(df.index, df["Score"], label="Score", color='tab:blue')
+    # --- Plot 1: Number of labels (linear scale, no total labels) ---
+    fig1, ax1 = plt.subplots(figsize=(10, 6))
+    ax1.plot(df.index, df["Background"], label="Background", color='black')
+    ax1.plot(df.index, df["Benign"], label="Benign", color='cyan')
+    ax1.plot(df.index, df["Malicious"], label="Malicious", color='magenta')
     ax1.set_xlabel('Index')
-    ax1.set_ylabel('Score', color='tab:blue')
-    ax1.set_ylim(0, 1)  # Set y-axis for Score from 0 to 1
-    ax1.tick_params(axis='y', labelcolor='tab:blue')
-
-    # Create the second y-axis for the Background, Benign, Malicious
-    ax2 = ax1.twinx()
-    ax2.plot(df.index, df["Background"], label="Background Labels", color='tab:green', linestyle='--')
-    ax2.plot(df.index, df["Benign"], label="Benign Labels", color='tab:orange', linestyle='--')
-    ax2.plot(df.index, df["Malicious"], label="Malicious Labels", color='tab:pink', linestyle='--')
-    ax2.set_ylabel('Background, Benign, Malicious Labels', color='tab:red')
-    
-    # Set appropriate scale for right y-axis based on the data
-    ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max())
-    ax2.tick_params(axis='y', labelcolor='tab:red')
-
-    # Annotating Total labels as text on the plot
-    for i, value in enumerate(df["Total labels"]):
-        ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom')
-
-    # Adding title and legend with experiment number in title
-    plt.title(f'Training performance - Experiment {experiment_number}')
-    fig.tight_layout()
+    ax1.set_ylabel('Label Counts')
+    # No log scale here
+    ax1.set_title(f'Label Counts - Experiment {experiment_number}')
+    ax1.legend()
+    ax1.yaxis.set_major_locator(ticker.MaxNLocator(70))
+    plt.tight_layout()
+    plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_labels.png'))
+
+    # --- Plot 2: FNR and FPR (log scale) ---
+    fig2, ax2 = plt.subplots(figsize=(10, 6))
+    ax2.plot(df.index, df["FNR"], label="FNR", color='red')
+    ax2.plot(df.index, df["FPR"], label="FPR", color='blue')
+    ax2.set_xlabel('Index')
+    ax2.set_ylabel('Rate')
+    ax2.set_yscale('log')
+    ax2.set_title(f'FNR and FPR - Experiment {experiment_number}')
+    ax2.legend()
+    ax2.yaxis.set_major_locator(ticker.MaxNLocator(100))
+    plt.tight_layout()
+    plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_fnr_fpr.png'))
+
+    # --- Plot 3: Other metrics (log scale) ---
+    fig3, ax3 = plt.subplots(figsize=(12, 7))
+    metrics_rest = ["TNR", "TPR", "F1", "Precision", "Accuracy", "MCC", "Recall"]
+    colors_rest = [
+        'tab:blue', 'tab:green', 'tab:purple', 'tab:brown',
+        'tab:gray', 'tab:pink', 'tab:olive'
+    ]
+    for metric, color in zip(metrics_rest, colors_rest):
+        ax3.plot(df.index, df[metric], label=metric, color=color)
+    ax3.set_xlabel('Index')
+    ax3.set_ylabel('Metric Value')
+    ax3.set_yscale('log')
+    ax3.set_title(f'Performance Metrics (except FNR/FPR) - Experiment {experiment_number}')
+    ax3.legend()
+    ax3.yaxis.set_major_locator(ticker.MaxNLocator(50))
+    plt.tight_layout()
+    plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_other_metrics.png'))
 
-    # Move both legends further to the right
-    ax1.legend(loc='upper right', bbox_to_anchor=(1.3, 1), fontsize='small', ncol=1)
-    ax2.legend(loc='upper right', bbox_to_anchor=(1.3, 0.85), fontsize='small', ncol=1)
-
-    # Increase right margin for better readability of legend
-    plt.subplots_adjust(right=0.75)
-
-    # Save plot to the same folder as the log file with experiment number in filename
-    plt.savefig(plot_file)
-
-    # Display the plot
     plt.show()
 
+    # --- Print final values in terminal ---
+    print("\nFinal values at last training step:")
+    for col in ["Total labels", "Background", "Benign", "Malicious",
+                "FPR", "TNR", "TPR", "FNR", "F1", "Precision", "Accuracy", "MCC", "Recall"]:
+        print(f"{col}: {df[col].iloc[-1]}")
+
 def main():
-    # Parse command-line arguments
     parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.")
     parser.add_argument('-f', '--file', metavar='log_file', type=str, required=True, help="Path to the log file")
     parser.add_argument('-e', '--experiment', metavar='experiment_number', type=str, required=True, help="Experiment number to add to the filename")
-    
-    # Handle -h / --help
     args = parser.parse_args()
-
-    # Call the function to process the log file
     plot_log_data(args.file, args.experiment)
 
 if __name__ == "__main__":

From fdbbbb5e9b127117ca089dab05bd1fe49f4e5508 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 21:16:01 +0000
Subject: [PATCH 224/455] Add performance metrics to the training evaluation

---
 modules/flowmldetection/flowmldetection.py | 58 +++++++++++++++++-----
 1 file changed, 46 insertions(+), 12 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index b17a1baaf0..2c60cd4034 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -10,7 +10,16 @@
 import json
 import traceback
 import warnings
-import os
+from sklearn.metrics import classification_report, confusion_matrix
+from sklearn.metrics import (
+    confusion_matrix,
+    f1_score,
+    precision_score,
+    accuracy_score,
+    matthews_corrcoef,
+    recall_score,
+)
+
 
 from slips_files.common.parsers.config_parser import ConfigParser
 from slips_files.common.slips_utils import utils
@@ -86,21 +95,21 @@ def write_to_training_log(self, message: str):
         except Exception as e:
             self.print(f"Error writing to training log: {e}", 0, 1)
 
-    def train(self, sum_labeled_flows):
+    def train(self, sum_labeled_flows, last_number_of_flows_when_trained):
         """
         Train a model based on the flows we receive and the labels
         """
         try:
+            # Create y_flow with the label
+            y_flow = numpy.full(self.flows.shape[0], self.flows.ground_truth_label)
             # Create X_flow with the current flows minus the label
             X_flow = self.flows.drop("ground_truth_label", axis=1)
             # Drop the detailed labels
             X_flow = X_flow.drop("detailed_ground_truth_label", axis=1)
             # Drop the module_labels
             X_flow = X_flow.drop("module_labels", axis=1)
-            # Create y_flow with the label
-            y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label)
 
-            # Normalize this batch of data so far. This can get progressivle slow
+            # Normalize this batch of data so far. This can get progressively slow
             X_flow = self.scaler.fit_transform(X_flow)
 
             # Count the number of labels of each type in this epoc
@@ -120,18 +129,43 @@ def train(self, sum_labeled_flows):
                 self.print("Error while calling clf.train()")
                 self.print(traceback.format_exc(), 0, 1)
 
-            # See score so far in training
-            score = self.clf.score(X_flow, y_flow)
+            # Predict on the training data
+            y_pred = self.clf.predict(X_flow)
 
-            #self.print(f"	Training Score: {score}", 1, 0)
-            #self.print(f'    Model Parameters: {self.clf.coef_}', 1, 0)
+            # For metrics, let's focus on Malicious vs Benign (ignore Background)
+            mask = (y_flow == "Malicious") | (y_flow == "Benign")
+            y_true_bin = y_flow[mask]
+            y_pred_bin = y_pred[mask]
+
+            # Map to binary: Malicious=1, Benign=0
+            y_true_bin = numpy.where(y_true_bin == "Malicious", 1, 0)
+            y_pred_bin = numpy.where(y_pred_bin == "Malicious", 1, 0)
+
+            # Compute confusion matrix: tn, fp, fn, tp
+            tn, fp, fn, tp = confusion_matrix(y_true_bin, y_pred_bin, labels=[0,1]).ravel() if len(set(y_true_bin)) > 1 else (0,0,0,0)
+
+            # Compute metrics
+            FPR = fp / (fp + tn) if (fp + tn) > 0 else 0
+            TNR = tn / (tn + fp) if (tn + fp) > 0 else 0
+            TPR = tp / (tp + fn) if (tp + fn) > 0 else 0
+            FNR = fn / (fn + tp) if (fn + tp) > 0 else 0
+            F1 = f1_score(y_true_bin, y_pred_bin, zero_division=0)
+            PREC = precision_score(y_true_bin, y_pred_bin, zero_division=0)
+            ACCU = accuracy_score(y_true_bin, y_pred_bin)
+            MCC = matthews_corrcoef(y_true_bin, y_pred_bin) if len(set(y_true_bin)) > 1 else 0
+            RECALL = recall_score(y_true_bin, y_pred_bin, zero_division=0)
 
             # Store the models on disk
             self.store_model()
 
             # Log training information
-            self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}")
-            #self.write_to_training_log(f"Model parameters: {self.clf.coef_}")
+            self.write_to_training_log(
+                f"Total labels: {sum_labeled_flows}, "
+                f"Background: {epoch_label_counts['Background']}. "
+                f"Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. "
+                f"Metrics: FPR={FPR:.4f}, TNR={TNR:.4f}, TPR={TPR:.4f}, FNR={FNR:.4f}, "
+                f"F1={F1:.4f}, Precision={PREC:.4f}, Accuracy={ACCU:.4f}, MCC={MCC:.4f}, Recall={RECALL:.4f}."
+            )
         except Exception:
             self.print("Error in train().", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
@@ -520,7 +554,7 @@ def main(self):
                         # for pandas
                         self.process_training_flows(self.last_number_of_flows_when_trained)
                         # Train an algorithm
-                        self.train(sum_labeled_flows)
+                        self.train(sum_labeled_flows, self.last_number_of_flows_when_trained)
                         self.last_number_of_flows_when_trained = sum_labeled_flows
 
             elif self.mode == "test":

From b7b2477f4939479d223c699e240cf3f6a33d2c10 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sun, 4 May 2025 12:50:46 +0000
Subject: [PATCH 225/455] Fix experiment names

---
 modules/flowmldetection/plot_train_performance.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/modules/flowmldetection/plot_train_performance.py b/modules/flowmldetection/plot_train_performance.py
index 5212dfeeaf..304f0f4ead 100644
--- a/modules/flowmldetection/plot_train_performance.py
+++ b/modules/flowmldetection/plot_train_performance.py
@@ -52,12 +52,12 @@ def plot_log_data(file_path, experiment_number):
     ax1.plot(df.index, df["Malicious"], label="Malicious", color='magenta')
     ax1.set_xlabel('Index')
     ax1.set_ylabel('Label Counts')
-    # No log scale here
     ax1.set_title(f'Label Counts - Experiment {experiment_number}')
     ax1.legend()
     ax1.yaxis.set_major_locator(ticker.MaxNLocator(70))
+    ax1.xaxis.set_major_locator(ticker.MaxNLocator(50))
     plt.tight_layout()
-    plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_labels.png'))
+    plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_labels.png'))
 
     # --- Plot 2: FNR and FPR (log scale) ---
     fig2, ax2 = plt.subplots(figsize=(10, 6))
@@ -69,8 +69,9 @@ def plot_log_data(file_path, experiment_number):
     ax2.set_title(f'FNR and FPR - Experiment {experiment_number}')
     ax2.legend()
     ax2.yaxis.set_major_locator(ticker.MaxNLocator(100))
+    ax2.xaxis.set_major_locator(ticker.MaxNLocator(50))
     plt.tight_layout()
-    plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_fnr_fpr.png'))
+    plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_fnr_fpr.png'))
 
     # --- Plot 3: Other metrics (log scale) ---
     fig3, ax3 = plt.subplots(figsize=(12, 7))
@@ -87,8 +88,9 @@ def plot_log_data(file_path, experiment_number):
     ax3.set_title(f'Performance Metrics (except FNR/FPR) - Experiment {experiment_number}')
     ax3.legend()
     ax3.yaxis.set_major_locator(ticker.MaxNLocator(50))
+    ax3.xaxis.set_major_locator(ticker.MaxNLocator(50))
     plt.tight_layout()
-    plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_other_metrics.png'))
+    plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_other_metrics.png'))
 
     plt.show()
 

From 27b2b567ea395023664434d1bbb11819e3625776 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Mon, 5 May 2025 15:24:12 +0300
Subject: [PATCH 226/455] test_profiler: update unit tests

---
 tests/test_profiler.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/test_profiler.py b/tests/test_profiler.py
index b967c7880f..e62bdd8e74 100644
--- a/tests/test_profiler.py
+++ b/tests/test_profiler.py
@@ -481,7 +481,6 @@ def test_read_configuration(
     mock_conf.local_whitelist_path.return_value = "path/to/whitelist"
     mock_conf.ts_format.return_value = "unixtimestamp"
     mock_conf.analysis_direction.return_value = "all"
-    mock_conf.label.return_value = "malicious"
     mock_conf.get_tw_width_as_float.return_value = 1.0
     mock_conf.client_ips.return_value = ["192.168.1.1", "10.0.0.1"]
 
@@ -490,7 +489,6 @@ def test_read_configuration(
     assert profiler.local_whitelist_path == "path/to/whitelist"
     assert profiler.timeformat == "unixtimestamp"
     assert profiler.analysis_direction == "all"
-    assert profiler.label == "malicious"
     assert profiler.width == 1.0
     assert profiler.client_ips == ["192.168.1.1", "10.0.0.1"]
 

From 2c9fea74846d842820fa227c36742c1f91eb153e Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Mon, 5 May 2025 16:43:05 +0000
Subject: [PATCH 227/455] Fix that the training and testing log files were
 appended instead of rewritten

---
 modules/flowmldetection/flowmldetection.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 2c60cd4034..9a920b4e25 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -90,7 +90,7 @@ def write_to_training_log(self, message: str):
         Write a message to the training log file.
         """
         try:
-            with open(self.training_log_path, "a") as log_file:
+            with open(self.training_log_path, "w") as log_file:
                 log_file.write(message + "\n")
         except Exception as e:
             self.print(f"Error writing to training log: {e}", 0, 1)
@@ -610,8 +610,7 @@ def main(self):
 
                         testing_log_path = "./modules/flowmldetection/testing_performance.log"
                         try:
-                            with open(testing_log_path, "a") as log_file:
-                                log_file.write("Testing Performance Log Initialized\n")
+                            with open(testing_log_path, "w") as log_file:
                                 # Log the testing performance metrics
                                 log_file.write(f"TP: {self.tp}, TN: {self.tn}, FP: {self.fp}, FN: {self.fn}\n")
 

From f5b28994ab20da76a77c42ebea793d31f81d9850 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Mon, 5 May 2025 22:45:16 +0000
Subject: [PATCH 228/455] Fix an issue of storing the new log files

---
 modules/flowmldetection/flowmldetection.py | 49 ++++++++++------------
 1 file changed, 23 insertions(+), 26 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 9a920b4e25..9139066f08 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -72,11 +72,19 @@ def init(self):
         self.scaler = StandardScaler()
         self.model_path = "./modules/flowmldetection/model.bin"
         self.scaler_path = "./modules/flowmldetection/scaler.bin"
-
-        # Initialize the training log file
-        self.training_log_path = "./modules/flowmldetection/training.log"
-        with open(self.training_log_path, "w") as log_file:
-            log_file.write("Training Log Initialized\n")
+        self.init_log_file()
+    
+    def init_log_file(self):
+        """
+        Init the log file for training or testing
+        """
+        if self.mode == "train":
+            # Initialize the training log file
+            self.log_path = "./modules/flowmldetection/training.log"
+        elif self.mode == "test":
+            # Initialize the testing log file
+            self.log_path = "./modules/flowmldetection/testing.log"
+        self.log_file = open(self.log_path, "w")
 
     def read_configuration(self):
         conf = ConfigParser()
@@ -85,15 +93,14 @@ def read_configuration(self):
         # in case the flows do not have a label themselves
         self.label = conf.label()
 
-    def write_to_training_log(self, message: str):
+    def write_to_log(self, message: str):
         """
-        Write a message to the training log file.
+        Write a message to the local log file.
         """
         try:
-            with open(self.training_log_path, "w") as log_file:
-                log_file.write(message + "\n")
+            self.log_file.write(message + "\n")
         except Exception as e:
-            self.print(f"Error writing to training log: {e}", 0, 1)
+            self.print(f"Error writing to log: {e}", 0, 1)
 
     def train(self, sum_labeled_flows, last_number_of_flows_when_trained):
         """
@@ -159,7 +166,7 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained):
             self.store_model()
 
             # Log training information
-            self.write_to_training_log(
+            self.write_to_log(
                 f"Total labels: {sum_labeled_flows}, "
                 f"Background: {epoch_label_counts['Background']}. "
                 f"Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. "
@@ -169,7 +176,7 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained):
         except Exception:
             self.print("Error in train().", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-            self.write_to_training_log("Error occurred during training.")
+            self.write_to_log("Error occurred during training.")
 
     def process_features(self, dataset):
         """
@@ -597,7 +604,6 @@ def main(self):
                         if not hasattr(self, 'fn'):
                             self.fn = 0
 
-
                         # Update counters based on predictions and labels
                         if pred[0] == "Malicious" and original_label == "Malicious":
                             self.tp += 1
@@ -605,19 +611,10 @@ def main(self):
                             self.tn += 1
                         elif pred[0] == "Malicious" and original_label == "Benign":
                             self.fp += 1
+                            self.write_to_log(f"False Positive Flow: {self.flow}")
                         elif pred[0] == "Benign" and original_label == "Malicious":
                             self.fn += 1
+                            self.write_to_log(f"False Negative Flow: {self.flow}")
 
-                        testing_log_path = "./modules/flowmldetection/testing_performance.log"
-                        try:
-                            with open(testing_log_path, "w") as log_file:
-                                # Log the testing performance metrics
-                                log_file.write(f"TP: {self.tp}, TN: {self.tn}, FP: {self.fp}, FN: {self.fn}\n")
-
-                                # Log the original flow for false positives and false negatives
-                                if pred[0] == "Malicious" and original_label == "Benign":
-                                    log_file.write(f"False Positive Flow: {self.flow}\n")
-                                elif pred[0] == "Benign" and original_label == "Malicious":
-                                    log_file.write(f"False Negative Flow: {self.flow}\n")
-                        except Exception as e:
-                            self.print(f"Error initializing testing performance log: {e}", 0, 1)
\ No newline at end of file
+                        # Log the testing performance metrics
+                        self.write_to_log(f"TP: {self.tp}, TN: {self.tn}, FP: {self.fp}, FN: {self.fn}")
\ No newline at end of file

From 1e6d0d18f18e03a35ba414072ad58c4d033b4383 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Sat, 10 May 2025 16:21:08 +0300
Subject: [PATCH 229/455] enable/ disable training and testing.log with a param
 in the config file

---
 .secrets.baseline                           |   6 +-
 config/slips.yaml                           |   3 +
 modules/flowmldetection/flowmldetection.py  | 140 +++++++++++++-------
 modules/riskiq/riskiq.py                    |   2 +-
 modules/update_manager/update_manager.py    |   2 +-
 slips_files/common/parsers/config_parser.py |   7 +-
 6 files changed, 109 insertions(+), 51 deletions(-)

diff --git a/.secrets.baseline b/.secrets.baseline
index 37fe2abcba..aa5615109c 100644
--- a/.secrets.baseline
+++ b/.secrets.baseline
@@ -149,14 +149,14 @@
         "filename": "config/slips.yaml",
         "hashed_secret": "4cac50cee3ad8e462728e711eac3e670753d5016",
         "is_verified": false,
-        "line_number": 224
+        "line_number": 226
       },
       {
         "type": "Secret Keyword",
         "filename": "config/slips.yaml",
         "hashed_secret": "d033e22ae348aeb5660fc2140aec35850c4da997",
         "is_verified": false,
-        "line_number": 394
+        "line_number": 396
       }
     ],
     "dataset/test14-malicious-zeek-dir/http.log": [
@@ -7192,5 +7192,5 @@
       }
     ]
   },
-  "generated_at": "2025-02-13T22:47:52Z"
+  "generated_at": "2025-05-10T13:18:46Z"
 }
diff --git a/config/slips.yaml b/config/slips.yaml
index 8736eaf511..dabb388c09 100644
--- a/config/slips.yaml
+++ b/config/slips.yaml
@@ -214,6 +214,9 @@ flowmldetection:
   # You should have trained at least once with 'Normal' data and once with
   # 'Malicious' data in order for the test to work.
   mode: test
+  # creates an extra log file called training.log/testing.log in the
+  # ouptput dir with performance metrics depending on the mode.
+  create_performance_metrics_log_files: False
 
 #############################
 virustotal:
diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 9139066f08..2a515d0cfa 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -10,9 +10,8 @@
 import json
 import traceback
 import warnings
-from sklearn.metrics import classification_report, confusion_matrix
+from sklearn.metrics import confusion_matrix
 from sklearn.metrics import (
-    confusion_matrix,
     f1_score,
     precision_score,
     accuracy_score,
@@ -37,6 +36,7 @@
     Method,
 )
 
+
 # This horrible hack is only to stop sklearn from printing those warnings
 def warn(*args, **kwargs):
     pass
@@ -73,7 +73,7 @@ def init(self):
         self.model_path = "./modules/flowmldetection/model.bin"
         self.scaler_path = "./modules/flowmldetection/scaler.bin"
         self.init_log_file()
-    
+
     def init_log_file(self):
         """
         Init the log file for training or testing
@@ -92,11 +92,16 @@ def read_configuration(self):
         # This is the global label in the configuration,
         # in case the flows do not have a label themselves
         self.label = conf.label()
+        self.enable_logs: bool = conf.create_performance_metrics_log_files()
 
     def write_to_log(self, message: str):
         """
-        Write a message to the local log file.
+        Write a message to the local log file if
+        create_performance_metrics_log_files is enabled in slips.yaml
         """
+        if not self.enable_logs:
+            return
+
         try:
             self.log_file.write(message + "\n")
         except Exception as e:
@@ -108,7 +113,9 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained):
         """
         try:
             # Create y_flow with the label
-            y_flow = numpy.full(self.flows.shape[0], self.flows.ground_truth_label)
+            y_flow = numpy.full(
+                self.flows.shape[0], self.flows.ground_truth_label
+            )
             # Create X_flow with the current flows minus the label
             X_flow = self.flows.drop("ground_truth_label", axis=1)
             # Drop the detailed labels
@@ -130,7 +137,9 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained):
             try:
                 # Online incremental learning
                 self.clf.partial_fit(
-                    X_flow, y_flow, classes=["Background", "Malicious", "Benign"]
+                    X_flow,
+                    y_flow,
+                    classes=["Background", "Malicious", "Benign"],
                 )
             except Exception:
                 self.print("Error while calling clf.train()")
@@ -149,7 +158,11 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained):
             y_pred_bin = numpy.where(y_pred_bin == "Malicious", 1, 0)
 
             # Compute confusion matrix: tn, fp, fn, tp
-            tn, fp, fn, tp = confusion_matrix(y_true_bin, y_pred_bin, labels=[0,1]).ravel() if len(set(y_true_bin)) > 1 else (0,0,0,0)
+            tn, fp, fn, tp = (
+                confusion_matrix(y_true_bin, y_pred_bin, labels=[0, 1]).ravel()
+                if len(set(y_true_bin)) > 1
+                else (0, 0, 0, 0)
+            )
 
             # Compute metrics
             FPR = fp / (fp + tn) if (fp + tn) > 0 else 0
@@ -159,7 +172,11 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained):
             F1 = f1_score(y_true_bin, y_pred_bin, zero_division=0)
             PREC = precision_score(y_true_bin, y_pred_bin, zero_division=0)
             ACCU = accuracy_score(y_true_bin, y_pred_bin)
-            MCC = matthews_corrcoef(y_true_bin, y_pred_bin) if len(set(y_true_bin)) > 1 else 0
+            MCC = (
+                matthews_corrcoef(y_true_bin, y_pred_bin)
+                if len(set(y_true_bin)) > 1
+                else 0
+            )
             RECALL = recall_score(y_true_bin, y_pred_bin, zero_division=0)
 
             # Store the models on disk
@@ -189,7 +206,8 @@ def process_features(self, dataset):
             for proto in to_discard:
                 dataset = dataset[dataset.proto != proto]
 
-            # If te proto is in the list to delete and there is only one flow, then the dataset will be empty
+            # If te proto is in the list to delete and there is only one flow,
+            # then the dataset will be empty
             if dataset.empty:
                 # DataFrame is empty now, so return empty
                 return dataset
@@ -295,7 +313,9 @@ def process_training_flows(self, last_number_of_flows_when_trained):
             if last_number_of_flows_when_trained is None:
                 last_number_of_flows_when_trained = 0
             else:
-                last_number_of_flows_when_trained = int(last_number_of_flows_when_trained)
+                last_number_of_flows_when_trained = int(
+                    last_number_of_flows_when_trained
+                )
 
             # We get all the flows so far
             flows = self.db.get_all_flows()
@@ -399,21 +419,21 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
             ]
             # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes.
             # Error
-            ''' [Flow ML Detection] Error in detect() while processing                                                                                                                                                                                                                         
-            dur proto  sport dport  state  pkts  spkts  dpkts  bytes  sbytes  dbytes  allbytes                                                                                                                                                                                    
-            0  63.822830     0  56119   981    0.0    15     15      0   8764    1887       0      1887                                                                                                                                                                                    
-            The feature names should match those that were passed during fit.                                                                                                                                                                                                              
-            Feature names unseen at fit time:                                                                                                                                                                                                                                              
-            - bytes     
-            '''
+            """ [Flow ML Detection] Error in detect() while processing
+            dur proto  sport dport  state  pkts  spkts  dpkts  bytes  sbytes  dbytes  allbytes
+            0  63.822830     0  56119   981    0.0    15     15      0   8764    1887       0      1887
+            The feature names should match those that were passed during fit.
+            Feature names unseen at fit time:
+            - bytes
+            """
 
             # IF we delete here the filed bytes the error is
-            # [Flow ML Detection] Error in detect() while processing 
-            # dur proto sport dport  state  pkts  spkts  dpkts  sbytes  dbytes allbytes                                                                                                                                                             
-            # 0  63.822830     0  56120   980    0.0    15     15      0    1887       0      1887                                                                                                                                                             
-            # The feature names should match those that were passed during fit.                                                                                                                                                                                
-            # Feature names must be in the same order as they were in fit.                                                                                                                                                                                     
-                                                                                                                                                                                                                                                 
+            # [Flow ML Detection] Error in detect() while processing
+            # dur proto sport dport  state  pkts  spkts  dpkts  sbytes  dbytes allbytes
+            # 0  63.822830     0  56120   980    0.0    15     15      0    1887       0      1887
+            # The feature names should match those that were passed during fit.
+            # Feature names must be in the same order as they were in fit.
+
             for field in fields_to_drop:
                 try:
                     x_flow = x_flow.drop(field, axis=1)
@@ -540,17 +560,19 @@ def main(self):
                 labels = self.db.get_labels()
                 sum_labeled_flows = sum(i[1] for i in labels)
 
-                # The min labels to retrain is the min number of flows 
+                # The min labels to retrain is the min number of flows
                 # we should have seen so far in this capture to start training
                 # This is so we dont _start_ training with only 1 flow
 
-                # Once we are over the start minimum, the second condition is 
+                # Once we are over the start minimum, the second condition is
                 # to force to retrain every a minimum_labels_to_retrain number
                 # of flows. So we dont retrain every 1 flow.
-                if (
-                    sum_labeled_flows >= self.minimum_labels_to_start_train
-                ):
-                    if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain):
+                if sum_labeled_flows >= self.minimum_labels_to_start_train:
+                    if (
+                        sum_labeled_flows
+                        - self.last_number_of_flows_when_trained
+                        >= self.minimum_labels_to_retrain
+                    ):
                         # So for example we retrain every 50 labels and only when
                         # we have at least 50 labels
                         self.print(
@@ -559,10 +581,17 @@ def main(self):
                         )
                         # Process all flows in the DB and make them ready
                         # for pandas
-                        self.process_training_flows(self.last_number_of_flows_when_trained)
+                        self.process_training_flows(
+                            self.last_number_of_flows_when_trained
+                        )
                         # Train an algorithm
-                        self.train(sum_labeled_flows, self.last_number_of_flows_when_trained)
-                        self.last_number_of_flows_when_trained = sum_labeled_flows
+                        self.train(
+                            sum_labeled_flows,
+                            self.last_number_of_flows_when_trained,
+                        )
+                        self.last_number_of_flows_when_trained = (
+                            sum_labeled_flows
+                        )
 
             elif self.mode == "test":
                 # We are testing, which means using the model to detect
@@ -570,7 +599,9 @@ def main(self):
                 # After processing the flow, it may happen that we
                 # delete icmp/arp/etc so the dataframe can be empty
                 if processed_flow is not None and not processed_flow.empty:
-                    original_label = processed_flow["ground_truth_label"].iloc[0]
+                    original_label = processed_flow["ground_truth_label"].iloc[
+                        0
+                    ]
                     # Predict
                     pred: numpy.ndarray = self.detect(processed_flow)
                     if not pred:
@@ -591,30 +622,49 @@ def main(self):
                             2,
                         )
 
-                    # So you can disable this code easily. Since it is used only for evaluating a testing
+                    # So you can disable this code easily. Since it is used
+                    # only for evaluating a testing
                     log_testing_data = True
                     if log_testing_data:
                         # Initialize counters if not already done
-                        if not hasattr(self, 'tp'):
+                        if not hasattr(self, "tp"):
                             self.tp = 0
-                        if not hasattr(self, 'tn'):
+                        if not hasattr(self, "tn"):
                             self.tn = 0
-                        if not hasattr(self, 'fp'):
+                        if not hasattr(self, "fp"):
                             self.fp = 0
-                        if not hasattr(self, 'fn'):
+                        if not hasattr(self, "fn"):
                             self.fn = 0
 
                         # Update counters based on predictions and labels
-                        if pred[0] == "Malicious" and original_label == "Malicious":
+                        if (
+                            pred[0] == "Malicious"
+                            and original_label == "Malicious"
+                        ):
                             self.tp += 1
-                        elif pred[0] == "Benign" and original_label == "Benign":
+                        elif (
+                            pred[0] == "Benign" and original_label == "Benign"
+                        ):
                             self.tn += 1
-                        elif pred[0] == "Malicious" and original_label == "Benign":
+                        elif (
+                            pred[0] == "Malicious"
+                            and original_label == "Benign"
+                        ):
                             self.fp += 1
-                            self.write_to_log(f"False Positive Flow: {self.flow}")
-                        elif pred[0] == "Benign" and original_label == "Malicious":
+                            self.write_to_log(
+                                f"False Positive Flow: {self.flow}"
+                            )
+                        elif (
+                            pred[0] == "Benign"
+                            and original_label == "Malicious"
+                        ):
                             self.fn += 1
-                            self.write_to_log(f"False Negative Flow: {self.flow}")
+                            self.write_to_log(
+                                f"False Negative Flow: {self.flow}"
+                            )
 
                         # Log the testing performance metrics
-                        self.write_to_log(f"TP: {self.tp}, TN: {self.tn}, FP: {self.fp}, FN: {self.fn}")
\ No newline at end of file
+                        self.write_to_log(
+                            f"TP: {self.tp}, TN: {self.tn},"
+                            f" FP: {self.fp}, FN: {self.fn}"
+                        )
diff --git a/modules/riskiq/riskiq.py b/modules/riskiq/riskiq.py
index 5abf2ddb19..7b5653997e 100644
--- a/modules/riskiq/riskiq.py
+++ b/modules/riskiq/riskiq.py
@@ -25,7 +25,7 @@ def init(self):
 
     def read_configuration(self):
         conf = ConfigParser()
-        risk_iq_credentials_path = conf.RiskIQ_credentials_path()
+        risk_iq_credentials_path = conf.risk_iq_credentials_path()
         try:
             with open(risk_iq_credentials_path, "r") as f:
                 self.riskiq_email = f.readline().replace("\n", "")
diff --git a/modules/update_manager/update_manager.py b/modules/update_manager/update_manager.py
index c6bf0013eb..2de0abf8eb 100644
--- a/modules/update_manager/update_manager.py
+++ b/modules/update_manager/update_manager.py
@@ -119,7 +119,7 @@ def read_riskiq_creds(risk_iq_credentials_path):
         self.ssl_feeds_path = conf.ssl_feeds()
         self.ssl_feeds = self.get_feed_details(self.ssl_feeds_path)
 
-        risk_iq_credentials_path = conf.RiskIQ_credentials_path()
+        risk_iq_credentials_path = conf.risk_iq_credentials_path()
         read_riskiq_creds(risk_iq_credentials_path)
         self.riskiq_update_period = conf.riskiq_update_period()
 
diff --git a/slips_files/common/parsers/config_parser.py b/slips_files/common/parsers/config_parser.py
index 40f1b044bc..e208f78816 100644
--- a/slips_files/common/parsers/config_parser.py
+++ b/slips_files/common/parsers/config_parser.py
@@ -418,7 +418,12 @@ def data_exfiltration_threshold(self):
     def get_ml_mode(self):
         return self.read_configuration("flowmldetection", "mode", "test")
 
-    def RiskIQ_credentials_path(self):
+    def create_performance_metrics_log_files(self) -> bool:
+        return self.read_configuration(
+            "flowmldetection", "create_performance_metrics_log_files", False
+        )
+
+    def risk_iq_credentials_path(self):
         return self.read_configuration(
             "threatintelligence", "RiskIQ_credentials_path", ""
         )

From 65206b61a2009dfebd8bdc938ffe0a23fd90c943 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Sat, 10 May 2025 16:23:58 +0300
Subject: [PATCH 230/455] don't create an empty logfile when
 create_performance_metrics_log_files is set to false

---
 modules/flowmldetection/flowmldetection.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 2a515d0cfa..9305197d3e 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -78,6 +78,9 @@ def init_log_file(self):
         """
         Init the log file for training or testing
         """
+        if not self.enable_logs:
+            return
+
         if self.mode == "train":
             # Initialize the training log file
             self.log_path = "./modules/flowmldetection/training.log"

From cdbf9d386f4c4063bbf237e952bbadafef307d7f Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Sat, 10 May 2025 16:29:30 +0300
Subject: [PATCH 231/455] when enabled, create testing.log or training.log in
 the current output dir

---
 modules/flowmldetection/flowmldetection.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 9305197d3e..f618195bce 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -3,6 +3,7 @@
 
 # SPDX-License-Identifier: GPL-2.0-only
 import numpy
+import os
 from sklearn.linear_model import SGDClassifier
 from sklearn.preprocessing import StandardScaler
 import pickle
@@ -83,10 +84,10 @@ def init_log_file(self):
 
         if self.mode == "train":
             # Initialize the training log file
-            self.log_path = "./modules/flowmldetection/training.log"
+            self.log_path = os.path.join(self.output_dir, "training.log")
         elif self.mode == "test":
             # Initialize the testing log file
-            self.log_path = "./modules/flowmldetection/testing.log"
+            self.log_path = os.path.join(self.output_dir, "testing.log")
         self.log_file = open(self.log_path, "w")
 
     def read_configuration(self):

From 68e588ab828fbd2da1b0251e3e3c2fd00f736796 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Sat, 10 May 2025 16:43:32 +0300
Subject: [PATCH 232/455] Add an enum called Label with Malicious, Benign
 and Background so the labels are unified.

---
 modules/flowmldetection/flowmldetection.py | 65 +++++++++++-----------
 slips_files/core/structures/labels.py      | 11 ++++
 2 files changed, 43 insertions(+), 33 deletions(-)
 create mode 100644 slips_files/core/structures/labels.py

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index f618195bce..e828058ee4 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -19,11 +19,10 @@
     matthews_corrcoef,
     recall_score,
 )
-
-
 from slips_files.common.parsers.config_parser import ConfigParser
 from slips_files.common.slips_utils import utils
 from slips_files.common.abstracts.module import IModule
+from slips_files.core.structures.labels import Label
 from slips_files.core.structures.evidence import (
     Evidence,
     ProfileID,
@@ -45,6 +44,10 @@ def warn(*args, **kwargs):
 
 warnings.warn = warn
 
+BACKGROUND = Label.BACKGROUND.name
+BENIGN = Label.BENIGN.name
+MALICIOUS = Label.MALICIOUS.name
+
 
 class FlowMLDetection(IModule):
     # Name: short name of the module. Do not use spaces
@@ -132,9 +135,9 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained):
 
             # Count the number of labels of each type in this epoc
             epoch_label_counts = {
-                "Background": (y_flow == "Background").sum(),
-                "Malicious": (y_flow == "Malicious").sum(),
-                "Benign": (y_flow == "Benign").sum(),
+                BACKGROUND: (y_flow == BACKGROUND).sum(),
+                MALICIOUS: (y_flow == MALICIOUS).sum(),
+                BENIGN: (y_flow == BENIGN).sum(),
             }
 
             # Train
@@ -143,7 +146,7 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained):
                 self.clf.partial_fit(
                     X_flow,
                     y_flow,
-                    classes=["Background", "Malicious", "Benign"],
+                    classes=[BACKGROUND, MALICIOUS, BENIGN],
                 )
             except Exception:
                 self.print("Error while calling clf.train()")
@@ -153,13 +156,13 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained):
             y_pred = self.clf.predict(X_flow)
 
             # For metrics, let's focus on Malicious vs Benign (ignore Background)
-            mask = (y_flow == "Malicious") | (y_flow == "Benign")
+            mask = (y_flow == MALICIOUS) | (y_flow == BENIGN)
             y_true_bin = y_flow[mask]
             y_pred_bin = y_pred[mask]
 
             # Map to binary: Malicious=1, Benign=0
-            y_true_bin = numpy.where(y_true_bin == "Malicious", 1, 0)
-            y_pred_bin = numpy.where(y_pred_bin == "Malicious", 1, 0)
+            y_true_bin = numpy.where(y_true_bin == MALICIOUS, 1, 0)
+            y_pred_bin = numpy.where(y_pred_bin == MALICIOUS, 1, 0)
 
             # Compute confusion matrix: tn, fp, fn, tp
             tn, fp, fn, tp = (
@@ -190,9 +193,12 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained):
             self.write_to_log(
                 f"Total labels: {sum_labeled_flows}, "
                 f"Background: {epoch_label_counts['Background']}. "
-                f"Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. "
-                f"Metrics: FPR={FPR:.4f}, TNR={TNR:.4f}, TPR={TPR:.4f}, FNR={FNR:.4f}, "
-                f"F1={F1:.4f}, Precision={PREC:.4f}, Accuracy={ACCU:.4f}, MCC={MCC:.4f}, Recall={RECALL:.4f}."
+                f"Benign: {epoch_label_counts['Benign']}. "
+                f"Malicious: {epoch_label_counts[MALICIOUS]}. "
+                f"Metrics: FPR={FPR:.4f}, TNR={TNR:.4f}, "
+                f"TPR={TPR:.4f}, FNR={FNR:.4f}, "
+                f"F1={F1:.4f}, Precision={PREC:.4f}, "
+                f"Accuracy={ACCU:.4f}, MCC={MCC:.4f}, Recall={RECALL:.4f}."
             )
         except Exception:
             self.print("Error in train().", 0, 1)
@@ -345,9 +351,9 @@ def process_training_flows(self, last_number_of_flows_when_trained):
                         "sbytes": 25517,
                         "dbytes": 17247,
                         "appproto": "ssl",
-                        "ground_truth_label": "Malicious",
+                        "ground_truth_label": MALICIOUS,
                         "module_labels": {
-                            "flowalerts-long-connection": "Malicious"
+                            "flowalerts-long-connection": MALICIOUS
                         },
                     }
                 )
@@ -366,9 +372,9 @@ def process_training_flows(self, last_number_of_flows_when_trained):
                         "sbytes": 100,
                         "dbytes": 67596,
                         "appproto": "http",
-                        "ground_truth_label": "Benign",
+                        "ground_truth_label": BENIGN,
                         "module_labels": {
-                            "flowalerts-long-connection": "Benign"
+                            "flowalerts-long-connection": BENIGN
                         },
                     }
                 )
@@ -421,7 +427,8 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
                 "ground_truth_label",
                 "detailed_ground_truth_label",
             ]
-            # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes.
+            # For argus binetflows this fails because ther is a field calle
+            # bytes that was not in other flows. It should be called allbytes.
             # Error
             """ [Flow ML Detection] Error in detect() while processing
             dur proto  sport dport  state  pkts  spkts  dpkts  bytes  sbytes  dbytes  allbytes
@@ -546,8 +553,8 @@ def main(self):
             self.twid = msg["twid"]
             self.profileid = msg["profileid"]
             self.flow = msg["flow"]
-            # These following extra fields are expected in testing. update the original
-            # flow dict to have them
+            # These following extra fields are expected in testing.
+            # update the original flow dict to have them
             self.flow.update(
                 {
                     "state": msg["interpreted_state"],
@@ -612,7 +619,7 @@ def main(self):
                         # an error occurred
                         return
 
-                    if pred[0] == "Malicious":
+                    if pred[0] == MALICIOUS:
                         # Generate an alert
                         self.set_evidence_malicious_flow(self.flow, self.twid)
                         self.print(
@@ -642,26 +649,18 @@ def main(self):
 
                         # Update counters based on predictions and labels
                         if (
-                            pred[0] == "Malicious"
-                            and original_label == "Malicious"
+                            pred[0] == MALICIOUS
+                            and original_label == MALICIOUS
                         ):
                             self.tp += 1
-                        elif (
-                            pred[0] == "Benign" and original_label == "Benign"
-                        ):
+                        elif pred[0] == BENIGN and original_label == BENIGN:
                             self.tn += 1
-                        elif (
-                            pred[0] == "Malicious"
-                            and original_label == "Benign"
-                        ):
+                        elif pred[0] == MALICIOUS and original_label == BENIGN:
                             self.fp += 1
                             self.write_to_log(
                                 f"False Positive Flow: {self.flow}"
                             )
-                        elif (
-                            pred[0] == "Benign"
-                            and original_label == "Malicious"
-                        ):
+                        elif pred[0] == BENIGN and original_label == MALICIOUS:
                             self.fn += 1
                             self.write_to_log(
                                 f"False Negative Flow: {self.flow}"
diff --git a/slips_files/core/structures/labels.py b/slips_files/core/structures/labels.py
new file mode 100644
index 0000000000..b1dc64234e
--- /dev/null
+++ b/slips_files/core/structures/labels.py
@@ -0,0 +1,11 @@
+from enum import Enum
+
+
+class Label(Enum):
+    """
+    label of flows should be one of the following
+    """
+
+    MALICIOUS = "Malicious"
+    BENIGN = "Benign"
+    BACKGROUND = "Background"

From 705f63d56c98f536e52a1b0cd0c02836c14aa4b4 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 233/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 154 ++++++++++++++++++++-
 1 file changed, 150 insertions(+), 4 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index e44ac83f4d..16b67e9038 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -10,6 +10,7 @@
 import json
 import traceback
 import warnings
+import sys
 
 from slips_files.common.parsers.config_parser import ConfigParser
 from slips_files.common.slips_utils import utils
@@ -120,6 +121,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """
@@ -132,7 +268,7 @@ def process_features(self, dataset):
             for proto in to_discard:
                 dataset = dataset[dataset.proto != proto]
 
-            # For now, discard the ports
+            # For now, discard these
             to_drop = [
                 "appproto",
                 "daddr",
@@ -155,15 +291,25 @@ def process_features(self, dataset):
                 except (ValueError, KeyError):
                     pass
 
+            # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
+            # So transform here
+            #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
+            dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
+
+            #dataset.state = new_state_column
+
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(
-                r"(^.*NotEstablished.*$)", "0", regex=True
+                r"(^.*Not Established.*$)", "0", regex=True
             )
             dataset.state = dataset.state.str.replace(
                 r"(^.*Established.*$)", "1", regex=True
             )
-            # Convert proto to categorical. For now we only have few states,
-            # so we can hardcode...
+
+            # Convert categories to floats
+            dataset.state = dataset.state.astype("float64")
+
+            # Convert proto to categorical. For now we only have few states, so we can hardcode...
             # We dont use the data to create categories because in testing mode
             # we dont see all the protocols
             # Also we dont store the Categorizer because the user can retrain

From b690ea70e919e7ca95227684396e811a349dd771 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:35:21 +0300
Subject: [PATCH 234/455] state_handler: split get_final_state_from_flags()
 into smaller functions

---
 slips_files/common/state_handler.py | 179 ++++++++++++++++++++++++++++
 1 file changed, 179 insertions(+)
 create mode 100644 slips_files/common/state_handler.py

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
new file mode 100644
index 0000000000..b671a09a28
--- /dev/null
+++ b/slips_files/common/state_handler.py
@@ -0,0 +1,179 @@
+from typing import Optional
+import sys
+import traceback
+
+
+def check_suricata_states(state) -> Optional[str]:
+    """
+    There are different states in which a flow can be.
+    Suricata distinguishes three flow-states for TCP and two for
+     UDP. For TCP,
+    these are: New, Established and Closed,for UDP only new and
+    established.
+    For each of these states Suricata can employ different timeouts.
+    """
+    if "new" in state or "established" in state:
+        return "Established"
+    elif "closed" in state:
+        return "Not Established"
+
+
+def check_zeek_states(state) -> Optional[str]:
+    # We have varius type of states depending on the type of flow.
+    # For Zeek
+    if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+        return "Not Established"
+    elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+        return "Established"
+
+
+def check_argus_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    suf = state.split("_")[1]
+    if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+        """
+        Examples:
+        SA_SA
+        SR_SA
+        FSRA_SA
+        SPA_SPA
+        SRA_SPA
+        FSA_FSA
+        FSA_FSPA
+        SAEC_SPA
+        SRPA_SPA
+        FSPA_SPA
+        FSRPA_SPA
+        FSPA_FSPA
+        FSRA_FSPA
+        SRAEC_SPA
+        FSPA_FSRPA
+        FSAEC_FSPA
+        FSRPA_FSPA
+        SRPAEC_SPA
+        FSPAEC_FSPA
+        SRPAEC_FSRPA
+        """
+        return "Established"
+    elif "PA" in pre and "PA" in suf:
+        # Tipical flow that was reported in the middle
+        """
+        Examples:
+        PA_PA
+        FPA_FPA
+        """
+        return "Established"
+    elif "ECO" in pre:
+        return "ICMP Echo"
+    elif "ECR" in pre:
+        return "ICMP Reply"
+    elif "URH" in pre:
+        return "ICMP Host Unreachable"
+    elif "URP" in pre:
+        return "ICMP Port Unreachable"
+    else:
+        """
+        Examples:
+        S_RA
+        S_R
+        A_R
+        S_SA
+        SR_SA
+        FA_FA
+        SR_RA
+        SEC_RA
+        """
+        return "Not Established"
+
+
+def check_tcp_states(state, pkts) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "EST" in pre:
+        # TCP
+        return "Established"
+    elif "RST" in pre:
+        # TCP. When -z B is not used in argus, states are single words.
+        # Most connections are reseted when finished and therefore are
+        # established
+        # It can happen that is reseted being not established, but we
+        # can't tell without -z b.
+        # So we use as heuristic the amount of packets. If <=3, then is
+        # not established because the OS retries 3 times.
+        return "Not Established" if int(pkts) <= 3 else "Established"
+    elif "FIN" in pre:
+        # TCP. When -z B is not used in argus, states are single words.
+        # Most connections are finished with FIN when finished and
+        # therefore are established
+        # It can happen that is finished being not established, but we
+        # can't tell without -z b.
+        # So we use as heuristic the amount of packets. If <=3, then is
+        # not established because the OS retries 3 times.
+        return "Not Established" if int(pkts) <= 3 else "Established"
+    else:
+        """
+        Examples:
+        S_
+        FA_
+        PA_
+        FSA_
+        SEC_
+        SRPA_
+        """
+        return "Not Established"
+
+
+def check_udp_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "CON" in pre:
+        # UDP
+        return "Established"
+    elif "INT" in pre:
+        # UDP trying to connect, NOT preciselly not established but also
+        # NOT 'Established'. So we considered not established because there
+        # is no confirmation of what happened.
+        return "Not Established"
+
+
+def check_icmp_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "ECO" in pre:
+        # ICMP
+        return "Established"
+    elif "UNK" in pre:
+        # ICMP6 unknown upper layer
+        return "Established"
+
+
+def get_final_state_from_flags(self, state, pkts) -> str:
+    """
+    Analyze the flags given and return a summary of the state.
+    Should work with Argus and Bro flags
+    We receive the pakets to distinguish some Reset connections
+    """
+    try:
+        if state := check_suricata_states(state):
+            return state
+        if state := check_zeek_states(state):
+            return state
+        if state := check_argus_states(state):
+            return state
+    except IndexError:
+        # suf does not exist, which means that this is some ICMP or
+        # no response was sent for UDP or TCP
+        if state := check_icmp_states(state):
+            return state
+        if state := check_udp_states(state):
+            return state
+        if state := check_tcp_states(state, pkts):
+            return state
+
+        return "Not Established"
+
+    except Exception:
+        exception_line = sys.exc_info()[2].tb_lineno
+        self.print(
+            f"Error in get_final_state_from_flags() " f"line {exception_line}",
+            0,
+            1,
+        )
+        self.print(traceback.format_exc(), 0, 1)

From 00415c7c2bdf9900eee91682602db8ff609ec19d Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:48:16 +0300
Subject: [PATCH 235/455] state_handler: refactor get_final_state_from_flags()

---
 slips_files/common/state_handler.py | 67 +++++++++++++----------------
 1 file changed, 29 insertions(+), 38 deletions(-)

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
index b671a09a28..d0a05115bd 100644
--- a/slips_files/common/state_handler.py
+++ b/slips_files/common/state_handler.py
@@ -1,9 +1,7 @@
 from typing import Optional
-import sys
-import traceback
 
 
-def check_suricata_states(state) -> Optional[str]:
+def interpret_suricata_states(state) -> Optional[str]:
     """
     There are different states in which a flow can be.
     Suricata distinguishes three flow-states for TCP and two for
@@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_zeek_states(state) -> Optional[str]:
+def interpret_zeek_states(state) -> Optional[str]:
     # We have varius type of states depending on the type of flow.
     # For Zeek
     if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
@@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]:
         return "Established"
 
 
-def check_argus_states(state) -> Optional[str]:
+def interpret_argus_states(state) -> Optional[str]:
     pre = state.split("_")[0]
-    suf = state.split("_")[1]
+    try:
+        suf = state.split("_")[1]
+    except IndexError:
+        return
+
     if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
         """
         Examples:
@@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_tcp_states(state, pkts) -> Optional[str]:
+def interpret_tcp_states(state, pkts) -> Optional[str]:
     pre = state.split("_")[0]
     if "EST" in pre:
         # TCP
@@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]:
         return "Not Established"
 
 
-def check_udp_states(state) -> Optional[str]:
+def interpret_udp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "CON" in pre:
         # UDP
@@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_icmp_states(state) -> Optional[str]:
+def interpret_icmp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "ECO" in pre:
         # ICMP
@@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]:
         return "Established"
 
 
-def get_final_state_from_flags(self, state, pkts) -> str:
+def get_final_state_from_flags(state, pkts) -> str:
     """
-    Analyze the flags given and return a summary of the state.
-    Should work with Argus and Bro flags
-    We receive the pakets to distinguish some Reset connections
+    Converts the original flags from the flow, to a state that slips
+    understands
+    Works with Argus, suricata, and Bro flags
+    We receive the packets to distinguish some Reset connections
     """
-    try:
-        if state := check_suricata_states(state):
-            return state
-        if state := check_zeek_states(state):
-            return state
-        if state := check_argus_states(state):
-            return state
-    except IndexError:
-        # suf does not exist, which means that this is some ICMP or
-        # no response was sent for UDP or TCP
-        if state := check_icmp_states(state):
-            return state
-        if state := check_udp_states(state):
-            return state
-        if state := check_tcp_states(state, pkts):
-            return state
 
-        return "Not Established"
+    for interpreter in (
+        interpret_suricata_states,
+        interpret_zeek_states,
+        interpret_argus_states,
+        interpret_icmp_states,
+        interpret_udp_states,
+    ):
+        if interpreted_state := interpreter(state):
+            return interpreted_state
+
+    if interpreted_state := interpret_tcp_states(state, pkts):
+        return interpreted_state
 
-    except Exception:
-        exception_line = sys.exc_info()[2].tb_lineno
-        self.print(
-            f"Error in get_final_state_from_flags() " f"line {exception_line}",
-            0,
-            1,
-        )
-        self.print(traceback.format_exc(), 0, 1)
+    return "Not Established"

From f2de4e978cc9755565a87f168ee6d7c2cbd4abba Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 236/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py    | 150 ++----------------
 slips_files/core/database/database_manager.py |   3 -
 2 files changed, 10 insertions(+), 143 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 16b67e9038..3379f5077f 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """
@@ -291,12 +156,17 @@ def process_features(self, dataset):
                 except (ValueError, KeyError):
                     pass
 
-            # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
+            # When flows are read from Slips sqlite,
+            # the state is not transformed to 'Established' or
+            # 'Not Established', it is still 'S0' and others
             # So transform here
-            #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
-            dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
-
-            #dataset.state = new_state_column
+            dataset["state"] = dataset.apply(
+                lambda row: get_final_state_from_flags(
+                    row["state"], row["pkts"]
+                ),
+                axis=1,
+            )
+            # dataset.state = new_state_column
 
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(
diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py
index 0b805976df..3a7f783ea7 100644
--- a/slips_files/core/database/database_manager.py
+++ b/slips_files/core/database/database_manager.py
@@ -613,9 +613,6 @@ def add_out_dns(self, *args, **kwargs):
     def add_port(self, *args, **kwargs):
         return self.rdb.add_port(*args, **kwargs)
 
-    def get_final_state_from_flags(self, *args, **kwargs):
-        return self.rdb.get_final_state_from_flags(*args, **kwargs)
-
     def add_ips(self, *args, **kwargs):
         return self.rdb.add_ips(*args, **kwargs)
 

From bfc1221692fc0d0e8d72ad157f2eeff254706cc5 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 237/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 3379f5077f..f052931c89 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,6 +121,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From e9c16da10372297e2c4258b11dd94f02475c6f2d Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 238/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 135 ---------------------
 1 file changed, 135 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index f052931c89..3379f5077f 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From ff289cbf8018779acd8a4ab08a8448223e5a24b8 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 239/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 3379f5077f..f052931c89 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,6 +121,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 31f5e9c653792a09dfe8ce215e1f57b0b2e71e59 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 240/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 135 ---------------------
 1 file changed, 135 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index f052931c89..3379f5077f 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 777c76da4098c59526bbce25139ed973129a8460 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 241/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 3379f5077f..f052931c89 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,6 +121,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 8c7df7c47300cc7f1507a71b98d3252cb10dcb4e Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 242/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 135 ---------------------
 1 file changed, 135 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index f052931c89..3379f5077f 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 25d09337a3341a8831684f00875d9e32bba520c4 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 243/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 3379f5077f..f052931c89 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,6 +121,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From e140a0c122398fc669668f26ae5d808d9ea662a8 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 244/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 135 ---------------------
 1 file changed, 135 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index f052931c89..3379f5077f 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 104379e99f054bc8b99813a428c62b05c7b6181a Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 245/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 3379f5077f..f052931c89 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,6 +121,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 22244a7ec594088f70514e5efef966d20732d064 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 246/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 135 ---------------------
 1 file changed, 135 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index f052931c89..3379f5077f 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From f06b6a3ff035031735ec93a106d4ea0a4315d50e Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Sat, 15 Mar 2025 19:23:29 +0100
Subject: [PATCH 247/455] Re add function that alya added

---
 slips_files/core/database/database_manager.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py
index 3a7f783ea7..0b805976df 100644
--- a/slips_files/core/database/database_manager.py
+++ b/slips_files/core/database/database_manager.py
@@ -613,6 +613,9 @@ def add_out_dns(self, *args, **kwargs):
     def add_port(self, *args, **kwargs):
         return self.rdb.add_port(*args, **kwargs)
 
+    def get_final_state_from_flags(self, *args, **kwargs):
+        return self.rdb.get_final_state_from_flags(*args, **kwargs)
+
     def add_ips(self, *args, **kwargs):
         return self.rdb.add_ips(*args, **kwargs)
 

From 9e0355a012f073928a7edcb388701a0e7e26748c Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Sat, 15 Mar 2025 19:25:03 +0100
Subject: [PATCH 248/455] delete sys

---
 modules/flowmldetection/flowmldetection.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 3379f5077f..c06755a599 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -10,7 +10,6 @@
 import json
 import traceback
 import warnings
-import sys
 
 from slips_files.common.parsers.config_parser import ConfigParser
 from slips_files.common.slips_utils import utils

From c98a3cd4ea7da549834fee1a3d5d34c33f068266 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Sat, 15 Mar 2025 19:27:23 +0100
Subject: [PATCH 249/455] Delete file that was deleted from develop

---
 slips_files/common/state_handler.py | 170 ----------------------------
 1 file changed, 170 deletions(-)
 delete mode 100644 slips_files/common/state_handler.py

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
deleted file mode 100644
index d0a05115bd..0000000000
--- a/slips_files/common/state_handler.py
+++ /dev/null
@@ -1,170 +0,0 @@
-from typing import Optional
-
-
-def interpret_suricata_states(state) -> Optional[str]:
-    """
-    There are different states in which a flow can be.
-    Suricata distinguishes three flow-states for TCP and two for
-     UDP. For TCP,
-    these are: New, Established and Closed,for UDP only new and
-    established.
-    For each of these states Suricata can employ different timeouts.
-    """
-    if "new" in state or "established" in state:
-        return "Established"
-    elif "closed" in state:
-        return "Not Established"
-
-
-def interpret_zeek_states(state) -> Optional[str]:
-    # We have varius type of states depending on the type of flow.
-    # For Zeek
-    if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-        return "Not Established"
-    elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-        return "Established"
-
-
-def interpret_argus_states(state) -> Optional[str]:
-    pre = state.split("_")[0]
-    try:
-        suf = state.split("_")[1]
-    except IndexError:
-        return
-
-    if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-        """
-        Examples:
-        SA_SA
-        SR_SA
-        FSRA_SA
-        SPA_SPA
-        SRA_SPA
-        FSA_FSA
-        FSA_FSPA
-        SAEC_SPA
-        SRPA_SPA
-        FSPA_SPA
-        FSRPA_SPA
-        FSPA_FSPA
-        FSRA_FSPA
-        SRAEC_SPA
-        FSPA_FSRPA
-        FSAEC_FSPA
-        FSRPA_FSPA
-        SRPAEC_SPA
-        FSPAEC_FSPA
-        SRPAEC_FSRPA
-        """
-        return "Established"
-    elif "PA" in pre and "PA" in suf:
-        # Tipical flow that was reported in the middle
-        """
-        Examples:
-        PA_PA
-        FPA_FPA
-        """
-        return "Established"
-    elif "ECO" in pre:
-        return "ICMP Echo"
-    elif "ECR" in pre:
-        return "ICMP Reply"
-    elif "URH" in pre:
-        return "ICMP Host Unreachable"
-    elif "URP" in pre:
-        return "ICMP Port Unreachable"
-    else:
-        """
-        Examples:
-        S_RA
-        S_R
-        A_R
-        S_SA
-        SR_SA
-        FA_FA
-        SR_RA
-        SEC_RA
-        """
-        return "Not Established"
-
-
-def interpret_tcp_states(state, pkts) -> Optional[str]:
-    pre = state.split("_")[0]
-    if "EST" in pre:
-        # TCP
-        return "Established"
-    elif "RST" in pre:
-        # TCP. When -z B is not used in argus, states are single words.
-        # Most connections are reseted when finished and therefore are
-        # established
-        # It can happen that is reseted being not established, but we
-        # can't tell without -z b.
-        # So we use as heuristic the amount of packets. If <=3, then is
-        # not established because the OS retries 3 times.
-        return "Not Established" if int(pkts) <= 3 else "Established"
-    elif "FIN" in pre:
-        # TCP. When -z B is not used in argus, states are single words.
-        # Most connections are finished with FIN when finished and
-        # therefore are established
-        # It can happen that is finished being not established, but we
-        # can't tell without -z b.
-        # So we use as heuristic the amount of packets. If <=3, then is
-        # not established because the OS retries 3 times.
-        return "Not Established" if int(pkts) <= 3 else "Established"
-    else:
-        """
-        Examples:
-        S_
-        FA_
-        PA_
-        FSA_
-        SEC_
-        SRPA_
-        """
-        return "Not Established"
-
-
-def interpret_udp_states(state) -> Optional[str]:
-    pre = state.split("_")[0]
-    if "CON" in pre:
-        # UDP
-        return "Established"
-    elif "INT" in pre:
-        # UDP trying to connect, NOT preciselly not established but also
-        # NOT 'Established'. So we considered not established because there
-        # is no confirmation of what happened.
-        return "Not Established"
-
-
-def interpret_icmp_states(state) -> Optional[str]:
-    pre = state.split("_")[0]
-    if "ECO" in pre:
-        # ICMP
-        return "Established"
-    elif "UNK" in pre:
-        # ICMP6 unknown upper layer
-        return "Established"
-
-
-def get_final_state_from_flags(state, pkts) -> str:
-    """
-    Converts the original flags from the flow, to a state that slips
-    understands
-    Works with Argus, suricata, and Bro flags
-    We receive the packets to distinguish some Reset connections
-    """
-
-    for interpreter in (
-        interpret_suricata_states,
-        interpret_zeek_states,
-        interpret_argus_states,
-        interpret_icmp_states,
-        interpret_udp_states,
-    ):
-        if interpreted_state := interpreter(state):
-            return interpreted_state
-
-    if interpreted_state := interpret_tcp_states(state, pkts):
-        return interpreted_state
-
-    return "Not Established"

From 1a133431aba6f1a40e525206cc3ea14749136ffd Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Sat, 15 Mar 2025 19:32:01 +0100
Subject: [PATCH 250/455] Flowmldetection. Fix missing db reference

---
 modules/flowmldetection/flowmldetection.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index c06755a599..87e07c7592 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -160,7 +160,7 @@ def process_features(self, dataset):
             # 'Not Established', it is still 'S0' and others
             # So transform here
             dataset["state"] = dataset.apply(
-                lambda row: get_final_state_from_flags(
+                lambda row: self.db.get_final_state_from_flags(
                     row["state"], row["pkts"]
                 ),
                 axis=1,

From b7af797fc757d7e3cbfc2317edc7381e5ee1e203 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Tue, 18 Mar 2025 12:08:08 +0100
Subject: [PATCH 251/455] Fix the training of flows with ML in new version

---
 modules/flowmldetection/flowmldetection.py | 144 +++++++++++----------
 1 file changed, 77 insertions(+), 67 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 87e07c7592..e91495d649 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -55,8 +55,12 @@ def init(self):
         # Set the output queue of our database instance
         # Read the configuration
         self.read_configuration()
-        # Minum amount of new lables needed to trigger the train
-        self.minimum_lables_to_retrain = 50
+        # Minum amount of new labels needed to start the train
+        self.minimum_labels_to_start_train = 50
+        # Minum amount of new labels needed to retrain
+        self.minimum_labels_to_retrain = 50
+        # The number of flows when last trained
+        self.last_number_of_flows_when_trained = 0
         # To plot the scores of training
         # self.scores = []
         # The scaler trained during training and to use during testing
@@ -67,26 +71,25 @@ def init(self):
     def read_configuration(self):
         conf = ConfigParser()
         self.mode = conf.get_ml_mode()
+        self.label = conf.label()
 
     def train(self):
         """
         Train a model based on the flows we receive and the labels
         """
         try:
-            # Process the labels to have only Normal and Malware
-            self.flows.label = self.flows.label.str.replace(
-                r"(^.*ormal.*$)", "Normal", regex=True
-            )
-            self.flows.label = self.flows.label.str.replace(
-                r"(^.*alware.*$)", "Malware", regex=True
-            )
-            self.flows.label = self.flows.label.str.replace(
-                r"(^.*alicious.*$)", "Malware", regex=True
-            )
+            # Get the flows from the DB
+            # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid)
+            # Convert to pandas df
+            # self.flows = pd.DataFrame(self.flows)
+            # Process the features
+            # X_flow = self.process_features(self.flows)
 
-            # Separate
-            y_flow = self.flows["label"]
+            # Create X_flow with the current flows minus the label
             X_flow = self.flows.drop("label", axis=1)
+            # Create y_flow with the label
+            y_flow = numpy.full(X_flow.shape[0], self.label)
+            # Drop the module_labels
             X_flow = X_flow.drop("module_labels", axis=1)
 
             # Normalize this batch of data so far. This can get progressivle slow
@@ -95,7 +98,7 @@ def train(self):
             # Train
             try:
                 self.clf.partial_fit(
-                    X_flow, y_flow, classes=["Malware", "Normal"]
+                    X_flow, y_flow, classes=["Malicious", "Benign"]
                 )
             except Exception:
                 self.print("Error while calling clf.train()")
@@ -118,7 +121,7 @@ def train(self):
             self.store_model()
 
         except Exception:
-            self.print("Error in train()", 0, 1)
+            self.print("Error in train().", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
@@ -144,9 +147,7 @@ def process_features(self, dataset):
                 "history",
                 "uid",
                 "dir_",
-                "dbytes",
                 "endtime",
-                "bytes",
                 "flow_source",
             ]
             for field in to_drop:
@@ -161,11 +162,10 @@ def process_features(self, dataset):
             # So transform here
             dataset["state"] = dataset.apply(
                 lambda row: self.db.get_final_state_from_flags(
-                    row["state"], row["pkts"]
+                    row["state"], (row["spkts"] + row["dpkts"])
                 ),
                 axis=1,
             )
-            # dataset.state = new_state_column
 
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(
@@ -199,7 +199,11 @@ def process_features(self, dataset):
             dataset.proto = dataset.proto.str.replace(
                 r"(^.*arp.*$)", "4", regex=True
             )
-            fields_to_convert_to_flow = [
+
+            dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"]
+            dataset["pkts"] = dataset["spkts"] + dataset["dpkts"]
+
+            fields_to_convert_to_float = [
                 dataset.proto,
                 dataset.dport,
                 dataset.sport,
@@ -210,10 +214,10 @@ def process_features(self, dataset):
                 dataset.sbytes,
                 dataset.state,
             ]
-            for field in fields_to_convert_to_flow:
+            for field in fields_to_convert_to_float:
                 try:
                     field = field.astype("float64")
-                except ValueError:
+                except (ValueError, AttributeError):
                     pass
 
             return dataset
@@ -222,9 +226,9 @@ def process_features(self, dataset):
             self.print("Error in process_features()")
             self.print(traceback.format_exc(), 0, 1)
 
-    def process_flows(self):
+    def process_training_flows(self):
         """
-        Process all the flwos in the DB
+        Process all the flows in the DB
         Store the pandas df in self.flows
         """
         try:
@@ -240,44 +244,48 @@ def process_flows(self):
                 # that are fake but representative of a normal and malware flow
                 # they are only for the training process
                 # At least 1 flow of each label is required
-                # self.print(f'Amount of labeled flows: {labels}', 0, 1)
+
+                # These flows should be in the same format as the ones in the DB. 
+                # Which means the satate is still SF, S0, etc.
                 flows.append(
                     {
-                        "ts": 1594417039.029793,
+                        "starttime": 1594417039.029793,
                         "dur": "1.9424750804901123",
                         "saddr": "10.7.10.101",
                         "sport": "49733",
                         "daddr": "40.70.224.145",
                         "dport": "443",
                         "proto": "tcp",
-                        "state": "Established",
-                        "allbytes": 42764,
-                        "spkts": 37,
+                        "state": "SF",
+                        "spkts": 17,
+                        "dpkts": 27,
                         "sbytes": 25517,
+                        "dbytes": 17247,
                         "appproto": "ssl",
-                        "label": "Malware",
+                        "label": "Malicious",
                         "module_labels": {
-                            "flowalerts-long-connection": "Malware"
+                            "flowalerts-long-connection": "Malicious"
                         },
                     }
                 )
                 flows.append(
                     {
-                        "ts": 1382355032.706468,
+                        "starttime": 1382355032.706468,
                         "dur": "10.896695",
                         "saddr": "147.32.83.52",
                         "sport": "47956",
                         "daddr": "80.242.138.72",
                         "dport": "80",
                         "proto": "tcp",
-                        "state": "Established",
-                        "allbytes": 67696,
+                        "state": "SF",
                         "spkts": 1,
+                        "dpkts": 0,
                         "sbytes": 100,
+                        "dbytes": 67596,
                         "appproto": "http",
-                        "label": "Normal",
+                        "label": "Benign",
                         "module_labels": {
-                            "flowalerts-long-connection": "Normal"
+                            "flowalerts-long-connection": "Benign"
                         },
                     }
                 )
@@ -318,7 +326,6 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
         and returns the predection array
         """
         try:
-            given_x_flow = x_flow
             # clean the flow
             fields_to_drop = [
                 "label",
@@ -326,10 +333,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
                 "uid",
                 "history",
                 "dir_",
-                "dbytes",
-                "dpkts",
                 "endtime",
-                "bytes",
                 "flow_source",
                 "ground_truth_label",  # todo now we can use them
                 "detailed_ground_truth_label",
@@ -345,7 +349,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
             return pred
         except Exception as e:
             self.print(
-                f"Error in detect() while processing " f"\n{given_x_flow}\n{e}"
+                f"Error in detect() while processing " f"\n{x_flow}\n{e}"
             )
             self.print(traceback.format_exc(), 0, 1)
 
@@ -437,18 +441,16 @@ def pre_main(self):
 
     def main(self):
         if msg := self.get_msg("new_flow"):
+            # When a new flow arrives
             msg = json.loads(msg["data"])
-            twid = msg["twid"]
+            self.twid = msg["twid"]
+            self.profileid = msg["profileid"]
             self.flow = msg["flow"]
-            # these fields are expected in testing. update the original
+            # These following extra fields are expected in testing. update the original
             # flow dict to have them
             self.flow.update(
                 {
-                    "allbytes": (self.flow["sbytes"] + self.flow["dbytes"]),
-                    # the flow["state"] is the origstate, we dont need that here
-                    # we need the interpreted state
                     "state": msg["interpreted_state"],
-                    "pkts": self.flow["spkts"] + self.flow["dpkts"],
                     "label": msg["label"],
                     "module_labels": msg["module_labels"],
                 }
@@ -461,23 +463,31 @@ def main(self):
                 # Use labeled flows
                 labels = self.db.get_labels()
                 sum_labeled_flows = sum(i[1] for i in labels)
+
+                # The min labels to retrain is the min number of flows 
+                # we should have seen so far in this capture to start training
+                # This is so we dont _start_ training with only 1 flow
+
+                # Once we are over the start minimum, the second condition is 
+                # to force to retrain every a minimum_labels_to_retrain number
+                # of flows. So we dont retrain every 1 flow.
                 if (
-                    sum_labeled_flows >= self.minimum_lables_to_retrain
-                    and sum_labeled_flows % self.minimum_lables_to_retrain == 1
+                    sum_labeled_flows >= self.minimum_labels_to_start_train
                 ):
-                    # We get here every 'self.minimum_lables_to_retrain'
-                    # amount of labels
-                    # So for example we retrain every 100 labels and only when
-                    # we have at least 100 labels
-                    self.print(
-                        f"Training the model with the last group of "
-                        f"flows and labels. Total flows: {sum_labeled_flows}."
-                    )
-                    # Process all flows in the DB and make them ready
-                    # for pandas
-                    self.process_flows()
-                    # Train an algorithm
-                    self.train()
+                    if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain):
+                        # So for example we retrain every 50 labels and only when
+                        # we have at least 50 labels
+                        self.print(
+                            f"Training the model with the last group of "
+                            f"flows and labels. Total flows: {sum_labeled_flows}."
+                        )
+                        # Process all flows in the DB and make them ready
+                        # for pandas
+                        self.process_training_flows()
+                        # Train an algorithm
+                        self.train()
+                        self.last_number_of_flows_when_trained = sum_labeled_flows
+
             elif self.mode == "test":
                 # We are testing, which means using the model to detect
                 processed_flow = self.process_flow(self.flow)
@@ -497,8 +507,8 @@ def main(self):
                         # and the label is diff from the prediction,
                         # print in debug mode
                         self.print(
-                            f"Report Prediction {pred[0]} for label"
-                            f' {label} flow {self.flow["saddr"]}:'
+                            f"Predicted {pred[0]} for ground-truth label"
+                            f' {label}. Flow {self.flow["saddr"]}:'
                             f'{self.flow["sport"]} ->'
                             f' {self.flow["daddr"]}:'
                             f'{self.flow["dport"]}/'
@@ -506,9 +516,9 @@ def main(self):
                             0,
                             3,
                         )
-                    if pred[0] == "Malware":
+                    if pred[0] == "Malicious":
                         # Generate an alert
-                        self.set_evidence_malicious_flow(self.flow, twid)
+                        self.set_evidence_malicious_flow(self.flow, self.twid)
                         self.print(
                             f"Prediction {pred[0]} for label {label}"
                             f' flow {self.flow["saddr"]}:'

From 3faff9b5bd3aeb53c306324572e39e743f43272d Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Tue, 18 Mar 2025 12:08:29 +0100
Subject: [PATCH 252/455] Fix the profile handler for cases of NaN in state

---
 .../core/database/redis_db/profile_handler.py     | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/slips_files/core/database/redis_db/profile_handler.py b/slips_files/core/database/redis_db/profile_handler.py
index edbbf3a12f..4d91b43a98 100644
--- a/slips_files/core/database/redis_db/profile_handler.py
+++ b/slips_files/core/database/redis_db/profile_handler.py
@@ -395,7 +395,12 @@ def get_final_state_from_flags(self, state, pkts):
         We receive the pakets to distinguish some Reset connections
         """
         try:
-            pre = state.split("_")[0]
+            # In some flows the state is a nan
+            try:
+                pre = state.split("_")[0]
+            except AttributeError:
+                pre = ''
+
             try:
                 # Try suricata states
                 """
@@ -417,7 +422,11 @@ def get_final_state_from_flags(self, state, pkts):
                     return "Established"
 
                 # For Argus
-                suf = state.split("_")[1]
+            # In some flows the state is a nan
+                try:
+                    suf = state.split("_")[1]
+                except AttributeError:
+                    suf = ''
                 if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
                     """
                     Examples:
@@ -518,7 +527,7 @@ def get_final_state_from_flags(self, state, pkts):
         except Exception:
             exception_line = sys.exc_info()[2].tb_lineno
             self.print(
-                f"Error in getFinalStateFromFlags() in database.py line {exception_line}",
+                f"Error in get_final_state_from_flags() in profile_handler.py line {exception_line}",
                 0,
                 1,
             )

From 2e0603b2c8e0adb327bf5249a30d2894a7d02adb Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 19 Mar 2025 14:20:42 +0100
Subject: [PATCH 253/455] slips.yaml. Update to have correct labels. By default
 test. Default training label is Benign

---
 config/slips.yaml | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/config/slips.yaml b/config/slips.yaml
index 02adc7f1b4..1b73e7b549 100644
--- a/config/slips.yaml
+++ b/config/slips.yaml
@@ -105,13 +105,12 @@ parameters:
   deletePrevdb: true
 
   # Set the label for all the flows that are being read.
-  # For now only normal and malware directly. No option for setting labels
-  # with a filter
+  # For now only Benign and Malicious (Capitalized)
   # The purpose is to be used in the training of ML models and to output
   # flows with labels for other tools.
-  # label: malicious
-  # label: unknown
-  label: normal
+  # label: Malicious
+  # label: Benign
+  label: Benign
   # If Zeek files are rotated or not to avoid running out of disk.
   # Zeek rotation is enabled by default when using an interface,
   # which means Slips will delete all Zeek log files after 1 day

From 6f2e3c3be24352300ad435be5734a92cb917ab52 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 19 Mar 2025 14:21:21 +0100
Subject: [PATCH 254/455] First ipython notebook to test ML flow related models

---
 modules/flowmldetection/flowmlanalysis.ipynb | 76 ++++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100644 modules/flowmldetection/flowmlanalysis.ipynb

diff --git a/modules/flowmldetection/flowmlanalysis.ipynb b/modules/flowmldetection/flowmlanalysis.ipynb
new file mode 100644
index 0000000000..d726cd2805
--- /dev/null
+++ b/modules/flowmldetection/flowmlanalysis.ipynb
@@ -0,0 +1,76 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Analysis of Flows with Machine Learning for Slips"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Analysis of a fixed list of flows to try techniques and find parameters"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy\n",
+    "from sklearn.linear_model import SGDClassifier\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "import pickle\n",
+    "import pandas as pd\n",
+    "import json\n",
+    "import traceback\n",
+    "import warnings"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "slips-new",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

From 9a91a801f64855f3d9dbb64a013160e7ebc97d2d Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 19 Mar 2025 14:22:38 +0100
Subject: [PATCH 255/455] flowml. If the dataset has one flow and that is
 deleted, then return empty fast.

---
 modules/flowmldetection/flowmldetection.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index e91495d649..58b4ce1e4c 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -135,6 +135,11 @@ def process_features(self, dataset):
             for proto in to_discard:
                 dataset = dataset[dataset.proto != proto]
 
+            # If te proto is in the list to delete and there is only one flow, then the dataset will be empty
+            if dataset.empty:
+                # DataFrame is empty now, so return empty
+                return dataset
+
             # For now, discard these
             to_drop = [
                 "appproto",

From b7c55c1fb89e829950ff3f1e4075135f92eb0f8d Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 19 Mar 2025 14:23:05 +0100
Subject: [PATCH 256/455] flowml. If the dataset is empty. Return none

---
 modules/flowmldetection/flowmldetection.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 58b4ce1e4c..4a4d46e376 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -318,6 +318,8 @@ def process_flow(self, flow_to_process: dict):
             # Convert the flow to a pandas dataframe
             raw_flow = pd.DataFrame(flow_to_process, index=[0])
             dflow = self.process_features(raw_flow)
+            if dflow.empty:
+                return None
             # Update the flow to the processed version
             return dflow
         except Exception:

From 1336ced589060f2382bfdcc41b883aab7cff2530 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 19 Mar 2025 14:26:42 +0100
Subject: [PATCH 257/455] profile_handler. Small bug in how we handled the
 profiles, we were using 'in' instead of == for established. Some not
 established MAY not have been correctly captured

---
 slips_files/core/database/redis_db/profile_handler.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/slips_files/core/database/redis_db/profile_handler.py b/slips_files/core/database/redis_db/profile_handler.py
index 4d91b43a98..a6669c92a9 100644
--- a/slips_files/core/database/redis_db/profile_handler.py
+++ b/slips_files/core/database/redis_db/profile_handler.py
@@ -409,9 +409,10 @@ def get_final_state_from_flags(self, state, pkts):
                 these are: New, Established and Closed,for UDP only new and established.
                 For each of these states Suricata can employ different timeouts.
                 """
-                if "new" in state or "established" in state:
+                # This is controversial, but if we dont have a good state, we consider it not established for now
+                if "new" in state or state.lower() == "established":
                     return "Established"
-                elif "closed" in state:
+                elif "closed" in state or state.lower() == 'not established':
                     return "Not Established"
 
                 # We have varius type of states depending on the type of flow.
@@ -422,7 +423,6 @@ def get_final_state_from_flags(self, state, pkts):
                     return "Established"
 
                 # For Argus
-            # In some flows the state is a nan
                 try:
                     suf = state.split("_")[1]
                 except AttributeError:

From 9dc77cd61c1b6431af32903d5003111405945ff3 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 19 Mar 2025 14:27:16 +0100
Subject: [PATCH 258/455] First new version of the model and scaler. Not good
 yet, but working.

---
 modules/flowmldetection/model.bin  | Bin 1073 -> 1090 bytes
 modules/flowmldetection/scaler.bin | Bin 666 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index aef4cba35b7b18287b2be11df2c45e9187d053e0..7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f 100644
GIT binary patch
delta 411
zcmdnUafpMZfn}=sMwVTSLVS7gX{m`NrA4X5@tJw?Q@pt+?`4dsSMFg=DXA<-oice!
z4^yb+l+KR!DM3>-yqS9#ZKnA7`FX2&vv_lqOiAi=_HbfgU`PexJRr^l;&dn*NWp<G
zkd+9;IY10jlMJQvfpjSl7X$GWpm9B7@u?LBiIY2+gw>QXRJ|Emr(~!@tY-9P0&;YE
zn9?i|&dD&Ce2~eg9z!XkH+S2V4C_wkh?WyPJF_Dc9WKfLlU4d@b|Cws{>t}{)DLw1
z+&}AzmCAv&TP(zH?>}O{hvWaISvjr_FKvFL2{5=iNW7SLz3Rc`{Vb{$-4P-p4qe>8
zsxB_naJaLsc#$rfl>^wdT$y<#sYS_psRbqRQ!+4J0dz3X4IB|u^h2FOBOrSC<0o%q
b_GNYGp25H{nVChIi`7uiOwVviX_6iQocV_5

delta 380
zcmX@av5|wNfn}=NMwVTSlP@wxCo1)@rj%3`q)wSUrH3ihd`f3W`;?$58s5x3j5bsJ
z{QSISyji?CN~R=rI$QWMFfb$naSjldL)k?@Iu(fhfH)tBLHt}GhN)wiJeNsPO)*2o
zo1t||h8o0hMsFq{N4tk9%>rRzhW_LyOh)w>N*TR5+ps7Gy3Ol95P%792Cxe~W+qIF
za^V+n06G8#1cs%Z;j-Fjug&$3=gY2%`}>OzXw^50I4Ib2ygQ=8?eISD+{&-(1rNNN
zW*@<QMemXQ6rg!MT$y<#sYS`D1tsxQGBDi*^nz8VGl$HwGgBKKcpSi5_$D7__GNWU
Y$oltSvM7r(H=BW;k%^wsl+q+U0L_+r^#A|>

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
index 9292bda6a6f307edcbe83de0ccfac2437950fa6f..bfba4d107224e5e6e5a1e8c8f4d463b48131d111 100644
GIT binary patch
literal 890
zcmZo*nVQec00uqM#o0NjiA8yO1x2X^Mfu68#l@L<>3Z=gi6x0sdia7%67y0Li&BD<
z6LV6Frc9oq**K+kibfA-d1gsQd~RxD-jt3W4lt*<Bn8N3NzN~*1k!?Osfi_}MXB+5
ziMgr8@tJw?Q+h=6N^=V;^^)_8QuT66b4oH3i;5B}r}Xf|7o{fW=M|R}l_r+}?dV|z
zE1c594ze4hWbzbmhSn+0j7d}4rvy#W@MiSpZk>|B)5DrlQdy7+(!%6#F{QHuBFo&v
zSTn`X&(G^W5P%79`;sY1ov}cNGpCdm0WAP}wIm-%7Z>Cgl>q4!C|z8TT~Z9@gK4(n
zq{@<1pa5G6l)+kDl30>DrH3OiCkHCT0@jlXw4sMDFCOY|h?l*&d-(I>ixYDTa)5zQ
zoSF&@h78Ugwqjs#CW3-9L$rq}6UF5=Xf9^}n(NGw#R!S7DQ!Ui_ppKj9%vGG1|KL~
zG6axA1!!U#l8G7ms38F|uhTi=d{gv8|4E7tk+E%iFU@~!UvXb5a&PSo2flmT-L8E;
z<KX(aCF;No0f(>8bB?zwe6@e~Ltx1rmTHK2M45x=S>Y_k<U{t7-@NM2t?O`b(oOJr
z^gz^M3Md#@$`XrkInNqs`NcqG+rqF4hjd*#=9SUs?C-J63^i@q?kIRi^+W#GZH^`;
z_s<+FuX0#b@Z7FWCEbCm^j(f1Uy0+U$5+2R=gD#Gag=$mqjtZ2f~w@S(sxacYyTMt
zX(&vBSPl-{RD9NRUiR6)W4EzGNqG>5fcAZRq4P``?^7;27=%1`Z_~Q$U{zGVUTcbp
z!_f@A=ox0(4vo(kWe&dF=y3Qof57>#YaOosUa>d6=$d`i{)GMC=j?EBVaVAqgWVBg
dy+AxPN5z+=78Pga=K-V3P|r-ya7t;C9st`jc!2-_

delta 525
zcmeyxHj9<Dfn{nx(?r(zNWQ%IwA93s(xTMj_{_ZcDc+nt{CV-kiMa(iK#}6qRG?Uo
zXkKY<L8V@Deo?AkZfQ<QW@1rMV&#+`w&LW(oW!Cjlc)5sf|UU2l#<GVRG`S@UPd*o
zK3xVdn9?>S2q>4Eng}#5gR6%pz9=<0Kd-o?s5H3*s5pan;tH91&ekaz{5?!*7E?Mq
z+JPE0yqPnE{rvpA{sRG+@Mb8PlGN$!ajPlrTH5m<hihBT2gkmB>hSAF{H=RwvmLf>
z0f8s>T))|-e^~LtKJ)mxuX}VF9GJI$RyS%Ec8u45-?98hw!_K^*XJ>X{BW28G_HrG
zEU^gaDyjMm8E*!l6XYOvGkSyEqXM*T;RVCs$??LDiHthAxexX@M(tU$UF%Yf<K@j$
z%OkDN+J7s4Zlk~8nEkR&OOX||2OO>Q^>gH&y>a4SnPazubB5!cqc1YnCGT<q+XfD^
zRFHk>))m9-o7^A!Lv41W!_hYO<*VziI$XJV#W|*OtHau9f5TQzy=Sjs!`l%1;+FkJ
z-k|Geeq3|-Iw{uqQ%0WS6}i^>);WhA{C@WaM)rg_g6$KCFV4<MO)ScbFH0>d&dkpP
QMw@}2k%^wsl+q+U0P2|NZU6uP


From 12e3d93823589e3314325b158b8becc66e8d5d21 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Thu, 20 Mar 2025 13:16:06 +0100
Subject: [PATCH 259/455] model and scaler with 1 malicious and 1 benign

---
 modules/flowmldetection/model.bin  | Bin 1090 -> 1124 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index 7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f..0fac693b39f8e2f0e826471e72a52010709a2a4a 100644
GIT binary patch
delta 132
zcmX@a@q~k=fn{psMix!x$(NZ_BO`RCb>5jSY<wU!LGn&}gUNx$3tLYnGl@A&e4S}J
zBSY*!uI-OaTr&&~Tw#uuJJhD?&}O=|yn5R|`<q--?Rc;A9e69=F=gK7d-lipbZtG?
j>K$N!g2|oC+8oRs(l<p`Iv<$4hdGq_%F%|&A}o3UHQY43

delta 131
zcmV-}0DS-C2*L;ifCQB{u>>gtTsw(FScJ$7xocP<K++%n9U}QQz-#d@sqf+^z>4|5
zn$$WZz^$}67u&zYKfDP4vYKo~K<PaAW)J{HKp5zr*LvX7zXT;Xi&zvCK#B|cdeNaM
lK-{i#qAdnGK$EKjD+pMWFI7TSSU{8B162cym;jSD1TGrII8guq

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
index bfba4d107224e5e6e5a1e8c8f4d463b48131d111..758909b289238ff282b2e056a9b3e83768b8472a 100644
GIT binary patch
delta 290
zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG
zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa
zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r!tJEWTtnm`Ac
zf^|*Gc0elJHxDoXo<X#jLG{%4&p`{bHVfI|sz2Ot$hvzr(m{gu&Jr}1sgu|OC|8h9
z&D_B~i9nx5TWFn3t3OFIt}1{vt3Z{LgvGX?u|Rx7|2|A?L_o(e@1J}aGeCBvuU^0C
o#Xz54;5X{~-$0+hirZ#X%|CDEfPfMb;Xu)C9icy+Op{jwBi%8H3jhEB

delta 290
zcmV+-0p0%k2KolDvjGBX0h6@>Pgl=^Tj5WVAwXGOhP~3C<3D)c8d<%4*gy{6w?@|U
z%s@r+gj&Fu5J2_kY{!Qn^*`?T5Tx7$dq73=gj&E@c0d))6KVlt!ao`IN`KC-h(JOu
zU`*oR6_Z;6CRfo>BRz3ectB?@J_D&+&OhD+nN>A{w?PryCGc<cwm~#B-^|8$dO)gh
z=RSTUXFv*d?raebbV0J?)%52KY(b1c9N@Tpzdv9l8J2bKf<dkRFcK&rl9Sj1C|3#7
zOux9hGeC5AQV0+$-#-%10%-4K(?Bp(=SPMr(?B|Mf3GT(G(g2@E?by3D?owf0UW{U
ou|UJ?4^Yqbtw7cLsJ&lu)<1f`V88dAxIjbzY_OOIL6cVlBeWBRwg3PC


From 64fb2201dd47afe364cfe483a3f6898220fd726c Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Thu, 20 Mar 2025 13:16:27 +0100
Subject: [PATCH 260/455] cleaner jupyter

---
 modules/flowmldetection/flowmldetection.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 4a4d46e376..d8e9ada27c 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -345,6 +345,23 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
                 "ground_truth_label",  # todo now we can use them
                 "detailed_ground_truth_label",
             ]
+            # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes.
+            # Error
+            ''' [Flow ML Detection] Error in detect() while processing                                                                                                                                                                                                                         
+            dur proto  sport dport  state  pkts  spkts  dpkts  bytes  sbytes  dbytes  allbytes                                                                                                                                                                                    
+            0  63.822830     0  56119   981    0.0    15     15      0   8764    1887       0      1887                                                                                                                                                                                    
+            The feature names should match those that were passed during fit.                                                                                                                                                                                                              
+            Feature names unseen at fit time:                                                                                                                                                                                                                                              
+            - bytes     
+            '''
+
+            # IF we delete here the filed bytes the error is
+            # [Flow ML Detection] Error in detect() while processing 
+            # dur proto sport dport  state  pkts  spkts  dpkts  sbytes  dbytes allbytes                                                                                                                                                             
+            # 0  63.822830     0  56120   980    0.0    15     15      0    1887       0      1887                                                                                                                                                             
+            # The feature names should match those that were passed during fit.                                                                                                                                                                                
+            # Feature names must be in the same order as they were in fit.                                                                                                                                                                                     
+                                                                                                                                                                                                                                                 
             for field in fields_to_drop:
                 try:
                     x_flow = x_flow.drop(field, axis=1)

From 83a9128ed9c44fbf9d55c05523a627a97bd60766 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Thu, 20 Mar 2025 22:26:27 +0100
Subject: [PATCH 261/455] New models after 3rd train

---
 modules/flowmldetection/model.bin  | Bin 1124 -> 1124 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index 0fac693b39f8e2f0e826471e72a52010709a2a4a..5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8 100644
GIT binary patch
delta 99
zcmaFD@q}YTFtfkevYaFSxkd-_3;rCozh!)2lYmpEbEdGvA?<Hh%FhcQcr$C8Y3pPC
z18<ZS-x*q|IJlZU`}#BRi@kOC+S)TqI1eyD!9BgtJUbI~Cof{2$owSaz+^KPJplbX
BD~|vG

delta 99
zcmV-p0G$8i2;>N`Qv**dmWkY#6EnbNU>V$pfHc5?(6z~90v14#>uEKZXcoY2J@~Q;
zm@vT916v-#h9y9THMMtpw*Eia3Y9($*ABq#7l@Rev)(_(4lO-NtuB+I1CayN#ekDG
F1TN2UERg^J

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
index 758909b289238ff282b2e056a9b3e83768b8472a..821344a0c69d116622b02e2a0daa1554cb5d308e 100644
GIT binary patch
delta 43
zcmV+`0M!5b2KolDfdU!c4zU>a=p#W80?-ZtN@qVmt(3Z<B4<G|lbQl%0g;md19^W6
B5HkP(

delta 43
zcmV+`0M!5b2KolDfdU!a4zU>a=p#W80?-ZtN@qVmt(3Z<B4<G}lbQl%0g#gc19^VR
B5HSD%


From 35c0a9f830c1744f4b348d6e1253bec731a7726b Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 26 Mar 2025 00:08:50 +0100
Subject: [PATCH 262/455] Models after 4th train

---
 modules/flowmldetection/model.bin  | Bin 1124 -> 1124 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index 5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8..3ab5a240bb45f88d026d1d9d1959cfa384e2473b 100644
GIT binary patch
delta 120
zcmV-;0EhqN2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv*
zM<l=vT(CiV7%o61L>A;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P
as{<<t5IC<!D(NIZlimYX0{noJHUuueR5UOE

delta 120
zcmaFD@q}YTFmr_2vYaFSxkd-_3;rCozh!)2lYmpEbEdGvA?<Hh%FhcQcr$C8Y3pPC
z18<ZS-x*q|IJlZU`}#BRi@kOC+S)TqI1eyD!9BgtJUbI~;rz*~nYB5XJEU)ltaLsw
P`5to^Q^<kIW-NLDlQb|V

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
index 821344a0c69d116622b02e2a0daa1554cb5d308e..29df65342047c5a499ee3f8e602d1f47cb7e9fca 100644
GIT binary patch
delta 290
zcmV+-0p0%k2KolDvjGB90h6@>PghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB
z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m
z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A
za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o
zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9
ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}Dxj<tAT~->ML6cVlBO2|D9RL6T

delta 290
zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG
zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa
zjm~^LK$BYnCRg4Lu^9H~BS8=X&<+7gXFor!l)9lJXF)RTPqWf%b3r!tJEWTtnm`Ac
zf^|*Gc0elJHxDoXo<X#jLG{%4&p`{bHVfI|sz2Ot$hvzr(m{gu&Jr}1sgu|OC|8kA
z&D_B~i9nx5TWFn3t3OFIt}1{vt3Z{LgvGX?u|Rx7|2|A?L_o(e@1J}aGeCBvuU^0C
o#Xz54;5X{~-$0+hirZ#X%|CDEfPfMb;Xu)C9icy+Op{jwBi>ty4FCWD


From 71b93a508e1d8d625fb51ae4a698360044f2af34 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 26 Mar 2025 08:28:59 +0100
Subject: [PATCH 263/455] Models of ml flow with the first good performance in
 small tests

---
 modules/flowmldetection/model.bin  | Bin 1124 -> 1124 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index 3ab5a240bb45f88d026d1d9d1959cfa384e2473b..a6648cf72179520975b0e9ad1164f7d574e87140 100644
GIT binary patch
delta 121
zcmV-<0EYkM2;>N`Qv+C~&P*9hb`!w*mV~|wLl?l5mFM4w$NRqlOAcXmpyfaAG1();
zYA3+ulRv$a;~zkIU#E>ocI-ba#L|>%Hv~ZN`Y<G(@KnDch(0mn<s!fm-ctwS<Lp0^
bs{<<tAstXx>?b6^limYW1McwvlQsk{8#y@u

delta 121
zcmV-<0EYkM2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv*
zM<l=vT(CiV7%o61L>A;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P
bs{<<t5IC<!D(NIZlimYW1LXXGlQsk{_eeDq

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
index 29df65342047c5a499ee3f8e602d1f47cb7e9fca..17115724b9536f6093f9d72f3b58a5c22c562a9a 100644
GIT binary patch
delta 290
zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG
zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa
zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r-wJEWTtnm`Ac
zf^|*Gc0elJHxDoXo<X&kLG{%4&p`{bHVfI|sz2Uv$hvzr(m{gu&Jr}1sgu|OC|8h9
z&D_B~i9nx5TWFn3t3OFIt}1{vt3Z{LgvGX?u|R!8|2|A?L_o(e@1J}aGeCBvuU^0C
o#Xz85;5X{~-$0+hirZ#X%|CGFfPfMb;Xu)C9icy+Op{jwBjCh|6aWAK

delta 290
zcmV+-0p0%k2KolDvjGB90h6@>PghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB
z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m
z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A
za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o
zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9
ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}Dxj<tAT~->ML6cVlBO2|D9RL6T


From 2c70aa760e24cc16268efd553a3f94747b12a15e Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 264/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 317 +++++++++++++--------
 1 file changed, 206 insertions(+), 111 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index d8e9ada27c..8917fef6a5 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -10,6 +10,7 @@
 import json
 import traceback
 import warnings
+import sys
 
 from slips_files.common.parsers.config_parser import ConfigParser
 from slips_files.common.slips_utils import utils
@@ -55,12 +56,8 @@ def init(self):
         # Set the output queue of our database instance
         # Read the configuration
         self.read_configuration()
-        # Minum amount of new labels needed to start the train
-        self.minimum_labels_to_start_train = 50
-        # Minum amount of new labels needed to retrain
-        self.minimum_labels_to_retrain = 50
-        # The number of flows when last trained
-        self.last_number_of_flows_when_trained = 0
+        # Minum amount of new lables needed to trigger the train
+        self.minimum_lables_to_retrain = 50
         # To plot the scores of training
         # self.scores = []
         # The scaler trained during training and to use during testing
@@ -71,25 +68,26 @@ def init(self):
     def read_configuration(self):
         conf = ConfigParser()
         self.mode = conf.get_ml_mode()
-        self.label = conf.label()
 
     def train(self):
         """
         Train a model based on the flows we receive and the labels
         """
         try:
-            # Get the flows from the DB
-            # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid)
-            # Convert to pandas df
-            # self.flows = pd.DataFrame(self.flows)
-            # Process the features
-            # X_flow = self.process_features(self.flows)
+            # Process the labels to have only Normal and Malware
+            self.flows.label = self.flows.label.str.replace(
+                r"(^.*ormal.*$)", "Normal", regex=True
+            )
+            self.flows.label = self.flows.label.str.replace(
+                r"(^.*alware.*$)", "Malware", regex=True
+            )
+            self.flows.label = self.flows.label.str.replace(
+                r"(^.*alicious.*$)", "Malware", regex=True
+            )
 
-            # Create X_flow with the current flows minus the label
+            # Separate
+            y_flow = self.flows["label"]
             X_flow = self.flows.drop("label", axis=1)
-            # Create y_flow with the label
-            y_flow = numpy.full(X_flow.shape[0], self.label)
-            # Drop the module_labels
             X_flow = X_flow.drop("module_labels", axis=1)
 
             # Normalize this batch of data so far. This can get progressivle slow
@@ -98,7 +96,7 @@ def train(self):
             # Train
             try:
                 self.clf.partial_fit(
-                    X_flow, y_flow, classes=["Malicious", "Benign"]
+                    X_flow, y_flow, classes=["Malware", "Normal"]
                 )
             except Exception:
                 self.print("Error while calling clf.train()")
@@ -121,7 +119,142 @@ def train(self):
             self.store_model()
 
         except Exception:
-            self.print("Error in train().", 0, 1)
+            self.print("Error in train()", 0, 1)
+            self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
             self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
@@ -135,11 +268,6 @@ def process_features(self, dataset):
             for proto in to_discard:
                 dataset = dataset[dataset.proto != proto]
 
-            # If te proto is in the list to delete and there is only one flow, then the dataset will be empty
-            if dataset.empty:
-                # DataFrame is empty now, so return empty
-                return dataset
-
             # For now, discard these
             to_drop = [
                 "appproto",
@@ -152,7 +280,9 @@ def process_features(self, dataset):
                 "history",
                 "uid",
                 "dir_",
+                "dbytes",
                 "endtime",
+                "bytes",
                 "flow_source",
             ]
             for field in to_drop:
@@ -161,16 +291,12 @@ def process_features(self, dataset):
                 except (ValueError, KeyError):
                     pass
 
-            # When flows are read from Slips sqlite,
-            # the state is not transformed to 'Established' or
-            # 'Not Established', it is still 'S0' and others
+            # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
             # So transform here
-            dataset["state"] = dataset.apply(
-                lambda row: self.db.get_final_state_from_flags(
-                    row["state"], (row["spkts"] + row["dpkts"])
-                ),
-                axis=1,
-            )
+            #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
+            dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
+
+            #dataset.state = new_state_column
 
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(
@@ -204,11 +330,7 @@ def process_features(self, dataset):
             dataset.proto = dataset.proto.str.replace(
                 r"(^.*arp.*$)", "4", regex=True
             )
-
-            dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"]
-            dataset["pkts"] = dataset["spkts"] + dataset["dpkts"]
-
-            fields_to_convert_to_float = [
+            fields_to_convert_to_flow = [
                 dataset.proto,
                 dataset.dport,
                 dataset.sport,
@@ -219,10 +341,10 @@ def process_features(self, dataset):
                 dataset.sbytes,
                 dataset.state,
             ]
-            for field in fields_to_convert_to_float:
+            for field in fields_to_convert_to_flow:
                 try:
                     field = field.astype("float64")
-                except (ValueError, AttributeError):
+                except ValueError:
                     pass
 
             return dataset
@@ -231,9 +353,9 @@ def process_features(self, dataset):
             self.print("Error in process_features()")
             self.print(traceback.format_exc(), 0, 1)
 
-    def process_training_flows(self):
+    def process_flows(self):
         """
-        Process all the flows in the DB
+        Process all the flwos in the DB
         Store the pandas df in self.flows
         """
         try:
@@ -249,48 +371,44 @@ def process_training_flows(self):
                 # that are fake but representative of a normal and malware flow
                 # they are only for the training process
                 # At least 1 flow of each label is required
-
-                # These flows should be in the same format as the ones in the DB. 
-                # Which means the satate is still SF, S0, etc.
+                # self.print(f'Amount of labeled flows: {labels}', 0, 1)
                 flows.append(
                     {
-                        "starttime": 1594417039.029793,
+                        "ts": 1594417039.029793,
                         "dur": "1.9424750804901123",
                         "saddr": "10.7.10.101",
                         "sport": "49733",
                         "daddr": "40.70.224.145",
                         "dport": "443",
                         "proto": "tcp",
-                        "state": "SF",
-                        "spkts": 17,
-                        "dpkts": 27,
+                        "state": "Established",
+                        "allbytes": 42764,
+                        "spkts": 37,
                         "sbytes": 25517,
-                        "dbytes": 17247,
                         "appproto": "ssl",
-                        "label": "Malicious",
+                        "label": "Malware",
                         "module_labels": {
-                            "flowalerts-long-connection": "Malicious"
+                            "flowalerts-long-connection": "Malware"
                         },
                     }
                 )
                 flows.append(
                     {
-                        "starttime": 1382355032.706468,
+                        "ts": 1382355032.706468,
                         "dur": "10.896695",
                         "saddr": "147.32.83.52",
                         "sport": "47956",
                         "daddr": "80.242.138.72",
                         "dport": "80",
                         "proto": "tcp",
-                        "state": "SF",
+                        "state": "Established",
+                        "allbytes": 67696,
                         "spkts": 1,
-                        "dpkts": 0,
                         "sbytes": 100,
-                        "dbytes": 67596,
                         "appproto": "http",
-                        "label": "Benign",
+                        "label": "Normal",
                         "module_labels": {
-                            "flowalerts-long-connection": "Benign"
+                            "flowalerts-long-connection": "Normal"
                         },
                     }
                 )
@@ -318,8 +436,6 @@ def process_flow(self, flow_to_process: dict):
             # Convert the flow to a pandas dataframe
             raw_flow = pd.DataFrame(flow_to_process, index=[0])
             dflow = self.process_features(raw_flow)
-            if dflow.empty:
-                return None
             # Update the flow to the processed version
             return dflow
         except Exception:
@@ -333,6 +449,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
         and returns the predection array
         """
         try:
+            given_x_flow = x_flow
             # clean the flow
             fields_to_drop = [
                 "label",
@@ -340,28 +457,12 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
                 "uid",
                 "history",
                 "dir_",
+                "dbytes",
+                "dpkts",
                 "endtime",
+                "bytes",
                 "flow_source",
-                "ground_truth_label",  # todo now we can use them
-                "detailed_ground_truth_label",
             ]
-            # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes.
-            # Error
-            ''' [Flow ML Detection] Error in detect() while processing                                                                                                                                                                                                                         
-            dur proto  sport dport  state  pkts  spkts  dpkts  bytes  sbytes  dbytes  allbytes                                                                                                                                                                                    
-            0  63.822830     0  56119   981    0.0    15     15      0   8764    1887       0      1887                                                                                                                                                                                    
-            The feature names should match those that were passed during fit.                                                                                                                                                                                                              
-            Feature names unseen at fit time:                                                                                                                                                                                                                                              
-            - bytes     
-            '''
-
-            # IF we delete here the filed bytes the error is
-            # [Flow ML Detection] Error in detect() while processing 
-            # dur proto sport dport  state  pkts  spkts  dpkts  sbytes  dbytes allbytes                                                                                                                                                             
-            # 0  63.822830     0  56120   980    0.0    15     15      0    1887       0      1887                                                                                                                                                             
-            # The feature names should match those that were passed during fit.                                                                                                                                                                                
-            # Feature names must be in the same order as they were in fit.                                                                                                                                                                                     
-                                                                                                                                                                                                                                                 
             for field in fields_to_drop:
                 try:
                     x_flow = x_flow.drop(field, axis=1)
@@ -373,7 +474,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
             return pred
         except Exception as e:
             self.print(
-                f"Error in detect() while processing " f"\n{x_flow}\n{e}"
+                f"Error in detect() while processing " f"\n{given_x_flow}\n{e}"
             )
             self.print(traceback.format_exc(), 0, 1)
 
@@ -465,16 +566,18 @@ def pre_main(self):
 
     def main(self):
         if msg := self.get_msg("new_flow"):
-            # When a new flow arrives
             msg = json.loads(msg["data"])
-            self.twid = msg["twid"]
-            self.profileid = msg["profileid"]
+            twid = msg["twid"]
             self.flow = msg["flow"]
-            # These following extra fields are expected in testing. update the original
+            # these fields are expected in testing. update the original
             # flow dict to have them
             self.flow.update(
                 {
+                    "allbytes": (self.flow["sbytes"] + self.flow["dbytes"]),
+                    # the flow["state"] is the origstate, we dont need that here
+                    # we need the interpreted state
                     "state": msg["interpreted_state"],
+                    "pkts": self.flow["spkts"] + self.flow["dpkts"],
                     "label": msg["label"],
                     "module_labels": msg["module_labels"],
                 }
@@ -487,31 +590,23 @@ def main(self):
                 # Use labeled flows
                 labels = self.db.get_labels()
                 sum_labeled_flows = sum(i[1] for i in labels)
-
-                # The min labels to retrain is the min number of flows 
-                # we should have seen so far in this capture to start training
-                # This is so we dont _start_ training with only 1 flow
-
-                # Once we are over the start minimum, the second condition is 
-                # to force to retrain every a minimum_labels_to_retrain number
-                # of flows. So we dont retrain every 1 flow.
                 if (
-                    sum_labeled_flows >= self.minimum_labels_to_start_train
+                    sum_labeled_flows >= self.minimum_lables_to_retrain
+                    and sum_labeled_flows % self.minimum_lables_to_retrain == 1
                 ):
-                    if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain):
-                        # So for example we retrain every 50 labels and only when
-                        # we have at least 50 labels
-                        self.print(
-                            f"Training the model with the last group of "
-                            f"flows and labels. Total flows: {sum_labeled_flows}."
-                        )
-                        # Process all flows in the DB and make them ready
-                        # for pandas
-                        self.process_training_flows()
-                        # Train an algorithm
-                        self.train()
-                        self.last_number_of_flows_when_trained = sum_labeled_flows
-
+                    # We get here every 'self.minimum_lables_to_retrain'
+                    # amount of labels
+                    # So for example we retrain every 100 labels and only when
+                    # we have at least 100 labels
+                    self.print(
+                        f"Training the model with the last group of "
+                        f"flows and labels. Total flows: {sum_labeled_flows}."
+                    )
+                    # Process all flows in the DB and make them ready
+                    # for pandas
+                    self.process_flows()
+                    # Train an algorithm
+                    self.train()
             elif self.mode == "test":
                 # We are testing, which means using the model to detect
                 processed_flow = self.process_flow(self.flow)
@@ -531,8 +626,8 @@ def main(self):
                         # and the label is diff from the prediction,
                         # print in debug mode
                         self.print(
-                            f"Predicted {pred[0]} for ground-truth label"
-                            f' {label}. Flow {self.flow["saddr"]}:'
+                            f"Report Prediction {pred[0]} for label"
+                            f' {label} flow {self.flow["saddr"]}:'
                             f'{self.flow["sport"]} ->'
                             f' {self.flow["daddr"]}:'
                             f'{self.flow["dport"]}/'
@@ -540,9 +635,9 @@ def main(self):
                             0,
                             3,
                         )
-                    if pred[0] == "Malicious":
+                    if pred[0] == "Malware":
                         # Generate an alert
-                        self.set_evidence_malicious_flow(self.flow, self.twid)
+                        self.set_evidence_malicious_flow(self.flow, twid)
                         self.print(
                             f"Prediction {pred[0]} for label {label}"
                             f' flow {self.flow["saddr"]}:'

From e04e6c61fe8584afe0247f8b21fe2b865cdafe71 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:35:21 +0300
Subject: [PATCH 265/455] state_handler: split get_final_state_from_flags()
 into smaller functions

---
 slips_files/common/state_handler.py | 179 ++++++++++++++++++++++++++++
 1 file changed, 179 insertions(+)
 create mode 100644 slips_files/common/state_handler.py

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
new file mode 100644
index 0000000000..b671a09a28
--- /dev/null
+++ b/slips_files/common/state_handler.py
@@ -0,0 +1,179 @@
+from typing import Optional
+import sys
+import traceback
+
+
+def check_suricata_states(state) -> Optional[str]:
+    """
+    There are different states in which a flow can be.
+    Suricata distinguishes three flow-states for TCP and two for
+     UDP. For TCP,
+    these are: New, Established and Closed,for UDP only new and
+    established.
+    For each of these states Suricata can employ different timeouts.
+    """
+    if "new" in state or "established" in state:
+        return "Established"
+    elif "closed" in state:
+        return "Not Established"
+
+
+def check_zeek_states(state) -> Optional[str]:
+    # We have varius type of states depending on the type of flow.
+    # For Zeek
+    if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+        return "Not Established"
+    elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+        return "Established"
+
+
+def check_argus_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    suf = state.split("_")[1]
+    if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+        """
+        Examples:
+        SA_SA
+        SR_SA
+        FSRA_SA
+        SPA_SPA
+        SRA_SPA
+        FSA_FSA
+        FSA_FSPA
+        SAEC_SPA
+        SRPA_SPA
+        FSPA_SPA
+        FSRPA_SPA
+        FSPA_FSPA
+        FSRA_FSPA
+        SRAEC_SPA
+        FSPA_FSRPA
+        FSAEC_FSPA
+        FSRPA_FSPA
+        SRPAEC_SPA
+        FSPAEC_FSPA
+        SRPAEC_FSRPA
+        """
+        return "Established"
+    elif "PA" in pre and "PA" in suf:
+        # Tipical flow that was reported in the middle
+        """
+        Examples:
+        PA_PA
+        FPA_FPA
+        """
+        return "Established"
+    elif "ECO" in pre:
+        return "ICMP Echo"
+    elif "ECR" in pre:
+        return "ICMP Reply"
+    elif "URH" in pre:
+        return "ICMP Host Unreachable"
+    elif "URP" in pre:
+        return "ICMP Port Unreachable"
+    else:
+        """
+        Examples:
+        S_RA
+        S_R
+        A_R
+        S_SA
+        SR_SA
+        FA_FA
+        SR_RA
+        SEC_RA
+        """
+        return "Not Established"
+
+
+def check_tcp_states(state, pkts) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "EST" in pre:
+        # TCP
+        return "Established"
+    elif "RST" in pre:
+        # TCP. When -z B is not used in argus, states are single words.
+        # Most connections are reseted when finished and therefore are
+        # established
+        # It can happen that is reseted being not established, but we
+        # can't tell without -z b.
+        # So we use as heuristic the amount of packets. If <=3, then is
+        # not established because the OS retries 3 times.
+        return "Not Established" if int(pkts) <= 3 else "Established"
+    elif "FIN" in pre:
+        # TCP. When -z B is not used in argus, states are single words.
+        # Most connections are finished with FIN when finished and
+        # therefore are established
+        # It can happen that is finished being not established, but we
+        # can't tell without -z b.
+        # So we use as heuristic the amount of packets. If <=3, then is
+        # not established because the OS retries 3 times.
+        return "Not Established" if int(pkts) <= 3 else "Established"
+    else:
+        """
+        Examples:
+        S_
+        FA_
+        PA_
+        FSA_
+        SEC_
+        SRPA_
+        """
+        return "Not Established"
+
+
+def check_udp_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "CON" in pre:
+        # UDP
+        return "Established"
+    elif "INT" in pre:
+        # UDP trying to connect, NOT preciselly not established but also
+        # NOT 'Established'. So we considered not established because there
+        # is no confirmation of what happened.
+        return "Not Established"
+
+
+def check_icmp_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "ECO" in pre:
+        # ICMP
+        return "Established"
+    elif "UNK" in pre:
+        # ICMP6 unknown upper layer
+        return "Established"
+
+
+def get_final_state_from_flags(self, state, pkts) -> str:
+    """
+    Analyze the flags given and return a summary of the state.
+    Should work with Argus and Bro flags
+    We receive the pakets to distinguish some Reset connections
+    """
+    try:
+        if state := check_suricata_states(state):
+            return state
+        if state := check_zeek_states(state):
+            return state
+        if state := check_argus_states(state):
+            return state
+    except IndexError:
+        # suf does not exist, which means that this is some ICMP or
+        # no response was sent for UDP or TCP
+        if state := check_icmp_states(state):
+            return state
+        if state := check_udp_states(state):
+            return state
+        if state := check_tcp_states(state, pkts):
+            return state
+
+        return "Not Established"
+
+    except Exception:
+        exception_line = sys.exc_info()[2].tb_lineno
+        self.print(
+            f"Error in get_final_state_from_flags() " f"line {exception_line}",
+            0,
+            1,
+        )
+        self.print(traceback.format_exc(), 0, 1)

From 8a30e90ccdcecc165d280d5f47bde3d370fabe00 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:48:16 +0300
Subject: [PATCH 266/455] state_handler: refactor get_final_state_from_flags()

---
 slips_files/common/state_handler.py | 67 +++++++++++++----------------
 1 file changed, 29 insertions(+), 38 deletions(-)

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
index b671a09a28..d0a05115bd 100644
--- a/slips_files/common/state_handler.py
+++ b/slips_files/common/state_handler.py
@@ -1,9 +1,7 @@
 from typing import Optional
-import sys
-import traceback
 
 
-def check_suricata_states(state) -> Optional[str]:
+def interpret_suricata_states(state) -> Optional[str]:
     """
     There are different states in which a flow can be.
     Suricata distinguishes three flow-states for TCP and two for
@@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_zeek_states(state) -> Optional[str]:
+def interpret_zeek_states(state) -> Optional[str]:
     # We have varius type of states depending on the type of flow.
     # For Zeek
     if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
@@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]:
         return "Established"
 
 
-def check_argus_states(state) -> Optional[str]:
+def interpret_argus_states(state) -> Optional[str]:
     pre = state.split("_")[0]
-    suf = state.split("_")[1]
+    try:
+        suf = state.split("_")[1]
+    except IndexError:
+        return
+
     if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
         """
         Examples:
@@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_tcp_states(state, pkts) -> Optional[str]:
+def interpret_tcp_states(state, pkts) -> Optional[str]:
     pre = state.split("_")[0]
     if "EST" in pre:
         # TCP
@@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]:
         return "Not Established"
 
 
-def check_udp_states(state) -> Optional[str]:
+def interpret_udp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "CON" in pre:
         # UDP
@@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_icmp_states(state) -> Optional[str]:
+def interpret_icmp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "ECO" in pre:
         # ICMP
@@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]:
         return "Established"
 
 
-def get_final_state_from_flags(self, state, pkts) -> str:
+def get_final_state_from_flags(state, pkts) -> str:
     """
-    Analyze the flags given and return a summary of the state.
-    Should work with Argus and Bro flags
-    We receive the pakets to distinguish some Reset connections
+    Converts the original flags from the flow, to a state that slips
+    understands
+    Works with Argus, suricata, and Bro flags
+    We receive the packets to distinguish some Reset connections
     """
-    try:
-        if state := check_suricata_states(state):
-            return state
-        if state := check_zeek_states(state):
-            return state
-        if state := check_argus_states(state):
-            return state
-    except IndexError:
-        # suf does not exist, which means that this is some ICMP or
-        # no response was sent for UDP or TCP
-        if state := check_icmp_states(state):
-            return state
-        if state := check_udp_states(state):
-            return state
-        if state := check_tcp_states(state, pkts):
-            return state
 
-        return "Not Established"
+    for interpreter in (
+        interpret_suricata_states,
+        interpret_zeek_states,
+        interpret_argus_states,
+        interpret_icmp_states,
+        interpret_udp_states,
+    ):
+        if interpreted_state := interpreter(state):
+            return interpreted_state
+
+    if interpreted_state := interpret_tcp_states(state, pkts):
+        return interpreted_state
 
-    except Exception:
-        exception_line = sys.exc_info()[2].tb_lineno
-        self.print(
-            f"Error in get_final_state_from_flags() " f"line {exception_line}",
-            0,
-            1,
-        )
-        self.print(traceback.format_exc(), 0, 1)
+    return "Not Established"

From 3c7af271be30bc4b2a1f8fdf466941f9bfa5b5a9 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 267/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py    | 150 ++----------------
 slips_files/core/database/database_manager.py |   3 -
 2 files changed, 10 insertions(+), 143 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 8917fef6a5..fb17b57f23 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """
@@ -291,12 +156,17 @@ def process_features(self, dataset):
                 except (ValueError, KeyError):
                     pass
 
-            # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
+            # When flows are read from Slips sqlite,
+            # the state is not transformed to 'Established' or
+            # 'Not Established', it is still 'S0' and others
             # So transform here
-            #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
-            dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
-
-            #dataset.state = new_state_column
+            dataset["state"] = dataset.apply(
+                lambda row: get_final_state_from_flags(
+                    row["state"], row["pkts"]
+                ),
+                axis=1,
+            )
+            # dataset.state = new_state_column
 
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(
diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py
index 0b805976df..3a7f783ea7 100644
--- a/slips_files/core/database/database_manager.py
+++ b/slips_files/core/database/database_manager.py
@@ -613,9 +613,6 @@ def add_out_dns(self, *args, **kwargs):
     def add_port(self, *args, **kwargs):
         return self.rdb.add_port(*args, **kwargs)
 
-    def get_final_state_from_flags(self, *args, **kwargs):
-        return self.rdb.get_final_state_from_flags(*args, **kwargs)
-
     def add_ips(self, *args, **kwargs):
         return self.rdb.add_ips(*args, **kwargs)
 

From 561049fd9988c8435cff5ac5027e3602c2409088 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 268/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index fb17b57f23..12c3589edc 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,6 +121,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From da9a6b009a0cf1899f2739b9061558ff730ca3b6 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 269/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 135 ---------------------
 1 file changed, 135 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 12c3589edc..fb17b57f23 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 48b4255302ec79e0d4a9e675b42f08721411e34d Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 270/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index fb17b57f23..12c3589edc 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,6 +121,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 5be432f747eccfde0a25cf4d9f97cf6996fff206 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 271/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 135 ---------------------
 1 file changed, 135 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 12c3589edc..fb17b57f23 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 43f078f96a223cb031b6973dc4c0f4dcb34ac76b Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 272/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index fb17b57f23..12c3589edc 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,6 +121,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 6be1da4f70112a4bf1a49010dfbbf0123e2936bd Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 273/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 135 ---------------------
 1 file changed, 135 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 12c3589edc..fb17b57f23 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 4c52dd2a3fff6acfaa6e4c51593818fbedf73a39 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 274/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 163 +++++++++++++++++++--
 1 file changed, 149 insertions(+), 14 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index fb17b57f23..c8226368c7 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -8,6 +8,7 @@
 import pickle
 import pandas as pd
 import json
+import datetime
 import traceback
 import warnings
 import sys
@@ -121,6 +122,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """
@@ -156,17 +292,12 @@ def process_features(self, dataset):
                 except (ValueError, KeyError):
                     pass
 
-            # When flows are read from Slips sqlite,
-            # the state is not transformed to 'Established' or
-            # 'Not Established', it is still 'S0' and others
+            # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
             # So transform here
-            dataset["state"] = dataset.apply(
-                lambda row: get_final_state_from_flags(
-                    row["state"], row["pkts"]
-                ),
-                axis=1,
-            )
-            # dataset.state = new_state_column
+            #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
+            dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
+
+            #dataset.state = new_state_column
 
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(
@@ -393,21 +524,25 @@ def read_model(self):
     def set_evidence_malicious_flow(self, flow: dict, twid: str):
         confidence: float = 0.1
         description = (
-            f"Flow with malicious characteristics by ML. Src IP"
+            f"Malicious flow by ML. Src IP"
             f" {flow['saddr']}:{flow['sport']} to "
             f"{flow['daddr']}:{flow['dport']}"
         )
+
+        timestamp = utils.convert_format(
+            datetime.datetime.now(), utils.alerts_format
+        )
         twid_number = int(twid.replace("timewindow", ""))
         evidence: Evidence = Evidence(
             evidence_type=EvidenceType.MALICIOUS_FLOW,
             attacker=Attacker(
                 direction=Direction.SRC,
-                ioc_type=IoCType.IP,
+                attacker_type=IoCType.IP,
                 value=flow["saddr"],
             ),
             victim=Victim(
                 direction=Direction.DST,
-                ioc_type=IoCType.IP,
+                victim_type=IoCType.IP,
                 value=flow["daddr"],
             ),
             threat_level=ThreatLevel.LOW,
@@ -416,7 +551,7 @@ def set_evidence_malicious_flow(self, flow: dict, twid: str):
             profile=ProfileID(ip=flow["saddr"]),
             timewindow=TimeWindow(twid_number),
             uid=[flow["uid"]],
-            timestamp=flow["starttime"],
+            timestamp=timestamp,
             method=Method.AI,
             src_port=flow["sport"],
             dst_port=flow["dport"],

From 0b646faa189b0097648fb7283e91121aa211f19f Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:35:21 +0300
Subject: [PATCH 275/455] state_handler: split get_final_state_from_flags()
 into smaller functions

---
 slips_files/common/state_handler.py | 67 ++++++++++++++++-------------
 1 file changed, 38 insertions(+), 29 deletions(-)

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
index d0a05115bd..b671a09a28 100644
--- a/slips_files/common/state_handler.py
+++ b/slips_files/common/state_handler.py
@@ -1,7 +1,9 @@
 from typing import Optional
+import sys
+import traceback
 
 
-def interpret_suricata_states(state) -> Optional[str]:
+def check_suricata_states(state) -> Optional[str]:
     """
     There are different states in which a flow can be.
     Suricata distinguishes three flow-states for TCP and two for
@@ -16,7 +18,7 @@ def interpret_suricata_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def interpret_zeek_states(state) -> Optional[str]:
+def check_zeek_states(state) -> Optional[str]:
     # We have varius type of states depending on the type of flow.
     # For Zeek
     if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
@@ -25,13 +27,9 @@ def interpret_zeek_states(state) -> Optional[str]:
         return "Established"
 
 
-def interpret_argus_states(state) -> Optional[str]:
+def check_argus_states(state) -> Optional[str]:
     pre = state.split("_")[0]
-    try:
-        suf = state.split("_")[1]
-    except IndexError:
-        return
-
+    suf = state.split("_")[1]
     if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
         """
         Examples:
@@ -88,7 +86,7 @@ def interpret_argus_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def interpret_tcp_states(state, pkts) -> Optional[str]:
+def check_tcp_states(state, pkts) -> Optional[str]:
     pre = state.split("_")[0]
     if "EST" in pre:
         # TCP
@@ -124,7 +122,7 @@ def interpret_tcp_states(state, pkts) -> Optional[str]:
         return "Not Established"
 
 
-def interpret_udp_states(state) -> Optional[str]:
+def check_udp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "CON" in pre:
         # UDP
@@ -136,7 +134,7 @@ def interpret_udp_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def interpret_icmp_states(state) -> Optional[str]:
+def check_icmp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "ECO" in pre:
         # ICMP
@@ -146,25 +144,36 @@ def interpret_icmp_states(state) -> Optional[str]:
         return "Established"
 
 
-def get_final_state_from_flags(state, pkts) -> str:
+def get_final_state_from_flags(self, state, pkts) -> str:
     """
-    Converts the original flags from the flow, to a state that slips
-    understands
-    Works with Argus, suricata, and Bro flags
-    We receive the packets to distinguish some Reset connections
+    Analyze the flags given and return a summary of the state.
+    Should work with Argus and Bro flags
+    We receive the pakets to distinguish some Reset connections
     """
+    try:
+        if state := check_suricata_states(state):
+            return state
+        if state := check_zeek_states(state):
+            return state
+        if state := check_argus_states(state):
+            return state
+    except IndexError:
+        # suf does not exist, which means that this is some ICMP or
+        # no response was sent for UDP or TCP
+        if state := check_icmp_states(state):
+            return state
+        if state := check_udp_states(state):
+            return state
+        if state := check_tcp_states(state, pkts):
+            return state
 
-    for interpreter in (
-        interpret_suricata_states,
-        interpret_zeek_states,
-        interpret_argus_states,
-        interpret_icmp_states,
-        interpret_udp_states,
-    ):
-        if interpreted_state := interpreter(state):
-            return interpreted_state
-
-    if interpreted_state := interpret_tcp_states(state, pkts):
-        return interpreted_state
+        return "Not Established"
 
-    return "Not Established"
+    except Exception:
+        exception_line = sys.exc_info()[2].tb_lineno
+        self.print(
+            f"Error in get_final_state_from_flags() " f"line {exception_line}",
+            0,
+            1,
+        )
+        self.print(traceback.format_exc(), 0, 1)

From a477d089a3d8dd0391bb34de0261d7dafe23af2a Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:48:16 +0300
Subject: [PATCH 276/455] state_handler: refactor get_final_state_from_flags()

---
 slips_files/common/state_handler.py | 67 +++++++++++++----------------
 1 file changed, 29 insertions(+), 38 deletions(-)

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
index b671a09a28..d0a05115bd 100644
--- a/slips_files/common/state_handler.py
+++ b/slips_files/common/state_handler.py
@@ -1,9 +1,7 @@
 from typing import Optional
-import sys
-import traceback
 
 
-def check_suricata_states(state) -> Optional[str]:
+def interpret_suricata_states(state) -> Optional[str]:
     """
     There are different states in which a flow can be.
     Suricata distinguishes three flow-states for TCP and two for
@@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_zeek_states(state) -> Optional[str]:
+def interpret_zeek_states(state) -> Optional[str]:
     # We have varius type of states depending on the type of flow.
     # For Zeek
     if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
@@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]:
         return "Established"
 
 
-def check_argus_states(state) -> Optional[str]:
+def interpret_argus_states(state) -> Optional[str]:
     pre = state.split("_")[0]
-    suf = state.split("_")[1]
+    try:
+        suf = state.split("_")[1]
+    except IndexError:
+        return
+
     if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
         """
         Examples:
@@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_tcp_states(state, pkts) -> Optional[str]:
+def interpret_tcp_states(state, pkts) -> Optional[str]:
     pre = state.split("_")[0]
     if "EST" in pre:
         # TCP
@@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]:
         return "Not Established"
 
 
-def check_udp_states(state) -> Optional[str]:
+def interpret_udp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "CON" in pre:
         # UDP
@@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_icmp_states(state) -> Optional[str]:
+def interpret_icmp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "ECO" in pre:
         # ICMP
@@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]:
         return "Established"
 
 
-def get_final_state_from_flags(self, state, pkts) -> str:
+def get_final_state_from_flags(state, pkts) -> str:
     """
-    Analyze the flags given and return a summary of the state.
-    Should work with Argus and Bro flags
-    We receive the pakets to distinguish some Reset connections
+    Converts the original flags from the flow, to a state that slips
+    understands
+    Works with Argus, suricata, and Bro flags
+    We receive the packets to distinguish some Reset connections
     """
-    try:
-        if state := check_suricata_states(state):
-            return state
-        if state := check_zeek_states(state):
-            return state
-        if state := check_argus_states(state):
-            return state
-    except IndexError:
-        # suf does not exist, which means that this is some ICMP or
-        # no response was sent for UDP or TCP
-        if state := check_icmp_states(state):
-            return state
-        if state := check_udp_states(state):
-            return state
-        if state := check_tcp_states(state, pkts):
-            return state
 
-        return "Not Established"
+    for interpreter in (
+        interpret_suricata_states,
+        interpret_zeek_states,
+        interpret_argus_states,
+        interpret_icmp_states,
+        interpret_udp_states,
+    ):
+        if interpreted_state := interpreter(state):
+            return interpreted_state
+
+    if interpreted_state := interpret_tcp_states(state, pkts):
+        return interpreted_state
 
-    except Exception:
-        exception_line = sys.exc_info()[2].tb_lineno
-        self.print(
-            f"Error in get_final_state_from_flags() " f"line {exception_line}",
-            0,
-            1,
-        )
-        self.print(traceback.format_exc(), 0, 1)
+    return "Not Established"

From a74d1c5c6fc38842a6a3143ba91e8aae0c4c8599 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 277/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 150 ++-------------------
 1 file changed, 10 insertions(+), 140 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index c8226368c7..9af514a709 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -122,141 +122,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """
@@ -292,12 +157,17 @@ def process_features(self, dataset):
                 except (ValueError, KeyError):
                     pass
 
-            # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
+            # When flows are read from Slips sqlite,
+            # the state is not transformed to 'Established' or
+            # 'Not Established', it is still 'S0' and others
             # So transform here
-            #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
-            dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
-
-            #dataset.state = new_state_column
+            dataset["state"] = dataset.apply(
+                lambda row: get_final_state_from_flags(
+                    row["state"], row["pkts"]
+                ),
+                axis=1,
+            )
+            # dataset.state = new_state_column
 
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(

From 560a37b8ef1724010ec2f653ab6e686efbfe9fdb Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 278/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 9af514a709..94eb27afdf 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -122,6 +122,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 5190917ba7031d744def42bf9d0d1510a59746cc Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 279/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 135 ---------------------
 1 file changed, 135 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 94eb27afdf..9af514a709 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -122,141 +122,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 567f4393ad7832b554e8684c026fad71fe6d0b3e Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 280/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 150 +++++++++++++++++++--
 1 file changed, 140 insertions(+), 10 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 9af514a709..c8226368c7 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -122,6 +122,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """
@@ -157,17 +292,12 @@ def process_features(self, dataset):
                 except (ValueError, KeyError):
                     pass
 
-            # When flows are read from Slips sqlite,
-            # the state is not transformed to 'Established' or
-            # 'Not Established', it is still 'S0' and others
+            # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
             # So transform here
-            dataset["state"] = dataset.apply(
-                lambda row: get_final_state_from_flags(
-                    row["state"], row["pkts"]
-                ),
-                axis=1,
-            )
-            # dataset.state = new_state_column
+            #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
+            dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
+
+            #dataset.state = new_state_column
 
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(

From 626a5c3d5bb9f9cb94d5b1d91f4c61c4913247a1 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:35:21 +0300
Subject: [PATCH 281/455] state_handler: split get_final_state_from_flags()
 into smaller functions

---
 slips_files/common/state_handler.py | 67 ++++++++++++++++-------------
 1 file changed, 38 insertions(+), 29 deletions(-)

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
index d0a05115bd..b671a09a28 100644
--- a/slips_files/common/state_handler.py
+++ b/slips_files/common/state_handler.py
@@ -1,7 +1,9 @@
 from typing import Optional
+import sys
+import traceback
 
 
-def interpret_suricata_states(state) -> Optional[str]:
+def check_suricata_states(state) -> Optional[str]:
     """
     There are different states in which a flow can be.
     Suricata distinguishes three flow-states for TCP and two for
@@ -16,7 +18,7 @@ def interpret_suricata_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def interpret_zeek_states(state) -> Optional[str]:
+def check_zeek_states(state) -> Optional[str]:
     # We have varius type of states depending on the type of flow.
     # For Zeek
     if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
@@ -25,13 +27,9 @@ def interpret_zeek_states(state) -> Optional[str]:
         return "Established"
 
 
-def interpret_argus_states(state) -> Optional[str]:
+def check_argus_states(state) -> Optional[str]:
     pre = state.split("_")[0]
-    try:
-        suf = state.split("_")[1]
-    except IndexError:
-        return
-
+    suf = state.split("_")[1]
     if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
         """
         Examples:
@@ -88,7 +86,7 @@ def interpret_argus_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def interpret_tcp_states(state, pkts) -> Optional[str]:
+def check_tcp_states(state, pkts) -> Optional[str]:
     pre = state.split("_")[0]
     if "EST" in pre:
         # TCP
@@ -124,7 +122,7 @@ def interpret_tcp_states(state, pkts) -> Optional[str]:
         return "Not Established"
 
 
-def interpret_udp_states(state) -> Optional[str]:
+def check_udp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "CON" in pre:
         # UDP
@@ -136,7 +134,7 @@ def interpret_udp_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def interpret_icmp_states(state) -> Optional[str]:
+def check_icmp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "ECO" in pre:
         # ICMP
@@ -146,25 +144,36 @@ def interpret_icmp_states(state) -> Optional[str]:
         return "Established"
 
 
-def get_final_state_from_flags(state, pkts) -> str:
+def get_final_state_from_flags(self, state, pkts) -> str:
     """
-    Converts the original flags from the flow, to a state that slips
-    understands
-    Works with Argus, suricata, and Bro flags
-    We receive the packets to distinguish some Reset connections
+    Analyze the flags given and return a summary of the state.
+    Should work with Argus and Bro flags
+    We receive the pakets to distinguish some Reset connections
     """
+    try:
+        if state := check_suricata_states(state):
+            return state
+        if state := check_zeek_states(state):
+            return state
+        if state := check_argus_states(state):
+            return state
+    except IndexError:
+        # suf does not exist, which means that this is some ICMP or
+        # no response was sent for UDP or TCP
+        if state := check_icmp_states(state):
+            return state
+        if state := check_udp_states(state):
+            return state
+        if state := check_tcp_states(state, pkts):
+            return state
 
-    for interpreter in (
-        interpret_suricata_states,
-        interpret_zeek_states,
-        interpret_argus_states,
-        interpret_icmp_states,
-        interpret_udp_states,
-    ):
-        if interpreted_state := interpreter(state):
-            return interpreted_state
-
-    if interpreted_state := interpret_tcp_states(state, pkts):
-        return interpreted_state
+        return "Not Established"
 
-    return "Not Established"
+    except Exception:
+        exception_line = sys.exc_info()[2].tb_lineno
+        self.print(
+            f"Error in get_final_state_from_flags() " f"line {exception_line}",
+            0,
+            1,
+        )
+        self.print(traceback.format_exc(), 0, 1)

From 2c2212290619b7bccb25ef045f3a2ba3f4f5a270 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:48:16 +0300
Subject: [PATCH 282/455] state_handler: refactor get_final_state_from_flags()

---
 slips_files/common/state_handler.py | 67 +++++++++++++----------------
 1 file changed, 29 insertions(+), 38 deletions(-)

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
index b671a09a28..d0a05115bd 100644
--- a/slips_files/common/state_handler.py
+++ b/slips_files/common/state_handler.py
@@ -1,9 +1,7 @@
 from typing import Optional
-import sys
-import traceback
 
 
-def check_suricata_states(state) -> Optional[str]:
+def interpret_suricata_states(state) -> Optional[str]:
     """
     There are different states in which a flow can be.
     Suricata distinguishes three flow-states for TCP and two for
@@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_zeek_states(state) -> Optional[str]:
+def interpret_zeek_states(state) -> Optional[str]:
     # We have varius type of states depending on the type of flow.
     # For Zeek
     if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
@@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]:
         return "Established"
 
 
-def check_argus_states(state) -> Optional[str]:
+def interpret_argus_states(state) -> Optional[str]:
     pre = state.split("_")[0]
-    suf = state.split("_")[1]
+    try:
+        suf = state.split("_")[1]
+    except IndexError:
+        return
+
     if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
         """
         Examples:
@@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_tcp_states(state, pkts) -> Optional[str]:
+def interpret_tcp_states(state, pkts) -> Optional[str]:
     pre = state.split("_")[0]
     if "EST" in pre:
         # TCP
@@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]:
         return "Not Established"
 
 
-def check_udp_states(state) -> Optional[str]:
+def interpret_udp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "CON" in pre:
         # UDP
@@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_icmp_states(state) -> Optional[str]:
+def interpret_icmp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "ECO" in pre:
         # ICMP
@@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]:
         return "Established"
 
 
-def get_final_state_from_flags(self, state, pkts) -> str:
+def get_final_state_from_flags(state, pkts) -> str:
     """
-    Analyze the flags given and return a summary of the state.
-    Should work with Argus and Bro flags
-    We receive the pakets to distinguish some Reset connections
+    Converts the original flags from the flow, to a state that slips
+    understands
+    Works with Argus, suricata, and Bro flags
+    We receive the packets to distinguish some Reset connections
     """
-    try:
-        if state := check_suricata_states(state):
-            return state
-        if state := check_zeek_states(state):
-            return state
-        if state := check_argus_states(state):
-            return state
-    except IndexError:
-        # suf does not exist, which means that this is some ICMP or
-        # no response was sent for UDP or TCP
-        if state := check_icmp_states(state):
-            return state
-        if state := check_udp_states(state):
-            return state
-        if state := check_tcp_states(state, pkts):
-            return state
 
-        return "Not Established"
+    for interpreter in (
+        interpret_suricata_states,
+        interpret_zeek_states,
+        interpret_argus_states,
+        interpret_icmp_states,
+        interpret_udp_states,
+    ):
+        if interpreted_state := interpreter(state):
+            return interpreted_state
+
+    if interpreted_state := interpret_tcp_states(state, pkts):
+        return interpreted_state
 
-    except Exception:
-        exception_line = sys.exc_info()[2].tb_lineno
-        self.print(
-            f"Error in get_final_state_from_flags() " f"line {exception_line}",
-            0,
-            1,
-        )
-        self.print(traceback.format_exc(), 0, 1)
+    return "Not Established"

From 6bed5ff1a0bef41b33a1cd5b07dcf89cb2a43ab6 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 283/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 150 ++-------------------
 1 file changed, 10 insertions(+), 140 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index c8226368c7..9af514a709 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -122,141 +122,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """
@@ -292,12 +157,17 @@ def process_features(self, dataset):
                 except (ValueError, KeyError):
                     pass
 
-            # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
+            # When flows are read from Slips sqlite,
+            # the state is not transformed to 'Established' or
+            # 'Not Established', it is still 'S0' and others
             # So transform here
-            #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
-            dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
-
-            #dataset.state = new_state_column
+            dataset["state"] = dataset.apply(
+                lambda row: get_final_state_from_flags(
+                    row["state"], row["pkts"]
+                ),
+                axis=1,
+            )
+            # dataset.state = new_state_column
 
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(

From d5ea6803c87520eee8061d06dfce7a75159238b3 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 284/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 438 +++++++++++++--------
 1 file changed, 278 insertions(+), 160 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 9af514a709..124ec61f91 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -1,8 +1,3 @@
-# SPDX-FileCopyrightText: 2021 Sebastian Garcia <sebastian.garcia@agents.fel.cvut.cz>
-from typing import Optional
-
-# SPDX-License-Identifier: GPL-2.0-only
-import numpy
 from sklearn.linear_model import SGDClassifier
 from sklearn.preprocessing import StandardScaler
 import pickle
@@ -10,13 +5,10 @@
 import json
 import datetime
 import traceback
-import warnings
 import sys
 
-from slips_files.common.parsers.config_parser import ConfigParser
-from slips_files.common.slips_utils import utils
-from slips_files.common.abstracts.module import IModule
-from slips_files.core.structures.evidence import (
+from slips_files.common.imports import *
+from slips_files.core.evidence_structure.evidence import (
     Evidence,
     ProfileID,
     TimeWindow,
@@ -25,8 +17,7 @@
     EvidenceType,
     IoCType,
     Direction,
-    Victim,
-    Method,
+    IDEACategory,
 )
 
 # Only for debbuging
@@ -38,6 +29,8 @@ def warn(*args, **kwargs):
     pass
 
 
+import warnings
+
 warnings.warn = warn
 
 
@@ -63,8 +56,6 @@ def init(self):
         # self.scores = []
         # The scaler trained during training and to use during testing
         self.scaler = StandardScaler()
-        self.model_path = "./modules/flowmldetection/model.bin"
-        self.scaler_path = "./modules/flowmldetection/scaler.bin"
 
     def read_configuration(self):
         conf = ConfigParser()
@@ -122,6 +113,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """
@@ -130,7 +256,7 @@ def process_features(self, dataset):
         """
         try:
             # Discard some type of flows that dont have ports
-            to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""]
+            to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp"]
             for proto in to_discard:
                 dataset = dataset[dataset.proto != proto]
 
@@ -139,35 +265,28 @@ def process_features(self, dataset):
                 "appproto",
                 "daddr",
                 "saddr",
-                "starttime",
+                "ts",
+                "origstate",
                 "type_",
-                "smac",
-                "dmac",
-                "history",
-                "uid",
                 "dir_",
+                "history",
                 "dbytes",
-                "endtime",
-                "bytes",
-                "flow_source",
+                "dpkts",
+                "smac",
+                "dmac",
             ]
             for field in to_drop:
                 try:
                     dataset = dataset.drop(field, axis=1)
-                except (ValueError, KeyError):
+                except ValueError:
                     pass
 
-            # When flows are read from Slips sqlite,
-            # the state is not transformed to 'Established' or
-            # 'Not Established', it is still 'S0' and others
+            # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
             # So transform here
-            dataset["state"] = dataset.apply(
-                lambda row: get_final_state_from_flags(
-                    row["state"], row["pkts"]
-                ),
-                axis=1,
-            )
-            # dataset.state = new_state_column
+            #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
+            dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
+
+            #dataset.state = new_state_column
 
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(
@@ -201,23 +320,42 @@ def process_features(self, dataset):
             dataset.proto = dataset.proto.str.replace(
                 r"(^.*arp.*$)", "4", regex=True
             )
-            fields_to_convert_to_flow = [
-                dataset.proto,
-                dataset.dport,
-                dataset.sport,
-                dataset.dur,
-                dataset.pkts,
-                dataset.spkts,
-                dataset.allbytes,
-                dataset.sbytes,
-                dataset.state,
-            ]
-            for field in fields_to_convert_to_flow:
-                try:
-                    field = field.astype("float64")
-                except ValueError:
-                    pass
-
+            dataset.proto = dataset.proto.astype("float64")
+            try:
+                # Convert dport to float
+                dataset.dport = dataset.dport.astype("float")
+            except ValueError:
+                pass
+            try:
+                # Convert sport to float
+                dataset.sport = dataset.sport.astype("float")
+            except ValueError:
+                pass
+            try:
+                # Convert Dur to float
+                dataset.dur = dataset.dur.astype("float")
+            except ValueError:
+                pass
+            try:
+                # Convert TotPkts to float
+                dataset.pkts = dataset.pkts.astype("float")
+            except ValueError:
+                pass
+            try:
+                # Convert SrcPkts to float
+                dataset.spkts = dataset.spkts.astype("float")
+            except ValueError:
+                pass
+            try:
+                # Convert TotBytes to float
+                dataset.allbytes = dataset.allbytes.astype("float")
+            except ValueError:
+                pass
+            try:
+                # Convert SrcBytes to float
+                dataset.sbytes = dataset.sbytes.astype("float")
+            except ValueError:
+                pass
             return dataset
         except Exception:
             # Stop the timer
@@ -233,6 +371,7 @@ def process_flows(self):
             # We get all the flows so far
             # because this retraining happens in batches
             flows = self.db.get_all_flows()
+
             # Check how many different labels are in the DB
             # We need both normal and malware
             labels = self.db.get_labels()
@@ -252,7 +391,9 @@ def process_flows(self):
                         "daddr": "40.70.224.145",
                         "dport": "443",
                         "proto": "tcp",
+                        "origstate": "SRPA_SPA",
                         "state": "Established",
+                        "pkts": 84,
                         "allbytes": 42764,
                         "spkts": 37,
                         "sbytes": 25517,
@@ -272,7 +413,9 @@ def process_flows(self):
                         "daddr": "80.242.138.72",
                         "dport": "80",
                         "proto": "tcp",
+                        "origstate": "SRPA_SPA",
                         "state": "Established",
+                        "pkts": 67,
                         "allbytes": 67696,
                         "spkts": 1,
                         "sbytes": 100,
@@ -298,55 +441,42 @@ def process_flows(self):
             self.print("Error in process_flows()")
             self.print(traceback.format_exc(), 0, 1)
 
-    def process_flow(self, flow_to_process: dict):
+    def process_flow(self):
         """
         Process one flow. Only used during detection in testing
-        returns the pandas df with the processed flow
+        Store the pandas df in self.flow
         """
         try:
             # Convert the flow to a pandas dataframe
-            raw_flow = pd.DataFrame(flow_to_process, index=[0])
+            raw_flow = pd.DataFrame(self.flow_dict, index=[0])
+            # Process features
             dflow = self.process_features(raw_flow)
             # Update the flow to the processed version
-            return dflow
+            self.flow = dflow
         except Exception:
             # Stop the timer
             self.print("Error in process_flow()")
             self.print(traceback.format_exc(), 0, 1)
 
-    def detect(self, x_flow) -> Optional[numpy.ndarray]:
+    def detect(self):
         """
-        Detects the given flow with the current model stored
-        and returns the predection array
+        Detect this flow with the current model stored
         """
         try:
-            given_x_flow = x_flow
-            # clean the flow
-            fields_to_drop = [
-                "label",
-                "module_labels",
-                "uid",
-                "history",
-                "dir_",
-                "dbytes",
-                "dpkts",
-                "endtime",
-                "bytes",
-                "flow_source",
-            ]
-            for field in fields_to_drop:
-                try:
-                    x_flow = x_flow.drop(field, axis=1)
-                except (KeyError, ValueError):
-                    pass
+            # Store the real label if there is one
+            y_flow = self.flow["label"]
+            # remove the real label column
+            self.flow = self.flow.drop("label", axis=1)
+            # remove the label predictions column of the other modules
+            X_flow = self.flow.drop("module_labels", axis=1)
             # Scale the flow
-            x_flow: numpy.ndarray = self.scaler.transform(x_flow)
-            pred: numpy.ndarray = self.clf.predict(x_flow)
+            X_flow = self.scaler.transform(X_flow)
+            pred = self.clf.predict(X_flow)
             return pred
-        except Exception as e:
-            self.print(
-                f"Error in detect() while processing " f"\n{given_x_flow}\n{e}"
-            )
+        except Exception:
+            # Stop the timer
+            self.print("Error in detect() X_flow:")
+            self.print(X_flow)
             self.print(traceback.format_exc(), 0, 1)
 
     def store_model(self):
@@ -354,10 +484,10 @@ def store_model(self):
         Store the trained model on disk
         """
         self.print("Storing the trained model and scaler on disk.", 0, 2)
-        with open(self.model_path, "wb") as f:
+        with open("./modules/flowmldetection/model.bin", "wb") as f:
             data = pickle.dumps(self.clf)
             f.write(data)
-        with open(self.scaler_path, "wb") as g:
+        with open("./modules/flowmldetection/scaler.bin", "wb") as g:
             data = pickle.dumps(self.scaler)
             g.write(data)
 
@@ -367,23 +497,20 @@ def read_model(self):
         """
         try:
             self.print("Reading the trained model from disk.", 0, 2)
-            with open(self.model_path, "rb") as f:
+            with open("./modules/flowmldetection/model.bin", "rb") as f:
                 self.clf = pickle.load(f)
             self.print("Reading the trained scaler from disk.", 0, 2)
-            with open(self.scaler_path, "rb") as g:
+            with open("./modules/flowmldetection/scaler.bin", "rb") as g:
                 self.scaler = pickle.load(g)
         except FileNotFoundError:
             # If there is no model, create one empty
-            self.print(
-                "There was no model. " "Creating a new empty model.", 0, 2
-            )
+            self.print("There was no model. Creating a new empty model.", 0, 2)
             self.clf = SGDClassifier(
                 warm_start=True, loss="hinge", penalty="l1"
             )
         except EOFError:
             self.print(
-                "Error reading model from disk. "
-                "Creating a new empty model.",
+                "Error reading model from disk. Creating a new empty model.",
                 0,
                 2,
             )
@@ -391,40 +518,39 @@ def read_model(self):
                 warm_start=True, loss="hinge", penalty="l1"
             )
 
-    def set_evidence_malicious_flow(self, flow: dict, twid: str):
+    def set_evidence_malicious_flow(
+        self,
+        saddr: str,
+        sport: str,
+        daddr: str,
+        dport: str,
+        twid: str,
+        uid: str,
+    ):
         confidence: float = 0.1
+        ip_identification = self.db.get_ip_identification(daddr)
         description = (
-            f"Malicious flow by ML. Src IP"
-            f" {flow['saddr']}:{flow['sport']} to "
-            f"{flow['daddr']}:{flow['dport']}"
+            f"Malicious flow by ML. Src IP {saddr}:{sport} to "
+            f"{daddr}:{dport} {ip_identification}"
         )
 
         timestamp = utils.convert_format(
             datetime.datetime.now(), utils.alerts_format
         )
-        twid_number = int(twid.replace("timewindow", ""))
+
         evidence: Evidence = Evidence(
             evidence_type=EvidenceType.MALICIOUS_FLOW,
             attacker=Attacker(
-                direction=Direction.SRC,
-                attacker_type=IoCType.IP,
-                value=flow["saddr"],
-            ),
-            victim=Victim(
-                direction=Direction.DST,
-                victim_type=IoCType.IP,
-                value=flow["daddr"],
+                direction=Direction.SRC, attacker_type=IoCType.IP, value=saddr
             ),
             threat_level=ThreatLevel.LOW,
             confidence=confidence,
             description=description,
-            profile=ProfileID(ip=flow["saddr"]),
-            timewindow=TimeWindow(twid_number),
-            uid=[flow["uid"]],
+            profile=ProfileID(ip=saddr),
+            timewindow=TimeWindow(number=int(twid.replace("timewindow", ""))),
+            uid=[uid],
             timestamp=timestamp,
-            method=Method.AI,
-            src_port=flow["sport"],
-            dst_port=flow["dport"],
+            category=IDEACategory.ANOMALY_TRAFFIC,
         )
 
         self.db.set_evidence(evidence)
@@ -441,22 +567,20 @@ def pre_main(self):
 
     def main(self):
         if msg := self.get_msg("new_flow"):
-            msg = json.loads(msg["data"])
-            twid = msg["twid"]
-            self.flow = msg["flow"]
-            # these fields are expected in testing. update the original
-            # flow dict to have them
-            self.flow.update(
-                {
-                    "allbytes": (self.flow["sbytes"] + self.flow["dbytes"]),
-                    # the flow["state"] is the origstate, we dont need that here
-                    # we need the interpreted state
-                    "state": msg["interpreted_state"],
-                    "pkts": self.flow["spkts"] + self.flow["dpkts"],
-                    "label": msg["label"],
-                    "module_labels": msg["module_labels"],
-                }
-            )
+            data = msg["data"]
+            # Convert from json to dict
+            data = json.loads(data)
+            profileid = data["profileid"]
+            twid = data["twid"]
+            # Get flow that is now in json format
+            flow = data["flow"]
+            # Convert flow to a dict
+            flow = json.loads(flow)
+            # Convert the common fields to something that can
+            # be interpreted
+            # Get the uid which is the key
+            uid = next(iter(flow))
+            self.flow_dict = json.loads(flow[uid])
 
             if self.mode == "train":
                 # We are training
@@ -469,57 +593,51 @@ def main(self):
                     sum_labeled_flows >= self.minimum_lables_to_retrain
                     and sum_labeled_flows % self.minimum_lables_to_retrain == 1
                 ):
-                    # We get here every 'self.minimum_lables_to_retrain'
-                    # amount of labels
-                    # So for example we retrain every 100 labels and only when
-                    # we have at least 100 labels
+                    # We get here every 'self.minimum_lables_to_retrain' amount of labels
+                    # So for example we retrain every 100 labels and only when we have at least 100 labels
                     self.print(
-                        f"Training the model with the last group of "
-                        f"flows and labels. Total flows: {sum_labeled_flows}."
+                        f"Training the model with the last group of flows and labels. Total flows: {sum_labeled_flows}."
                     )
-                    # Process all flows in the DB and make them ready
-                    # for pandas
+                    # Process all flows in the DB and make them ready for pandas
                     self.process_flows()
                     # Train an algorithm
                     self.train()
             elif self.mode == "test":
                 # We are testing, which means using the model to detect
-                processed_flow = self.process_flow(self.flow)
+                self.process_flow()
 
-                # After processing the flow, it may happen that we
-                # delete icmp/arp/etc so the dataframe can be empty
-                if processed_flow is not None and not processed_flow.empty:
+                # After processing the flow, it may happen that we delete icmp/arp/etc
+                # so the dataframe can be empty
+                if self.flow is not None and not self.flow.empty:
                     # Predict
-                    pred: numpy.ndarray = self.detect(processed_flow)
-                    if not pred:
-                        # an error occurred
-                        return
+                    pred = self.detect()
+                    label = self.flow_dict["label"]
 
-                    label = self.flow["label"]
+                    # Report
                     if label and label != "unknown" and label != pred[0]:
-                        # If the user specified a label in test mode,
-                        # and the label is diff from the prediction,
-                        # print in debug mode
+                        # If the user specified a label in test mode, and the label
+                        # is diff from the prediction, print in debug mode
                         self.print(
-                            f"Report Prediction {pred[0]} for label"
-                            f' {label} flow {self.flow["saddr"]}:'
-                            f'{self.flow["sport"]} ->'
-                            f' {self.flow["daddr"]}:'
-                            f'{self.flow["dport"]}/'
-                            f'{self.flow["proto"]}',
+                            f'Report Prediction {pred[0]} for label {label} flow {self.flow_dict["saddr"]}:'
+                            f'{self.flow_dict["sport"]} -> {self.flow_dict["daddr"]}:'
+                            f'{self.flow_dict["dport"]}/{self.flow_dict["proto"]}',
                             0,
                             3,
                         )
                     if pred[0] == "Malware":
                         # Generate an alert
-                        self.set_evidence_malicious_flow(self.flow, twid)
+                        self.set_evidence_malicious_flow(
+                            self.flow_dict["saddr"],
+                            self.flow_dict["sport"],
+                            self.flow_dict["daddr"],
+                            self.flow_dict["dport"],
+                            twid,
+                            uid,
+                        )
                         self.print(
-                            f"Prediction {pred[0]} for label {label}"
-                            f' flow {self.flow["saddr"]}:'
-                            f'{self.flow["sport"]} -> '
-                            f'{self.flow["daddr"]}:'
-                            f'{self.flow["dport"]}/'
-                            f'{self.flow["proto"]}',
+                            f'Prediction {pred[0]} for label {label} flow {self.flow_dict["saddr"]}:'
+                            f'{self.flow_dict["sport"]} -> {self.flow_dict["daddr"]}:'
+                            f'{self.flow_dict["dport"]}/{self.flow_dict["proto"]}',
                             0,
                             2,
                         )

From 0e07e32ecc9922fb33f034bf05c3f8888b0938ab Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:35:21 +0300
Subject: [PATCH 285/455] state_handler: split get_final_state_from_flags()
 into smaller functions

---
 slips_files/common/state_handler.py | 67 ++++++++++++++++-------------
 1 file changed, 38 insertions(+), 29 deletions(-)

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
index d0a05115bd..b671a09a28 100644
--- a/slips_files/common/state_handler.py
+++ b/slips_files/common/state_handler.py
@@ -1,7 +1,9 @@
 from typing import Optional
+import sys
+import traceback
 
 
-def interpret_suricata_states(state) -> Optional[str]:
+def check_suricata_states(state) -> Optional[str]:
     """
     There are different states in which a flow can be.
     Suricata distinguishes three flow-states for TCP and two for
@@ -16,7 +18,7 @@ def interpret_suricata_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def interpret_zeek_states(state) -> Optional[str]:
+def check_zeek_states(state) -> Optional[str]:
     # We have varius type of states depending on the type of flow.
     # For Zeek
     if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
@@ -25,13 +27,9 @@ def interpret_zeek_states(state) -> Optional[str]:
         return "Established"
 
 
-def interpret_argus_states(state) -> Optional[str]:
+def check_argus_states(state) -> Optional[str]:
     pre = state.split("_")[0]
-    try:
-        suf = state.split("_")[1]
-    except IndexError:
-        return
-
+    suf = state.split("_")[1]
     if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
         """
         Examples:
@@ -88,7 +86,7 @@ def interpret_argus_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def interpret_tcp_states(state, pkts) -> Optional[str]:
+def check_tcp_states(state, pkts) -> Optional[str]:
     pre = state.split("_")[0]
     if "EST" in pre:
         # TCP
@@ -124,7 +122,7 @@ def interpret_tcp_states(state, pkts) -> Optional[str]:
         return "Not Established"
 
 
-def interpret_udp_states(state) -> Optional[str]:
+def check_udp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "CON" in pre:
         # UDP
@@ -136,7 +134,7 @@ def interpret_udp_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def interpret_icmp_states(state) -> Optional[str]:
+def check_icmp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "ECO" in pre:
         # ICMP
@@ -146,25 +144,36 @@ def interpret_icmp_states(state) -> Optional[str]:
         return "Established"
 
 
-def get_final_state_from_flags(state, pkts) -> str:
+def get_final_state_from_flags(self, state, pkts) -> str:
     """
-    Converts the original flags from the flow, to a state that slips
-    understands
-    Works with Argus, suricata, and Bro flags
-    We receive the packets to distinguish some Reset connections
+    Analyze the flags given and return a summary of the state.
+    Should work with Argus and Bro flags
+    We receive the pakets to distinguish some Reset connections
     """
+    try:
+        if state := check_suricata_states(state):
+            return state
+        if state := check_zeek_states(state):
+            return state
+        if state := check_argus_states(state):
+            return state
+    except IndexError:
+        # suf does not exist, which means that this is some ICMP or
+        # no response was sent for UDP or TCP
+        if state := check_icmp_states(state):
+            return state
+        if state := check_udp_states(state):
+            return state
+        if state := check_tcp_states(state, pkts):
+            return state
 
-    for interpreter in (
-        interpret_suricata_states,
-        interpret_zeek_states,
-        interpret_argus_states,
-        interpret_icmp_states,
-        interpret_udp_states,
-    ):
-        if interpreted_state := interpreter(state):
-            return interpreted_state
-
-    if interpreted_state := interpret_tcp_states(state, pkts):
-        return interpreted_state
+        return "Not Established"
 
-    return "Not Established"
+    except Exception:
+        exception_line = sys.exc_info()[2].tb_lineno
+        self.print(
+            f"Error in get_final_state_from_flags() " f"line {exception_line}",
+            0,
+            1,
+        )
+        self.print(traceback.format_exc(), 0, 1)

From 000e8926166c4c4f4af17b8cf157bf2d37472950 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:48:16 +0300
Subject: [PATCH 286/455] state_handler: refactor get_final_state_from_flags()

---
 slips_files/common/state_handler.py | 67 +++++++++++++----------------
 1 file changed, 29 insertions(+), 38 deletions(-)

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
index b671a09a28..d0a05115bd 100644
--- a/slips_files/common/state_handler.py
+++ b/slips_files/common/state_handler.py
@@ -1,9 +1,7 @@
 from typing import Optional
-import sys
-import traceback
 
 
-def check_suricata_states(state) -> Optional[str]:
+def interpret_suricata_states(state) -> Optional[str]:
     """
     There are different states in which a flow can be.
     Suricata distinguishes three flow-states for TCP and two for
@@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_zeek_states(state) -> Optional[str]:
+def interpret_zeek_states(state) -> Optional[str]:
     # We have varius type of states depending on the type of flow.
     # For Zeek
     if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
@@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]:
         return "Established"
 
 
-def check_argus_states(state) -> Optional[str]:
+def interpret_argus_states(state) -> Optional[str]:
     pre = state.split("_")[0]
-    suf = state.split("_")[1]
+    try:
+        suf = state.split("_")[1]
+    except IndexError:
+        return
+
     if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
         """
         Examples:
@@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_tcp_states(state, pkts) -> Optional[str]:
+def interpret_tcp_states(state, pkts) -> Optional[str]:
     pre = state.split("_")[0]
     if "EST" in pre:
         # TCP
@@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]:
         return "Not Established"
 
 
-def check_udp_states(state) -> Optional[str]:
+def interpret_udp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "CON" in pre:
         # UDP
@@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_icmp_states(state) -> Optional[str]:
+def interpret_icmp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "ECO" in pre:
         # ICMP
@@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]:
         return "Established"
 
 
-def get_final_state_from_flags(self, state, pkts) -> str:
+def get_final_state_from_flags(state, pkts) -> str:
     """
-    Analyze the flags given and return a summary of the state.
-    Should work with Argus and Bro flags
-    We receive the pakets to distinguish some Reset connections
+    Converts the original flags from the flow, to a state that slips
+    understands
+    Works with Argus, suricata, and Bro flags
+    We receive the packets to distinguish some Reset connections
     """
-    try:
-        if state := check_suricata_states(state):
-            return state
-        if state := check_zeek_states(state):
-            return state
-        if state := check_argus_states(state):
-            return state
-    except IndexError:
-        # suf does not exist, which means that this is some ICMP or
-        # no response was sent for UDP or TCP
-        if state := check_icmp_states(state):
-            return state
-        if state := check_udp_states(state):
-            return state
-        if state := check_tcp_states(state, pkts):
-            return state
 
-        return "Not Established"
+    for interpreter in (
+        interpret_suricata_states,
+        interpret_zeek_states,
+        interpret_argus_states,
+        interpret_icmp_states,
+        interpret_udp_states,
+    ):
+        if interpreted_state := interpreter(state):
+            return interpreted_state
+
+    if interpreted_state := interpret_tcp_states(state, pkts):
+        return interpreted_state
 
-    except Exception:
-        exception_line = sys.exc_info()[2].tb_lineno
-        self.print(
-            f"Error in get_final_state_from_flags() " f"line {exception_line}",
-            0,
-            1,
-        )
-        self.print(traceback.format_exc(), 0, 1)
+    return "Not Established"

From 0955f66abeb7f5e0f97459abc63d276730ab6868 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 287/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 169 +++------------------
 1 file changed, 19 insertions(+), 150 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 124ec61f91..c57a7a3581 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -5,9 +5,13 @@
 import json
 import datetime
 import traceback
-import sys
+import warnings
+
 
-from slips_files.common.imports import *
+from slips_files.common.state_handler import get_final_state_from_flags
+from slips_files.common.parsers.config_parser import ConfigParser
+from slips_files.common.slips_utils import utils
+from slips_files.common.abstracts.module import IModule
 from slips_files.core.evidence_structure.evidence import (
     Evidence,
     ProfileID,
@@ -29,8 +33,6 @@ def warn(*args, **kwargs):
     pass
 
 
-import warnings
-
 warnings.warn = warn
 
 
@@ -113,141 +115,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """
@@ -281,12 +148,17 @@ def process_features(self, dataset):
                 except ValueError:
                     pass
 
-            # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
+            # When flows are read from Slips sqlite,
+            # the state is not transformed to 'Established' or
+            # 'Not Established', it is still 'S0' and others
             # So transform here
-            #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
-            dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
-
-            #dataset.state = new_state_column
+            dataset["state"] = dataset.apply(
+                lambda row: get_final_state_from_flags(
+                    row["state"], row["pkts"]
+                ),
+                axis=1,
+            )
+            # dataset.state = new_state_column
 
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(
@@ -370,7 +242,7 @@ def process_flows(self):
         try:
             # We get all the flows so far
             # because this retraining happens in batches
-            flows = self.db.get_all_flows()
+            flows: list = self.db.get_all_flows()
 
             # Check how many different labels are in the DB
             # We need both normal and malware
@@ -464,7 +336,7 @@ def detect(self):
         """
         try:
             # Store the real label if there is one
-            y_flow = self.flow["label"]
+            # y_flow = self.flow["label"]
             # remove the real label column
             self.flow = self.flow.drop("label", axis=1)
             # remove the label predictions column of the other modules
@@ -568,13 +440,10 @@ def pre_main(self):
     def main(self):
         if msg := self.get_msg("new_flow"):
             data = msg["data"]
-            # Convert from json to dict
             data = json.loads(data)
-            profileid = data["profileid"]
+            # profileid = data["profileid"]
             twid = data["twid"]
-            # Get flow that is now in json format
             flow = data["flow"]
-            # Convert flow to a dict
             flow = json.loads(flow)
             # Convert the common fields to something that can
             # be interpreted

From 088d9270622d332b34eb39fe23d1e540257188b6 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:36:55 +0200
Subject: [PATCH 288/455] mlflow. Ignore UID column

---
 modules/flowmldetection/flowmldetection.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index c57a7a3581..e2aa1e0ee3 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -141,6 +141,7 @@ def process_features(self, dataset):
                 "dpkts",
                 "smac",
                 "dmac",
+                "uid",
             ]
             for field in to_drop:
                 try:

From 51f5f2f76934d8add93b8ec09190317d421cdc93 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Sat, 15 Mar 2025 19:23:29 +0100
Subject: [PATCH 289/455] Re-add the function that alya added

---
 slips_files/core/database/database_manager.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py
index 3a7f783ea7..0b805976df 100644
--- a/slips_files/core/database/database_manager.py
+++ b/slips_files/core/database/database_manager.py
@@ -613,6 +613,9 @@ def add_out_dns(self, *args, **kwargs):
     def add_port(self, *args, **kwargs):
         return self.rdb.add_port(*args, **kwargs)
 
+    def get_final_state_from_flags(self, *args, **kwargs):
+        return self.rdb.get_final_state_from_flags(*args, **kwargs)
+
     def add_ips(self, *args, **kwargs):
         return self.rdb.add_ips(*args, **kwargs)
 

From 38c5d55481cc57d81ccba540ffbb2d4811c39e6d Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Sat, 15 Mar 2025 19:27:23 +0100
Subject: [PATCH 290/455] Delete file that was deleted from develop

---
 slips_files/common/state_handler.py | 170 ----------------------------
 1 file changed, 170 deletions(-)
 delete mode 100644 slips_files/common/state_handler.py

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
deleted file mode 100644
index d0a05115bd..0000000000
--- a/slips_files/common/state_handler.py
+++ /dev/null
@@ -1,170 +0,0 @@
-from typing import Optional
-
-
-def interpret_suricata_states(state) -> Optional[str]:
-    """
-    There are different states in which a flow can be.
-    Suricata distinguishes three flow-states for TCP and two for
-     UDP. For TCP,
-    these are: New, Established and Closed,for UDP only new and
-    established.
-    For each of these states Suricata can employ different timeouts.
-    """
-    if "new" in state or "established" in state:
-        return "Established"
-    elif "closed" in state:
-        return "Not Established"
-
-
-def interpret_zeek_states(state) -> Optional[str]:
-    # We have varius type of states depending on the type of flow.
-    # For Zeek
-    if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-        return "Not Established"
-    elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-        return "Established"
-
-
-def interpret_argus_states(state) -> Optional[str]:
-    pre = state.split("_")[0]
-    try:
-        suf = state.split("_")[1]
-    except IndexError:
-        return
-
-    if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-        """
-        Examples:
-        SA_SA
-        SR_SA
-        FSRA_SA
-        SPA_SPA
-        SRA_SPA
-        FSA_FSA
-        FSA_FSPA
-        SAEC_SPA
-        SRPA_SPA
-        FSPA_SPA
-        FSRPA_SPA
-        FSPA_FSPA
-        FSRA_FSPA
-        SRAEC_SPA
-        FSPA_FSRPA
-        FSAEC_FSPA
-        FSRPA_FSPA
-        SRPAEC_SPA
-        FSPAEC_FSPA
-        SRPAEC_FSRPA
-        """
-        return "Established"
-    elif "PA" in pre and "PA" in suf:
-        # Tipical flow that was reported in the middle
-        """
-        Examples:
-        PA_PA
-        FPA_FPA
-        """
-        return "Established"
-    elif "ECO" in pre:
-        return "ICMP Echo"
-    elif "ECR" in pre:
-        return "ICMP Reply"
-    elif "URH" in pre:
-        return "ICMP Host Unreachable"
-    elif "URP" in pre:
-        return "ICMP Port Unreachable"
-    else:
-        """
-        Examples:
-        S_RA
-        S_R
-        A_R
-        S_SA
-        SR_SA
-        FA_FA
-        SR_RA
-        SEC_RA
-        """
-        return "Not Established"
-
-
-def interpret_tcp_states(state, pkts) -> Optional[str]:
-    pre = state.split("_")[0]
-    if "EST" in pre:
-        # TCP
-        return "Established"
-    elif "RST" in pre:
-        # TCP. When -z B is not used in argus, states are single words.
-        # Most connections are reseted when finished and therefore are
-        # established
-        # It can happen that is reseted being not established, but we
-        # can't tell without -z b.
-        # So we use as heuristic the amount of packets. If <=3, then is
-        # not established because the OS retries 3 times.
-        return "Not Established" if int(pkts) <= 3 else "Established"
-    elif "FIN" in pre:
-        # TCP. When -z B is not used in argus, states are single words.
-        # Most connections are finished with FIN when finished and
-        # therefore are established
-        # It can happen that is finished being not established, but we
-        # can't tell without -z b.
-        # So we use as heuristic the amount of packets. If <=3, then is
-        # not established because the OS retries 3 times.
-        return "Not Established" if int(pkts) <= 3 else "Established"
-    else:
-        """
-        Examples:
-        S_
-        FA_
-        PA_
-        FSA_
-        SEC_
-        SRPA_
-        """
-        return "Not Established"
-
-
-def interpret_udp_states(state) -> Optional[str]:
-    pre = state.split("_")[0]
-    if "CON" in pre:
-        # UDP
-        return "Established"
-    elif "INT" in pre:
-        # UDP trying to connect, NOT preciselly not established but also
-        # NOT 'Established'. So we considered not established because there
-        # is no confirmation of what happened.
-        return "Not Established"
-
-
-def interpret_icmp_states(state) -> Optional[str]:
-    pre = state.split("_")[0]
-    if "ECO" in pre:
-        # ICMP
-        return "Established"
-    elif "UNK" in pre:
-        # ICMP6 unknown upper layer
-        return "Established"
-
-
-def get_final_state_from_flags(state, pkts) -> str:
-    """
-    Converts the original flags from the flow, to a state that slips
-    understands
-    Works with Argus, suricata, and Bro flags
-    We receive the packets to distinguish some Reset connections
-    """
-
-    for interpreter in (
-        interpret_suricata_states,
-        interpret_zeek_states,
-        interpret_argus_states,
-        interpret_icmp_states,
-        interpret_udp_states,
-    ):
-        if interpreted_state := interpreter(state):
-            return interpreted_state
-
-    if interpreted_state := interpret_tcp_states(state, pkts):
-        return interpreted_state
-
-    return "Not Established"

From c15b430c419997b224a9ef1b4d5a8cd99195d0b8 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Sat, 15 Mar 2025 19:32:01 +0100
Subject: [PATCH 291/455] Flowmldetection. Fix missing db reference

---
 modules/flowmldetection/flowmldetection.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index e2aa1e0ee3..9269b67012 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -154,7 +154,7 @@ def process_features(self, dataset):
             # 'Not Established', it is still 'S0' and others
             # So transform here
             dataset["state"] = dataset.apply(
-                lambda row: get_final_state_from_flags(
+                lambda row: self.db.get_final_state_from_flags(
                     row["state"], row["pkts"]
                 ),
                 axis=1,

From dc2ced3b23a3dac2e11b8d71a3d3bb236d7a7703 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Tue, 18 Mar 2025 12:08:08 +0100
Subject: [PATCH 292/455] Fix the training of flows with ML in new version

---
 modules/flowmldetection/flowmldetection.py | 378 +++++++++++----------
 1 file changed, 197 insertions(+), 181 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 9269b67012..e6ea0b5171 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -1,18 +1,20 @@
+# SPDX-FileCopyrightText: 2021 Sebastian Garcia <sebastian.garcia@agents.fel.cvut.cz>
+from typing import Optional
+
+# SPDX-License-Identifier: GPL-2.0-only
+import numpy
 from sklearn.linear_model import SGDClassifier
 from sklearn.preprocessing import StandardScaler
 import pickle
 import pandas as pd
 import json
-import datetime
 import traceback
 import warnings
 
-
-from slips_files.common.state_handler import get_final_state_from_flags
 from slips_files.common.parsers.config_parser import ConfigParser
 from slips_files.common.slips_utils import utils
 from slips_files.common.abstracts.module import IModule
-from slips_files.core.evidence_structure.evidence import (
+from slips_files.core.structures.evidence import (
     Evidence,
     ProfileID,
     TimeWindow,
@@ -21,7 +23,8 @@
     EvidenceType,
     IoCType,
     Direction,
-    IDEACategory,
+    Victim,
+    Method,
 )
 
 # Only for debbuging
@@ -52,36 +55,41 @@ def init(self):
         # Set the output queue of our database instance
         # Read the configuration
         self.read_configuration()
-        # Minum amount of new lables needed to trigger the train
-        self.minimum_lables_to_retrain = 50
+        # Minum amount of new labels needed to start the train
+        self.minimum_labels_to_start_train = 50
+        # Minum amount of new labels needed to retrain
+        self.minimum_labels_to_retrain = 50
+        # The number of flows when last trained
+        self.last_number_of_flows_when_trained = 0
         # To plot the scores of training
         # self.scores = []
         # The scaler trained during training and to use during testing
         self.scaler = StandardScaler()
+        self.model_path = "./modules/flowmldetection/model.bin"
+        self.scaler_path = "./modules/flowmldetection/scaler.bin"
 
     def read_configuration(self):
         conf = ConfigParser()
         self.mode = conf.get_ml_mode()
+        self.label = conf.label()
 
     def train(self):
         """
         Train a model based on the flows we receive and the labels
         """
         try:
-            # Process the labels to have only Normal and Malware
-            self.flows.label = self.flows.label.str.replace(
-                r"(^.*ormal.*$)", "Normal", regex=True
-            )
-            self.flows.label = self.flows.label.str.replace(
-                r"(^.*alware.*$)", "Malware", regex=True
-            )
-            self.flows.label = self.flows.label.str.replace(
-                r"(^.*alicious.*$)", "Malware", regex=True
-            )
+            # Get the flows from the DB
+            # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid)
+            # Convert to pandas df
+            # self.flows = pd.DataFrame(self.flows)
+            # Process the features
+            # X_flow = self.process_features(self.flows)
 
-            # Separate
-            y_flow = self.flows["label"]
+            # Create X_flow with the current flows minus the label
             X_flow = self.flows.drop("label", axis=1)
+            # Create y_flow with the label
+            y_flow = numpy.full(X_flow.shape[0], self.label)
+            # Drop the module_labels
             X_flow = X_flow.drop("module_labels", axis=1)
 
             # Normalize this batch of data so far. This can get progressivle slow
@@ -90,7 +98,7 @@ def train(self):
             # Train
             try:
                 self.clf.partial_fit(
-                    X_flow, y_flow, classes=["Malware", "Normal"]
+                    X_flow, y_flow, classes=["Malicious", "Benign"]
                 )
             except Exception:
                 self.print("Error while calling clf.train()")
@@ -113,7 +121,7 @@ def train(self):
             self.store_model()
 
         except Exception:
-            self.print("Error in train()", 0, 1)
+            self.print("Error in train().", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
@@ -123,7 +131,7 @@ def process_features(self, dataset):
         """
         try:
             # Discard some type of flows that dont have ports
-            to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp"]
+            to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""]
             for proto in to_discard:
                 dataset = dataset[dataset.proto != proto]
 
@@ -132,21 +140,20 @@ def process_features(self, dataset):
                 "appproto",
                 "daddr",
                 "saddr",
-                "ts",
-                "origstate",
+                "starttime",
                 "type_",
-                "dir_",
-                "history",
-                "dbytes",
-                "dpkts",
                 "smac",
                 "dmac",
+                "history",
                 "uid",
+                "dir_",
+                "endtime",
+                "flow_source",
             ]
             for field in to_drop:
                 try:
                     dataset = dataset.drop(field, axis=1)
-                except ValueError:
+                except (ValueError, KeyError):
                     pass
 
             # When flows are read from Slips sqlite,
@@ -155,11 +162,10 @@ def process_features(self, dataset):
             # So transform here
             dataset["state"] = dataset.apply(
                 lambda row: self.db.get_final_state_from_flags(
-                    row["state"], row["pkts"]
+                    row["state"], (row["spkts"] + row["dpkts"])
                 ),
                 axis=1,
             )
-            # dataset.state = new_state_column
 
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(
@@ -193,58 +199,42 @@ def process_features(self, dataset):
             dataset.proto = dataset.proto.str.replace(
                 r"(^.*arp.*$)", "4", regex=True
             )
-            dataset.proto = dataset.proto.astype("float64")
-            try:
-                # Convert dport to float
-                dataset.dport = dataset.dport.astype("float")
-            except ValueError:
-                pass
-            try:
-                # Convert sport to float
-                dataset.sport = dataset.sport.astype("float")
-            except ValueError:
-                pass
-            try:
-                # Convert Dur to float
-                dataset.dur = dataset.dur.astype("float")
-            except ValueError:
-                pass
-            try:
-                # Convert TotPkts to float
-                dataset.pkts = dataset.pkts.astype("float")
-            except ValueError:
-                pass
-            try:
-                # Convert SrcPkts to float
-                dataset.spkts = dataset.spkts.astype("float")
-            except ValueError:
-                pass
-            try:
-                # Convert TotBytes to float
-                dataset.allbytes = dataset.allbytes.astype("float")
-            except ValueError:
-                pass
-            try:
-                # Convert SrcBytes to float
-                dataset.sbytes = dataset.sbytes.astype("float")
-            except ValueError:
-                pass
+
+            dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"]
+            dataset["pkts"] = dataset["spkts"] + dataset["dpkts"]
+
+            fields_to_convert_to_float = [
+                dataset.proto,
+                dataset.dport,
+                dataset.sport,
+                dataset.dur,
+                dataset.pkts,
+                dataset.spkts,
+                dataset.allbytes,
+                dataset.sbytes,
+                dataset.state,
+            ]
+            for field in fields_to_convert_to_float:
+                try:
+                    field = field.astype("float64")
+                except (ValueError, AttributeError):
+                    pass
+
             return dataset
         except Exception:
             # Stop the timer
             self.print("Error in process_features()")
             self.print(traceback.format_exc(), 0, 1)
 
-    def process_flows(self):
+    def process_training_flows(self):
         """
-        Process all the flwos in the DB
+        Process all the flows in the DB
         Store the pandas df in self.flows
         """
         try:
             # We get all the flows so far
             # because this retraining happens in batches
-            flows: list = self.db.get_all_flows()
-
+            flows = self.db.get_all_flows()
             # Check how many different labels are in the DB
             # We need both normal and malware
             labels = self.db.get_labels()
@@ -254,48 +244,48 @@ def process_flows(self):
                 # that are fake but representative of a normal and malware flow
                 # they are only for the training process
                 # At least 1 flow of each label is required
-                # self.print(f'Amount of labeled flows: {labels}', 0, 1)
+
+                # These flows should be in the same format as the ones in the DB. 
+                # Which means the satate is still SF, S0, etc.
                 flows.append(
                     {
-                        "ts": 1594417039.029793,
+                        "starttime": 1594417039.029793,
                         "dur": "1.9424750804901123",
                         "saddr": "10.7.10.101",
                         "sport": "49733",
                         "daddr": "40.70.224.145",
                         "dport": "443",
                         "proto": "tcp",
-                        "origstate": "SRPA_SPA",
-                        "state": "Established",
-                        "pkts": 84,
-                        "allbytes": 42764,
-                        "spkts": 37,
+                        "state": "SF",
+                        "spkts": 17,
+                        "dpkts": 27,
                         "sbytes": 25517,
+                        "dbytes": 17247,
                         "appproto": "ssl",
-                        "label": "Malware",
+                        "label": "Malicious",
                         "module_labels": {
-                            "flowalerts-long-connection": "Malware"
+                            "flowalerts-long-connection": "Malicious"
                         },
                     }
                 )
                 flows.append(
                     {
-                        "ts": 1382355032.706468,
+                        "starttime": 1382355032.706468,
                         "dur": "10.896695",
                         "saddr": "147.32.83.52",
                         "sport": "47956",
                         "daddr": "80.242.138.72",
                         "dport": "80",
                         "proto": "tcp",
-                        "origstate": "SRPA_SPA",
-                        "state": "Established",
-                        "pkts": 67,
-                        "allbytes": 67696,
+                        "state": "SF",
                         "spkts": 1,
+                        "dpkts": 0,
                         "sbytes": 100,
+                        "dbytes": 67596,
                         "appproto": "http",
-                        "label": "Normal",
+                        "label": "Benign",
                         "module_labels": {
-                            "flowalerts-long-connection": "Normal"
+                            "flowalerts-long-connection": "Benign"
                         },
                     }
                 )
@@ -314,42 +304,51 @@ def process_flows(self):
             self.print("Error in process_flows()")
             self.print(traceback.format_exc(), 0, 1)
 
-    def process_flow(self):
+    def process_flow(self, flow_to_process: dict):
         """
         Process one flow. Only used during detection in testing
-        Store the pandas df in self.flow
+        returns the pandas df with the processed flow
         """
         try:
             # Convert the flow to a pandas dataframe
-            raw_flow = pd.DataFrame(self.flow_dict, index=[0])
-            # Process features
+            raw_flow = pd.DataFrame(flow_to_process, index=[0])
             dflow = self.process_features(raw_flow)
             # Update the flow to the processed version
-            self.flow = dflow
+            return dflow
         except Exception:
             # Stop the timer
             self.print("Error in process_flow()")
             self.print(traceback.format_exc(), 0, 1)
 
-    def detect(self):
+    def detect(self, x_flow) -> Optional[numpy.ndarray]:
         """
-        Detect this flow with the current model stored
+        Detects the given flow with the current model stored
+        and returns the predection array
         """
         try:
-            # Store the real label if there is one
-            # y_flow = self.flow["label"]
-            # remove the real label column
-            self.flow = self.flow.drop("label", axis=1)
-            # remove the label predictions column of the other modules
-            X_flow = self.flow.drop("module_labels", axis=1)
+            # clean the flow
+            fields_to_drop = [
+                "label",
+                "module_labels",
+                "uid",
+                "history",
+                "dir_",
+                "endtime",
+                "flow_source",
+            ]
+            for field in fields_to_drop:
+                try:
+                    x_flow = x_flow.drop(field, axis=1)
+                except (KeyError, ValueError):
+                    pass
             # Scale the flow
-            X_flow = self.scaler.transform(X_flow)
-            pred = self.clf.predict(X_flow)
+            x_flow: numpy.ndarray = self.scaler.transform(x_flow)
+            pred: numpy.ndarray = self.clf.predict(x_flow)
             return pred
-        except Exception:
-            # Stop the timer
-            self.print("Error in detect() X_flow:")
-            self.print(X_flow)
+        except Exception as e:
+            self.print(
+                f"Error in detect() while processing " f"\n{x_flow}\n{e}"
+            )
             self.print(traceback.format_exc(), 0, 1)
 
     def store_model(self):
@@ -357,10 +356,10 @@ def store_model(self):
         Store the trained model on disk
         """
         self.print("Storing the trained model and scaler on disk.", 0, 2)
-        with open("./modules/flowmldetection/model.bin", "wb") as f:
+        with open(self.model_path, "wb") as f:
             data = pickle.dumps(self.clf)
             f.write(data)
-        with open("./modules/flowmldetection/scaler.bin", "wb") as g:
+        with open(self.scaler_path, "wb") as g:
             data = pickle.dumps(self.scaler)
             g.write(data)
 
@@ -370,20 +369,23 @@ def read_model(self):
         """
         try:
             self.print("Reading the trained model from disk.", 0, 2)
-            with open("./modules/flowmldetection/model.bin", "rb") as f:
+            with open(self.model_path, "rb") as f:
                 self.clf = pickle.load(f)
             self.print("Reading the trained scaler from disk.", 0, 2)
-            with open("./modules/flowmldetection/scaler.bin", "rb") as g:
+            with open(self.scaler_path, "rb") as g:
                 self.scaler = pickle.load(g)
         except FileNotFoundError:
             # If there is no model, create one empty
-            self.print("There was no model. Creating a new empty model.", 0, 2)
+            self.print(
+                "There was no model. " "Creating a new empty model.", 0, 2
+            )
             self.clf = SGDClassifier(
                 warm_start=True, loss="hinge", penalty="l1"
             )
         except EOFError:
             self.print(
-                "Error reading model from disk. Creating a new empty model.",
+                "Error reading model from disk. "
+                "Creating a new empty model.",
                 0,
                 2,
             )
@@ -391,39 +393,36 @@ def read_model(self):
                 warm_start=True, loss="hinge", penalty="l1"
             )
 
-    def set_evidence_malicious_flow(
-        self,
-        saddr: str,
-        sport: str,
-        daddr: str,
-        dport: str,
-        twid: str,
-        uid: str,
-    ):
+    def set_evidence_malicious_flow(self, flow: dict, twid: str):
         confidence: float = 0.1
-        ip_identification = self.db.get_ip_identification(daddr)
         description = (
-            f"Malicious flow by ML. Src IP {saddr}:{sport} to "
-            f"{daddr}:{dport} {ip_identification}"
-        )
-
-        timestamp = utils.convert_format(
-            datetime.datetime.now(), utils.alerts_format
+            f"Flow with malicious characteristics by ML. Src IP"
+            f" {flow['saddr']}:{flow['sport']} to "
+            f"{flow['daddr']}:{flow['dport']}"
         )
-
+        twid_number = int(twid.replace("timewindow", ""))
         evidence: Evidence = Evidence(
             evidence_type=EvidenceType.MALICIOUS_FLOW,
             attacker=Attacker(
-                direction=Direction.SRC, attacker_type=IoCType.IP, value=saddr
+                direction=Direction.SRC,
+                ioc_type=IoCType.IP,
+                value=flow["saddr"],
+            ),
+            victim=Victim(
+                direction=Direction.DST,
+                ioc_type=IoCType.IP,
+                value=flow["daddr"],
             ),
             threat_level=ThreatLevel.LOW,
             confidence=confidence,
             description=description,
-            profile=ProfileID(ip=saddr),
-            timewindow=TimeWindow(number=int(twid.replace("timewindow", ""))),
-            uid=[uid],
-            timestamp=timestamp,
-            category=IDEACategory.ANOMALY_TRAFFIC,
+            profile=ProfileID(ip=flow["saddr"]),
+            timewindow=TimeWindow(twid_number),
+            uid=[flow["uid"]],
+            timestamp=flow["starttime"],
+            method=Method.AI,
+            src_port=flow["sport"],
+            dst_port=flow["dport"],
         )
 
         self.db.set_evidence(evidence)
@@ -440,17 +439,20 @@ def pre_main(self):
 
     def main(self):
         if msg := self.get_msg("new_flow"):
-            data = msg["data"]
-            data = json.loads(data)
-            # profileid = data["profileid"]
-            twid = data["twid"]
-            flow = data["flow"]
-            flow = json.loads(flow)
-            # Convert the common fields to something that can
-            # be interpreted
-            # Get the uid which is the key
-            uid = next(iter(flow))
-            self.flow_dict = json.loads(flow[uid])
+            # When a new flow arrives
+            msg = json.loads(msg["data"])
+            self.twid = msg["twid"]
+            self.profileid = msg["profileid"]
+            self.flow = msg["flow"]
+            # These following extra fields are expected in testing. update the original
+            # flow dict to have them
+            self.flow.update(
+                {
+                    "state": msg["interpreted_state"],
+                    "label": msg["label"],
+                    "module_labels": msg["module_labels"],
+                }
+            )
 
             if self.mode == "train":
                 # We are training
@@ -459,55 +461,69 @@ def main(self):
                 # Use labeled flows
                 labels = self.db.get_labels()
                 sum_labeled_flows = sum(i[1] for i in labels)
+
+                # The min labels to retrain is the min number of flows 
+                # we should have seen so far in this capture to start training
+                # This is so we dont _start_ training with only 1 flow
+
+                # Once we are over the start minimum, the second condition is 
+                # to force to retrain every a minimum_labels_to_retrain number
+                # of flows. So we dont retrain every 1 flow.
                 if (
-                    sum_labeled_flows >= self.minimum_lables_to_retrain
-                    and sum_labeled_flows % self.minimum_lables_to_retrain == 1
+                    sum_labeled_flows >= self.minimum_labels_to_start_train
                 ):
-                    # We get here every 'self.minimum_lables_to_retrain' amount of labels
-                    # So for example we retrain every 100 labels and only when we have at least 100 labels
-                    self.print(
-                        f"Training the model with the last group of flows and labels. Total flows: {sum_labeled_flows}."
-                    )
-                    # Process all flows in the DB and make them ready for pandas
-                    self.process_flows()
-                    # Train an algorithm
-                    self.train()
+                    if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain):
+                        # So for example we retrain every 50 labels and only when
+                        # we have at least 50 labels
+                        self.print(
+                            f"Training the model with the last group of "
+                            f"flows and labels. Total flows: {sum_labeled_flows}."
+                        )
+                        # Process all flows in the DB and make them ready
+                        # for pandas
+                        self.process_training_flows()
+                        # Train an algorithm
+                        self.train()
+                        self.last_number_of_flows_when_trained = sum_labeled_flows
+
             elif self.mode == "test":
                 # We are testing, which means using the model to detect
-                self.process_flow()
+                processed_flow = self.process_flow(self.flow)
 
-                # After processing the flow, it may happen that we delete icmp/arp/etc
-                # so the dataframe can be empty
-                if self.flow is not None and not self.flow.empty:
+                # After processing the flow, it may happen that we
+                # delete icmp/arp/etc so the dataframe can be empty
+                if processed_flow is not None and not processed_flow.empty:
                     # Predict
-                    pred = self.detect()
-                    label = self.flow_dict["label"]
+                    pred: numpy.ndarray = self.detect(processed_flow)
+                    if not pred:
+                        # an error occurred
+                        return
 
-                    # Report
+                    label = self.flow["label"]
                     if label and label != "unknown" and label != pred[0]:
-                        # If the user specified a label in test mode, and the label
-                        # is diff from the prediction, print in debug mode
+                        # If the user specified a label in test mode,
+                        # and the label is diff from the prediction,
+                        # print in debug mode
                         self.print(
-                            f'Report Prediction {pred[0]} for label {label} flow {self.flow_dict["saddr"]}:'
-                            f'{self.flow_dict["sport"]} -> {self.flow_dict["daddr"]}:'
-                            f'{self.flow_dict["dport"]}/{self.flow_dict["proto"]}',
+                            f"Predicted {pred[0]} for ground-truth label"
+                            f' {label}. Flow {self.flow["saddr"]}:'
+                            f'{self.flow["sport"]} ->'
+                            f' {self.flow["daddr"]}:'
+                            f'{self.flow["dport"]}/'
+                            f'{self.flow["proto"]}',
                             0,
                             3,
                         )
-                    if pred[0] == "Malware":
+                    if pred[0] == "Malicious":
                         # Generate an alert
-                        self.set_evidence_malicious_flow(
-                            self.flow_dict["saddr"],
-                            self.flow_dict["sport"],
-                            self.flow_dict["daddr"],
-                            self.flow_dict["dport"],
-                            twid,
-                            uid,
-                        )
+                        self.set_evidence_malicious_flow(self.flow, self.twid)
                         self.print(
-                            f'Prediction {pred[0]} for label {label} flow {self.flow_dict["saddr"]}:'
-                            f'{self.flow_dict["sport"]} -> {self.flow_dict["daddr"]}:'
-                            f'{self.flow_dict["dport"]}/{self.flow_dict["proto"]}',
+                            f"Prediction {pred[0]} for label {label}"
+                            f' flow {self.flow["saddr"]}:'
+                            f'{self.flow["sport"]} -> '
+                            f'{self.flow["daddr"]}:'
+                            f'{self.flow["dport"]}/'
+                            f'{self.flow["proto"]}',
                             0,
                             2,
                         )

From 76ae27f6a3389245e3fd6365f6176415ae1d7b61 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 19 Mar 2025 14:22:38 +0100
Subject: [PATCH 293/455] flowml. If the dataset has one flow and that is
 deleted, then return empty fast.

---
 modules/flowmldetection/flowmldetection.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index e6ea0b5171..0fa1e4d767 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -135,6 +135,11 @@ def process_features(self, dataset):
             for proto in to_discard:
                 dataset = dataset[dataset.proto != proto]
 
+            # If te proto is in the list to delete and there is only one flow, then the dataset will be empty
+            if dataset.empty:
+                # DataFrame is empty now, so return empty
+                return dataset
+
             # For now, discard these
             to_drop = [
                 "appproto",

From e216d5bce7de6261f5b9f4cf99d5a6212d79338d Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 19 Mar 2025 14:23:05 +0100
Subject: [PATCH 294/455] flowml. If the dataset is empty, return None

---
 modules/flowmldetection/flowmldetection.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 0fa1e4d767..5c5f9943f1 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -318,6 +318,8 @@ def process_flow(self, flow_to_process: dict):
             # Convert the flow to a pandas dataframe
             raw_flow = pd.DataFrame(flow_to_process, index=[0])
             dflow = self.process_features(raw_flow)
+            if dflow.empty:
+                return None
             # Update the flow to the processed version
             return dflow
         except Exception:

From 90e2344f104ac3bc43ad17e6c18151b7939764e2 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 19 Mar 2025 14:27:16 +0100
Subject: [PATCH 295/455] First new version of the model and scaler. Not good
 yet, but working.

---
 modules/flowmldetection/model.bin  | Bin 1124 -> 1090 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index a6648cf72179520975b0e9ad1164f7d574e87140..7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f 100644
GIT binary patch
delta 130
zcmV-|0Db@D2*L;ifCQB{u>>gtTsw(FScJ$7xocP<K++%n9U}QQz-#d@sqf+^z>4|5
zn$$WZz^$}67u&zYKfDP4vYKo~K<PaAW)J{HKp5zr*LvX7zXT;Xi&zvCK#B|cdeNaM
kK-{i#qAdnGK$EKjD+pMWFI7TSSU{8B162Zxn3FXGFXLA@Pyhe`

delta 131
zcmV-}0DS+#2;>L^fCQCUu>>gtlhXnvSy-jcOc_je6TteGguV+y7r>O2=ii6N`@aE8
z4q<hm<v;E**(1biC&1;CKfROVA3%9ur;SK<>_05T(v)g91VHfmFeIMvRKFpJJ~89v
lBES;fQwQSX>_3x<11kt29Z*;7CnUg=yaQDN?(vfo1TSe6JFWl#

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
index 17115724b9536f6093f9d72f3b58a5c22c562a9a..bfba4d107224e5e6e5a1e8c8f4d463b48131d111 100644
GIT binary patch
delta 290
zcmV+-0p0%k2KolDvjGBX0h6@>Pgl=^Tj5WVAwXGOhP~3C<3D)c8d<%4*gy{6w?@|U
z%s@r+gj&Fu5J2_kY{!Qn^*`?T5Tx7$dq73=gj&E@c0d))6KVlt!ao`IN`KC-h(JOu
zU`*oR6_Z;6CRfo>BRz3ectB?@J_D&+&OhD+nN>A{w?PryCGc<cwm~#B-^|8$dO)gh
z=RSTUXFv*d?raebbV0J?)%52KY(b1c9N@Tpzdv9l8J2bKf<dkRFcK&rl9Sj1C|3#7
zOux9hGeC5AQV0+$-#-%10%-4K(?Bp(=SPMr(?B|Mf3GT(G(g2@E?by3D?owf0UW{U
ou|UJ?4^Yqbtw7cLsJ&lu)<1f`V88dAxIjbzY_OOIL6cVlBeWBRwg3PC

delta 290
zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG
zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa
zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r-wJEWTtnm`Ac
zf^|*Gc0elJHxDoXo<X&kLG{%4&p`{bHVfI|sz2Uv$hvzr(m{gu&Jr}1sgu|OC|8h9
z&D_B~i9nx5TWFn3t3OFIt}1{vt3Z{LgvGX?u|R!8|2|A?L_o(e@1J}aGeCBvuU^0C
o#Xz85;5X{~-$0+hirZ#X%|CGFfPfMb;Xu)C9icy+Op{jwBjCh|6aWAK


From 0caa44da850bfb4c3dc83575f7e287b316ff565d Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Thu, 20 Mar 2025 13:16:06 +0100
Subject: [PATCH 296/455] model and scaler with 1 malicious and 1 benign

---
 modules/flowmldetection/model.bin  | Bin 1090 -> 1124 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index 7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f..0fac693b39f8e2f0e826471e72a52010709a2a4a 100644
GIT binary patch
delta 132
zcmX@a@q~k=fn{psMix!x$(NZ_BO`RCb>5jSY<wU!LGn&}gUNx$3tLYnGl@A&e4S}J
zBSY*!uI-OaTr&&~Tw#uuJJhD?&}O=|yn5R|`<q--?Rc;A9e69=F=gK7d-lipbZtG?
j>K$N!g2|oC+8oRs(l<p`Iv<$4hdGq_%F%|&A}o3UHQY43

delta 131
zcmV-}0DS-C2*L;ifCQB{u>>gtTsw(FScJ$7xocP<K++%n9U}QQz-#d@sqf+^z>4|5
zn$$WZz^$}67u&zYKfDP4vYKo~K<PaAW)J{HKp5zr*LvX7zXT;Xi&zvCK#B|cdeNaM
lK-{i#qAdnGK$EKjD+pMWFI7TSSU{8B162cym;jSD1TGrII8guq

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
index bfba4d107224e5e6e5a1e8c8f4d463b48131d111..758909b289238ff282b2e056a9b3e83768b8472a 100644
GIT binary patch
delta 290
zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG
zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa
zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r!tJEWTtnm`Ac
zf^|*Gc0elJHxDoXo<X#jLG{%4&p`{bHVfI|sz2Ot$hvzr(m{gu&Jr}1sgu|OC|8h9
z&D_B~i9nx5TWFn3t3OFIt}1{vt3Z{LgvGX?u|Rx7|2|A?L_o(e@1J}aGeCBvuU^0C
o#Xz54;5X{~-$0+hirZ#X%|CDEfPfMb;Xu)C9icy+Op{jwBi%8H3jhEB

delta 290
zcmV+-0p0%k2KolDvjGBX0h6@>Pgl=^Tj5WVAwXGOhP~3C<3D)c8d<%4*gy{6w?@|U
z%s@r+gj&Fu5J2_kY{!Qn^*`?T5Tx7$dq73=gj&E@c0d))6KVlt!ao`IN`KC-h(JOu
zU`*oR6_Z;6CRfo>BRz3ectB?@J_D&+&OhD+nN>A{w?PryCGc<cwm~#B-^|8$dO)gh
z=RSTUXFv*d?raebbV0J?)%52KY(b1c9N@Tpzdv9l8J2bKf<dkRFcK&rl9Sj1C|3#7
zOux9hGeC5AQV0+$-#-%10%-4K(?Bp(=SPMr(?B|Mf3GT(G(g2@E?by3D?owf0UW{U
ou|UJ?4^Yqbtw7cLsJ&lu)<1f`V88dAxIjbzY_OOIL6cVlBeWBRwg3PC


From ac2c493c5f73e51838e77862a06a8ed0d7c77fb7 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Thu, 20 Mar 2025 13:16:27 +0100
Subject: [PATCH 297/455] cleaner jupyter

---
 modules/flowmldetection/flowmldetection.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 5c5f9943f1..fe950ed4bb 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -343,6 +343,23 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
                 "endtime",
                 "flow_source",
             ]
+            # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes.
+            # Error
+            ''' [Flow ML Detection] Error in detect() while processing                                                                                                                                                                                                                         
+            dur proto  sport dport  state  pkts  spkts  dpkts  bytes  sbytes  dbytes  allbytes                                                                                                                                                                                    
+            0  63.822830     0  56119   981    0.0    15     15      0   8764    1887       0      1887                                                                                                                                                                                    
+            The feature names should match those that were passed during fit.                                                                                                                                                                                                              
+            Feature names unseen at fit time:                                                                                                                                                                                                                                              
+            - bytes     
+            '''
+
+            # IF we delete here the filed bytes the error is
+            # [Flow ML Detection] Error in detect() while processing 
+            # dur proto sport dport  state  pkts  spkts  dpkts  sbytes  dbytes allbytes                                                                                                                                                             
+            # 0  63.822830     0  56120   980    0.0    15     15      0    1887       0      1887                                                                                                                                                             
+            # The feature names should match those that were passed during fit.                                                                                                                                                                                
+            # Feature names must be in the same order as they were in fit.                                                                                                                                                                                     
+                                                                                                                                                                                                                                                 
             for field in fields_to_drop:
                 try:
                     x_flow = x_flow.drop(field, axis=1)

From b57b591133d2579418191ead001227c27d258432 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Thu, 20 Mar 2025 22:26:27 +0100
Subject: [PATCH 298/455] New models after 3rd train

---
 modules/flowmldetection/model.bin  | Bin 1124 -> 1124 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index 0fac693b39f8e2f0e826471e72a52010709a2a4a..5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8 100644
GIT binary patch
delta 99
zcmaFD@q}YTFtfkevYaFSxkd-_3;rCozh!)2lYmpEbEdGvA?<Hh%FhcQcr$C8Y3pPC
z18<ZS-x*q|IJlZU`}#BRi@kOC+S)TqI1eyD!9BgtJUbI~Cof{2$owSaz+^KPJplbX
BD~|vG

delta 99
zcmV-p0G$8i2;>N`Qv**dmWkY#6EnbNU>V$pfHc5?(6z~90v14#>uEKZXcoY2J@~Q;
zm@vT916v-#h9y9THMMtpw*Eia3Y9($*ABq#7l@Rev)(_(4lO-NtuB+I1CayN#ekDG
F1TN2UERg^J

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
index 758909b289238ff282b2e056a9b3e83768b8472a..821344a0c69d116622b02e2a0daa1554cb5d308e 100644
GIT binary patch
delta 43
zcmV+`0M!5b2KolDfdU!c4zU>a=p#W80?-ZtN@qVmt(3Z<B4<G|lbQl%0g;md19^W6
B5HkP(

delta 43
zcmV+`0M!5b2KolDfdU!a4zU>a=p#W80?-ZtN@qVmt(3Z<B4<G}lbQl%0g#gc19^VR
B5HSD%


From 8faa14d44606f809da49d8b5bcfcd65ba7b66724 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 26 Mar 2025 00:08:50 +0100
Subject: [PATCH 299/455] Models after 4th train

---
 modules/flowmldetection/model.bin  | Bin 1124 -> 1124 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index 5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8..3ab5a240bb45f88d026d1d9d1959cfa384e2473b 100644
GIT binary patch
delta 120
zcmV-;0EhqN2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv*
zM<l=vT(CiV7%o61L>A;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P
as{<<t5IC<!D(NIZlimYX0{noJHUuueR5UOE

delta 120
zcmaFD@q}YTFmr_2vYaFSxkd-_3;rCozh!)2lYmpEbEdGvA?<Hh%FhcQcr$C8Y3pPC
z18<ZS-x*q|IJlZU`}#BRi@kOC+S)TqI1eyD!9BgtJUbI~;rz*~nYB5XJEU)ltaLsw
P`5to^Q^<kIW-NLDlQb|V

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
index 821344a0c69d116622b02e2a0daa1554cb5d308e..29df65342047c5a499ee3f8e602d1f47cb7e9fca 100644
GIT binary patch
delta 290
zcmV+-0p0%k2KolDvjGB90h6@>PghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB
z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m
z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A
za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o
zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9
ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}Dxj<tAT~->ML6cVlBO2|D9RL6T

delta 290
zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG
zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa
zjm~^LK$BYnCRg4Lu^9H~BS8=X&<+7gXFor!l)9lJXF)RTPqWf%b3r!tJEWTtnm`Ac
zf^|*Gc0elJHxDoXo<X#jLG{%4&p`{bHVfI|sz2Ot$hvzr(m{gu&Jr}1sgu|OC|8kA
z&D_B~i9nx5TWFn3t3OFIt}1{vt3Z{LgvGX?u|Rx7|2|A?L_o(e@1J}aGeCBvuU^0C
o#Xz54;5X{~-$0+hirZ#X%|CDEfPfMb;Xu)C9icy+Op{jwBi>ty4FCWD


From 259169c206001f6495880b8fcc942fd7b87878e9 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 26 Mar 2025 08:28:59 +0100
Subject: [PATCH 300/455] Models of ml flow with the first good performance in
 small tests

---
 modules/flowmldetection/model.bin  | Bin 1124 -> 1124 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index 3ab5a240bb45f88d026d1d9d1959cfa384e2473b..a6648cf72179520975b0e9ad1164f7d574e87140 100644
GIT binary patch
delta 121
zcmV-<0EYkM2;>N`Qv+C~&P*9hb`!w*mV~|wLl?l5mFM4w$NRqlOAcXmpyfaAG1();
zYA3+ulRv$a;~zkIU#E>ocI-ba#L|>%Hv~ZN`Y<G(@KnDch(0mn<s!fm-ctwS<Lp0^
bs{<<tAstXx>?b6^limYW1McwvlQsk{8#y@u

delta 121
zcmV-<0EYkM2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv*
zM<l=vT(CiV7%o61L>A;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P
bs{<<t5IC<!D(NIZlimYW1LXXGlQsk{_eeDq

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
index 29df65342047c5a499ee3f8e602d1f47cb7e9fca..17115724b9536f6093f9d72f3b58a5c22c562a9a 100644
GIT binary patch
delta 290
zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG
zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa
zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r-wJEWTtnm`Ac
zf^|*Gc0elJHxDoXo<X&kLG{%4&p`{bHVfI|sz2Uv$hvzr(m{gu&Jr}1sgu|OC|8h9
z&D_B~i9nx5TWFn3t3OFIt}1{vt3Z{LgvGX?u|R!8|2|A?L_o(e@1J}aGeCBvuU^0C
o#Xz85;5X{~-$0+hirZ#X%|CGFfPfMb;Xu)C9icy+Op{jwBjCh|6aWAK

delta 290
zcmV+-0p0%k2KolDvjGB90h6@>PghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB
z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m
z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A
za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o
zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9
ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}Dxj<tAT~->ML6cVlBO2|D9RL6T


From 0789af56c5c7b8d00382002ef30f5b5d30e9a92f Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:49:23 +0000
Subject: [PATCH 301/455] Add plot for flowml train scores

---
 modules/flowmldetection/plot_train_score.py | 56 +++++++++++++++++++++
 1 file changed, 56 insertions(+)
 create mode 100644 modules/flowmldetection/plot_train_score.py

diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py
new file mode 100644
index 0000000000..0b5b5b72ba
--- /dev/null
+++ b/modules/flowmldetection/plot_train_score.py
@@ -0,0 +1,56 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+import re
+import sys
+
+def plot_log_data(file_path):
+    # Read the log data from the file
+    with open(file_path, 'r') as file:
+        log_data = file.read()
+
+    # Define regex pattern to extract relevant data from each line
+    pattern = r"Background: (\d+). Benign: (\d+). Malicious: (\d+). Total labels: (\d+\.\d+). Score: (\d+\.\d+)"
+
+    # Parse the log file
+    data = re.findall(pattern, log_data)
+
+    # Convert data to a DataFrame
+    df = pd.DataFrame(data, columns=["Background", "Benign", "Malicious", "Total labels", "Score"])
+    df = df.astype({
+        "Background": int,
+        "Benign": int,
+        "Malicious": int,
+        "Total labels": float,
+        "Score": float
+    })
+
+    # Plotting the values
+    fig, ax1 = plt.subplots(figsize=(10, 6))
+
+    # Plotting Score on the left y-axis
+    ax1.plot(df.index, df["Score"], label="Score", color='tab:blue')
+    ax1.set_xlabel('Index')
+    ax1.set_ylabel('Score', color='tab:blue')
+    ax1.tick_params(axis='y', labelcolor='tab:blue')
+
+    # Create the second y-axis for the Total labels
+    ax2 = ax1.twinx()
+    ax2.plot(df.index, df["Total labels"], label="Total labels", color='tab:red')
+    ax2.set_ylabel('Total labels', color='tab:red')
+    ax2.tick_params(axis='y', labelcolor='tab:red')
+
+    # Adding title and legend
+    plt.title('Log Data Visualization')
+    fig.tight_layout()
+
+    # Save plot to a PNG file
+    plt.savefig('log_data_plot_with_two_scales.png')
+
+    # Display the plot
+    plt.show()
+
+# Make sure the file path is passed as an argument
+if len(sys.argv) < 2:
+    print("Please provide the path to the log file as a parameter.")
+else:
+    plot_log_data(sys.argv[1])

From 6c4e7f16e84bc7d501031d7209fc3975087ef1c3 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:50:04 +0000
Subject: [PATCH 302/455] Add a log file to store the training data output

---
 modules/flowmldetection/flowmldetection.py | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index fe950ed4bb..60217ada28 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -68,12 +68,29 @@ def init(self):
         self.model_path = "./modules/flowmldetection/model.bin"
         self.scaler_path = "./modules/flowmldetection/scaler.bin"
 
+        # Initialize the training log file
+        self.training_log_path = "./modules/flowmldetection/training.log"
+        with open(self.training_log_path, "w") as log_file:
+            log_file.write("Training Log Initialized\n")
+
     def read_configuration(self):
         conf = ConfigParser()
         self.mode = conf.get_ml_mode()
+        # This is the global label in the configuration,
+        # in case the flows do not have a label themselves
         self.label = conf.label()
 
-    def train(self):
+    def write_to_training_log(self, message: str):
+        """
+        Write a message to the training log file.
+        """
+        try:
+            with open(self.training_log_path, "a") as log_file:
+                log_file.write(message + "\n")
+        except Exception as e:
+            self.print(f"Error writing to training log: {e}", 0, 1)
+
+    def train(self, sum_labeled_flows):
         """
         Train a model based on the flows we receive and the labels
         """

From d1f4f4873e56c4a5ffea27e384d75a244c3dc717 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:50:32 +0000
Subject: [PATCH 303/455] Store data in the log file of training

---
 modules/flowmldetection/flowmldetection.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 60217ada28..6f732da636 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -137,9 +137,13 @@ def train(self, sum_labeled_flows):
             # Store the models on disk
             self.store_model()
 
+            # Log training information
+            self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}")
+            #self.write_to_training_log(f"Model parameters: {self.clf.coef_}")
         except Exception:
             self.print("Error in train().", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+            self.write_to_training_log("Error occurred during training.")
 
     def process_features(self, dataset):
         """

From 38347dcbcd0a5bd2f8f0313160d26aadb4d460aa Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:50:53 +0000
Subject: [PATCH 304/455] better comments

---
 modules/flowmldetection/flowmldetection.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 6f732da636..ed3aecf1b0 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -59,10 +59,9 @@ def init(self):
         self.minimum_labels_to_start_train = 50
         # Minum amount of new labels needed to retrain
         self.minimum_labels_to_retrain = 50
-        # The number of flows when last trained
+        # The number of flows when last trained. Used internally only to know
+        # when to retrain
         self.last_number_of_flows_when_trained = 0
-        # To plot the scores of training
-        # self.scores = []
         # The scaler trained during training and to use during testing
         self.scaler = StandardScaler()
         self.model_path = "./modules/flowmldetection/model.bin"

From b9ff8e3090942b37c032fb535a31d6518b22fae7 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:51:30 +0000
Subject: [PATCH 305/455] Fix issue not dropping detailed labels

---
 modules/flowmldetection/flowmldetection.py | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index ed3aecf1b0..25b30cf515 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -94,23 +94,19 @@ def train(self, sum_labeled_flows):
         Train a model based on the flows we receive and the labels
         """
         try:
-            # Get the flows from the DB
-            # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid)
-            # Convert to pandas df
-            # self.flows = pd.DataFrame(self.flows)
-            # Process the features
-            # X_flow = self.process_features(self.flows)
-
             # Create X_flow with the current flows minus the label
-            X_flow = self.flows.drop("label", axis=1)
-            # Create y_flow with the label
-            y_flow = numpy.full(X_flow.shape[0], self.label)
+            X_flow = self.flows.drop("ground_truth_label", axis=1)
+            # Drop the detailed labels
+            X_flow = X_flow.drop("detailed_ground_truth_label", axis=1)
             # Drop the module_labels
             X_flow = X_flow.drop("module_labels", axis=1)
+            # Create y_flow with the label
+            y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label)
 
             # Normalize this batch of data so far. This can get progressivle slow
             X_flow = self.scaler.fit_transform(X_flow)
 
+
             # Train
             try:
                 self.clf.partial_fit(

From 8da38939309e7bc3cb878b4c4c20ae2dd8bb56e1 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:51:53 +0000
Subject: [PATCH 306/455] Fix issue that not all labels were given to the
 partial fit

---
 modules/flowmldetection/flowmldetection.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 25b30cf515..b2d0db5e51 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -109,8 +109,9 @@ def train(self, sum_labeled_flows):
 
             # Train
             try:
+                # Online incremental learning
                 self.clf.partial_fit(
-                    X_flow, y_flow, classes=["Malicious", "Benign"]
+                    X_flow, y_flow, classes=["Background", "Malicious", "Benign"]
                 )
             except Exception:
                 self.print("Error while calling clf.train()")

From f1b5b683153abe35d4b28dbc03152bebfa4cb8a2 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:52:08 +0000
Subject: [PATCH 307/455] count partial labels in this epoch

---
 modules/flowmldetection/flowmldetection.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index b2d0db5e51..1146091a92 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -106,6 +106,12 @@ def train(self, sum_labeled_flows):
             # Normalize this batch of data so far. This can get progressivle slow
             X_flow = self.scaler.fit_transform(X_flow)
 
+            # Count the number of labels of each type in this epoc
+            epoch_label_counts = {
+                "Background": (y_flow == "Background").sum(),
+                "Malicious": (y_flow == "Malicious").sum(),
+                "Benign": (y_flow == "Benign").sum(),
+            }
 
             # Train
             try:

From 84480185bdbd1eb9887b86fcc75a889e43f57964 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:55:09 +0000
Subject: [PATCH 308/455] Don't print training on screen

---
 modules/flowmldetection/flowmldetection.py | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 1146091a92..4bb2ad7dbf 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -126,15 +126,8 @@ def train(self, sum_labeled_flows):
             # See score so far in training
             score = self.clf.score(X_flow, y_flow)
 
-            # To debug the training score
-            # self.scores.append(score)
-
-            self.print(f"	Training Score: {score}", 0, 1)
-            # self.print(f'    Model Parameters: {self.clf.coef_}')
-
-            # Debug code to store a plot in a png of the scores
-            # plt.plot(self.scores)
-            # plt.savefig('train-scores.png')
+            #self.print(f"	Training Score: {score}", 1, 0)
+            #self.print(f'    Model Parameters: {self.clf.coef_}', 1, 0)
 
             # Store the models on disk
             self.store_model()

From 7c2b383edbda7283716ebc5b894fd5d8fc62f7da Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:55:28 +0000
Subject: [PATCH 309/455] Add function to write to train log

---
 modules/flowmldetection/flowmldetection.py | 28 +++++++++++-----------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 4bb2ad7dbf..d4b2762f5f 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -247,28 +247,28 @@ def process_features(self, dataset):
             self.print("Error in process_features()")
             self.print(traceback.format_exc(), 0, 1)
 
-    def process_training_flows(self):
+    def process_training_flows(self, last_number_of_flows_when_trained):
         """
-        Process all the flows in the DB
+        Process only the new flows in the DB since the last training.
         Store the pandas df in self.flows
         """
         try:
+            # Ensure the index is an integer
+            if last_number_of_flows_when_trained is None:
+                last_number_of_flows_when_trained = 0
+            else:
+                last_number_of_flows_when_trained = int(last_number_of_flows_when_trained)
+
             # We get all the flows so far
-            # because this retraining happens in batches
             flows = self.db.get_all_flows()
-            # Check how many different labels are in the DB
-            # We need both normal and malware
+            # Only process new flows since last training
+            new_flows = flows[last_number_of_flows_when_trained:]
+
+            # Check how many **different** labels are in the DB
             labels = self.db.get_labels()
             if len(labels) == 1:
-                # Only 1 label has flows
-                # There are not enough different labels, so insert two flows
-                # that are fake but representative of a normal and malware flow
-                # they are only for the training process
-                # At least 1 flow of each label is required
-
-                # These flows should be in the same format as the ones in the DB. 
-                # Which means the satate is still SF, S0, etc.
-                flows.append(
+                # Insert fake flows for both classes if needed
+                new_flows.append(
                     {
                         "starttime": 1594417039.029793,
                         "dur": "1.9424750804901123",

From ad07f7c245eea515e4395b1216f3c564068067ae Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:57:27 +0000
Subject: [PATCH 310/455] Fix label in dummy flow

---
 modules/flowmldetection/flowmldetection.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index d4b2762f5f..6a44422cc2 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -283,13 +283,13 @@ def process_training_flows(self, last_number_of_flows_when_trained):
                         "sbytes": 25517,
                         "dbytes": 17247,
                         "appproto": "ssl",
-                        "label": "Malicious",
+                        "ground_truth_label": "Malicious",
                         "module_labels": {
                             "flowalerts-long-connection": "Malicious"
                         },
                     }
                 )
-                flows.append(
+                new_flows.append(
                     {
                         "starttime": 1382355032.706468,
                         "dur": "10.896695",

From d3736905508aa9dbcfbd7044532d0aed3501db5f Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:57:39 +0000
Subject: [PATCH 311/455] Fix dummy flow

---
 modules/flowmldetection/flowmldetection.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 6a44422cc2..20f1f8ca89 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -304,7 +304,7 @@ def process_training_flows(self, last_number_of_flows_when_trained):
                         "sbytes": 100,
                         "dbytes": 67596,
                         "appproto": "http",
-                        "label": "Benign",
+                        "ground_truth_label": "Benign",
                         "module_labels": {
                             "flowalerts-long-connection": "Benign"
                         },

From 867da84a20fb4c6b695906f94c9ba1b7b967d38d Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:58:28 +0000
Subject: [PATCH 312/455] Rename variable

---
 modules/flowmldetection/flowmldetection.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 20f1f8ca89..59064d61a5 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -310,10 +310,9 @@ def process_training_flows(self, last_number_of_flows_when_trained):
                         },
                     }
                 )
-                # If there are enough flows, we dont insert them anymore
 
             # Convert to pandas df
-            df_flows = pd.DataFrame(flows)
+            df_flows = pd.DataFrame(new_flows)
 
             # Process features
             df_flows = self.process_features(df_flows)
@@ -321,7 +320,6 @@ def process_training_flows(self, last_number_of_flows_when_trained):
             # Update the flow to the processed version
             self.flows = df_flows
         except Exception:
-            # Stop the timer
             self.print("Error in process_flows()")
             self.print(traceback.format_exc(), 0, 1)
 

From aeebcbc24872621b69dd030456ccea86053e2948 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:00:32 +0000
Subject: [PATCH 313/455] Fix dummy flow label

---
 modules/flowmldetection/flowmldetection.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 59064d61a5..6b41b40298 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -356,6 +356,8 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
                 "dir_",
                 "endtime",
                 "flow_source",
+                "ground_truth_label",
+                "detailed_ground_truth_label",
             ]
             # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes.
             # Error

From 5fef371864f1faa6d45f5ad54813dd4b5354171f Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:00:47 +0000
Subject: [PATCH 314/455] Pass values to train function

---
 modules/flowmldetection/flowmldetection.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 6b41b40298..4d66aab855 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -521,9 +521,9 @@ def main(self):
                         )
                         # Process all flows in the DB and make them ready
                         # for pandas
-                        self.process_training_flows()
+                        self.process_training_flows(self.last_number_of_flows_when_trained)
                         # Train an algorithm
-                        self.train()
+                        self.train(sum_labeled_flows)
                         self.last_number_of_flows_when_trained = sum_labeled_flows
 
             elif self.mode == "test":

From 3d8f125ec27114c35e5c552cbbf7c1c5d3baadb4 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:01:47 +0000
Subject: [PATCH 315/455] import os

---
 modules/flowmldetection/flowmldetection.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 4d66aab855..766178e127 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -10,6 +10,7 @@
 import json
 import traceback
 import warnings
+import os
 
 from slips_files.common.parsers.config_parser import ConfigParser
 from slips_files.common.slips_utils import utils

From 260d6845ce3775c84f93cc6a79f04812c9ca50be Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:02:15 +0000
Subject: [PATCH 316/455] Get issue of total flows zero

---
 slips_files/core/database/database_manager.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py
index 0b805976df..b32c004a32 100644
--- a/slips_files/core/database/database_manager.py
+++ b/slips_files/core/database/database_manager.py
@@ -661,7 +661,8 @@ def add_software_to_profile(self, *args, **kwargs):
         return self.rdb.add_software_to_profile(*args, **kwargs)
 
     def get_total_flows(self, *args, **kwargs):
-        return int(self.rdb.get_total_flows(*args, **kwargs))
+        total_flows = self.rdb.get_total_flows(*args, **kwargs)
+        return int(total_flows) if total_flows is not None else 0
 
     def increment_processed_flows(self, *args, **kwargs):
         return self.rdb.increment_processed_flows(*args, **kwargs)

From c65e8f15d3e641afe585428f2526c6f50117c791 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:02:32 +0000
Subject: [PATCH 317/455] Add comments

---
 slips_files/core/database/database_manager.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py
index b32c004a32..1d339685f8 100644
--- a/slips_files/core/database/database_manager.py
+++ b/slips_files/core/database/database_manager.py
@@ -888,7 +888,10 @@ def get_flow(self, *args, **kwargs):
         """returns the raw flow as read from the log file"""
         return self.sqlite.get_flow(*args, **kwargs)
 
-    def add_flow(self, flow, profileid: str, twid: str, label="benign"):
+    def add_flow(self, flow, profileid: str, twid: str, label="Benign"):
+        """
+        Just in case, by default if there are no labels in the flow, we consider it Benign
+        """
         # stores it in the db
         self.sqlite.add_flow(flow, profileid, twid, label=label)
         # handles the channels and labels etc.

From 8ae122121f8d9ccca31942e3d7b7f64cd48c8bad Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:02:51 +0000
Subject: [PATCH 318/455] Rename var name to be more clear

---
 slips_files/core/profiler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/slips_files/core/profiler.py b/slips_files/core/profiler.py
index 0d9b11bd27..d22069d9e6 100644
--- a/slips_files/core/profiler.py
+++ b/slips_files/core/profiler.py
@@ -119,7 +119,7 @@ def read_configuration(self):
         self.local_whitelist_path = conf.local_whitelist_path()
         self.timeformat = conf.ts_format()
         self.analysis_direction = conf.analysis_direction()
-        self.label = conf.label()
+        self.configuration_label = conf.label()
         self.width = conf.get_tw_width_as_float()
         self.client_ips: List[
             Union[IPv4Network, IPv6Network, IPv4Address, IPv6Address]

From 5fbe43ad6bb445795cb8a7c2317cf6b91acecfd0 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:03:10 +0000
Subject: [PATCH 319/455] Rename var name

---
 slips_files/core/profiler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/slips_files/core/profiler.py b/slips_files/core/profiler.py
index d22069d9e6..e8fdf5cc56 100644
--- a/slips_files/core/profiler.py
+++ b/slips_files/core/profiler.py
@@ -377,7 +377,7 @@ def store_features_going_in(self, profileid: str, twid: str, flow):
             flow,
             profileid=profileid,
             twid=twid,
-            label=self.label,
+            label=self.configuration_label,
         )
         self.db.mark_profile_tw_as_modified(profileid, twid, "")
 

From 85ac73dca750a6467e9e345b2daa42ebe4dded90 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:03:31 +0000
Subject: [PATCH 320/455] Fix processed flows being zero

---
 slips/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/slips/main.py b/slips/main.py
index b00cc8f3db..3f661c8843 100644
--- a/slips/main.py
+++ b/slips/main.py
@@ -414,7 +414,7 @@ def get_analyzed_flows_percentage(self) -> str:
             self.total_flows = self.db.get_total_flows()
 
         flows_percentage = int(
-            (self.db.get_processed_flows_so_far() / self.total_flows) * 100
+            (self.db.get_processed_flows_so_far() / self.total_flows) * 100 if self.total_flows != 0 else 0
         )
         return f"Analyzed Flows: {green(flows_percentage)}{green('%')}. "
 

From 058b603df40e65aa6dad514fbed6aaa1c9362bcb Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:03:53 +0000
Subject: [PATCH 321/455] Delete old comments

---
 modules/flowmldetection/flowmldetection.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 766178e127..6c3bfc1275 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -28,10 +28,6 @@
     Method,
 )
 
-# Only for debbuging
-# from matplotlib import pyplot as plt
-
-
 # This horrible hack is only to stop sklearn from printing those warnings
 def warn(*args, **kwargs):
     pass

From ff9eff155b4989bdecf1b60d34e97f739a5510f7 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:13:22 +0000
Subject: [PATCH 322/455] Fix plots

---
 modules/flowmldetection/plot_train_score.py | 48 ++++++++++++++++-----
 1 file changed, 38 insertions(+), 10 deletions(-)

diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py
index 0b5b5b72ba..359df04eff 100644
--- a/modules/flowmldetection/plot_train_score.py
+++ b/modules/flowmldetection/plot_train_score.py
@@ -2,6 +2,8 @@
 import matplotlib.pyplot as plt
 import re
 import sys
+import argparse
+import os
 
 def plot_log_data(file_path):
     # Read the log data from the file
@@ -24,33 +26,59 @@ def plot_log_data(file_path):
         "Score": float
     })
 
+    # Get the directory of the log file to store the plot in the same folder
+    dir_name = os.path.dirname(file_path)
+    plot_file = os.path.join(dir_name, 'log_data_plot_with_two_scales.png')
+
     # Plotting the values
     fig, ax1 = plt.subplots(figsize=(10, 6))
 
-    # Plotting Score on the left y-axis
+    # Plotting Score on the left y-axis (with proper scaling from 0 to 1)
     ax1.plot(df.index, df["Score"], label="Score", color='tab:blue')
     ax1.set_xlabel('Index')
     ax1.set_ylabel('Score', color='tab:blue')
+    ax1.set_ylim(0, 1)  # Set y-axis for Score from 0 to 1
     ax1.tick_params(axis='y', labelcolor='tab:blue')
 
-    # Create the second y-axis for the Total labels
+    # Create the second y-axis for the Background, Benign, Malicious, Total labels
     ax2 = ax1.twinx()
+    ax2.plot(df.index, df["Background"], label="Background", color='tab:green', linestyle='--')
+    ax2.plot(df.index, df["Benign"], label="Benign", color='tab:orange', linestyle='--')
+    ax2.plot(df.index, df["Malicious"], label="Malicious", color='tab:pink', linestyle='--')
     ax2.plot(df.index, df["Total labels"], label="Total labels", color='tab:red')
-    ax2.set_ylabel('Total labels', color='tab:red')
+    ax2.set_ylabel('Background, Benign, Malicious, Total labels', color='tab:red')
+    
+    # Set appropriate scale for right y-axis based on the data
+    ax2.set_ylim(0, df[["Background", "Benign", "Malicious", "Total labels"]].max().max())
     ax2.tick_params(axis='y', labelcolor='tab:red')
 
     # Adding title and legend
     plt.title('Log Data Visualization')
     fig.tight_layout()
 
-    # Save plot to a PNG file
-    plt.savefig('log_data_plot_with_two_scales.png')
+    # Adding the legend with increased space for readability
+    ax1.legend(loc='upper left', bbox_to_anchor=(1, 1), fontsize='small')
+    ax2.legend(loc='upper left', bbox_to_anchor=(1, 0.7), fontsize='small')
+
+    # Increase right margin for better readability of legend
+    plt.subplots_adjust(right=0.75)
+
+    # Save plot to the same folder as the log file
+    plt.savefig(plot_file)
 
     # Display the plot
     plt.show()
 
-# Make sure the file path is passed as an argument
-if len(sys.argv) < 2:
-    print("Please provide the path to the log file as a parameter.")
-else:
-    plot_log_data(sys.argv[1])
+def main():
+    # Parse command-line arguments
+    parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.")
+    parser.add_argument('log_file', metavar='log_file', type=str, help="Path to the log file")
+    
+    # Handle -h / --help
+    args = parser.parse_args()
+
+    # Call the function to process the log file
+    plot_log_data(args.log_file)
+
+if __name__ == "__main__":
+    main()

From e55edf8709ac90ca8e30de4d3bf1d3d381c7ff3b Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:14:58 +0000
Subject: [PATCH 323/455] Fix plot

---
 modules/flowmldetection/plot_train_score.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py
index 359df04eff..c7f374a7fe 100644
--- a/modules/flowmldetection/plot_train_score.py
+++ b/modules/flowmldetection/plot_train_score.py
@@ -40,18 +40,21 @@ def plot_log_data(file_path):
     ax1.set_ylim(0, 1)  # Set y-axis for Score from 0 to 1
     ax1.tick_params(axis='y', labelcolor='tab:blue')
 
-    # Create the second y-axis for the Background, Benign, Malicious, Total labels
+    # Create the second y-axis for the Background, Benign, Malicious
     ax2 = ax1.twinx()
     ax2.plot(df.index, df["Background"], label="Background", color='tab:green', linestyle='--')
     ax2.plot(df.index, df["Benign"], label="Benign", color='tab:orange', linestyle='--')
     ax2.plot(df.index, df["Malicious"], label="Malicious", color='tab:pink', linestyle='--')
-    ax2.plot(df.index, df["Total labels"], label="Total labels", color='tab:red')
-    ax2.set_ylabel('Background, Benign, Malicious, Total labels', color='tab:red')
+    ax2.set_ylabel('Background, Benign, Malicious', color='tab:red')
     
     # Set appropriate scale for right y-axis based on the data
-    ax2.set_ylim(0, df[["Background", "Benign", "Malicious", "Total labels"]].max().max())
+    ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max())
     ax2.tick_params(axis='y', labelcolor='tab:red')
 
+    # Annotating Total labels as text on the plot
+    for i, value in enumerate(df["Total labels"]):
+        ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom')
+
     # Adding title and legend
     plt.title('Log Data Visualization')
     fig.tight_layout()

From 5fbff61521b897f5cc047040bbe9adc54eeee126 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:16:23 +0000
Subject: [PATCH 324/455] Fix plot

---
 modules/flowmldetection/plot_train_score.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py
index c7f374a7fe..4099c47c1e 100644
--- a/modules/flowmldetection/plot_train_score.py
+++ b/modules/flowmldetection/plot_train_score.py
@@ -42,10 +42,10 @@ def plot_log_data(file_path):
 
     # Create the second y-axis for the Background, Benign, Malicious
     ax2 = ax1.twinx()
-    ax2.plot(df.index, df["Background"], label="Background", color='tab:green', linestyle='--')
-    ax2.plot(df.index, df["Benign"], label="Benign", color='tab:orange', linestyle='--')
-    ax2.plot(df.index, df["Malicious"], label="Malicious", color='tab:pink', linestyle='--')
-    ax2.set_ylabel('Background, Benign, Malicious', color='tab:red')
+    ax2.plot(df.index, df["Background"], label="Background Labels", color='tab:green', linestyle='--')
+    ax2.plot(df.index, df["Benign"], label="Benign Labels", color='tab:orange', linestyle='--')
+    ax2.plot(df.index, df["Malicious"], label="Malicious Labels", color='tab:pink', linestyle='--')
+    ax2.set_ylabel('Background, Benign, Malicious Labels', color='tab:red')
     
     # Set appropriate scale for right y-axis based on the data
     ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max())
@@ -56,7 +56,7 @@ def plot_log_data(file_path):
         ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom')
 
     # Adding title and legend
-    plt.title('Log Data Visualization')
+    plt.title('Training performance')
     fig.tight_layout()
 
     # Adding the legend with increased space for readability

From ff987fc2450326739b4635275f24648799f32659 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:24:43 +0000
Subject: [PATCH 325/455] Fix plot

---
 modules/flowmldetection/plot_train_score.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py
index 4099c47c1e..8437e968ac 100644
--- a/modules/flowmldetection/plot_train_score.py
+++ b/modules/flowmldetection/plot_train_score.py
@@ -59,12 +59,12 @@ def plot_log_data(file_path):
     plt.title('Training performance')
     fig.tight_layout()
 
-    # Adding the legend with increased space for readability
-    ax1.legend(loc='upper left', bbox_to_anchor=(1, 1), fontsize='small')
-    ax2.legend(loc='upper left', bbox_to_anchor=(1, 0.7), fontsize='small')
+    # Move both legends further to the right
+    ax1.legend(loc='upper right', bbox_to_anchor=(1.26, 1), fontsize='small', ncol=1)
+    ax2.legend(loc='upper right', bbox_to_anchor=(1.4, 0.95), fontsize='small', ncol=1)
 
     # Increase right margin for better readability of legend
-    plt.subplots_adjust(right=0.75)
+    plt.subplots_adjust(right=0.7)
 
     # Save plot to the same folder as the log file
     plt.savefig(plot_file)

From bf9d7200d01b9f941612b2f0a83e308225396ab0 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:02:34 +0000
Subject: [PATCH 326/455] Plot testing performance from a log

---
 .../plot_testing_performance.py               | 89 +++++++++++++++++++
 1 file changed, 89 insertions(+)
 create mode 100644 modules/flowmldetection/plot_testing_performance.py

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
new file mode 100644
index 0000000000..a38c7f0598
--- /dev/null
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -0,0 +1,89 @@
+import matplotlib.pyplot as plt
+import sys
+import numpy as np
+
+def process_file(file_path):
+    # Initialize the counters for the values
+    FPR_values = []
+    FNR_values = []
+    TNR_values = []
+    TPR_values = []
+    F1_values = []
+    accuracy_values = []
+    precision_values = []
+    MCC_values = []
+    recall_values = []
+    
+    # Read the file and extract the data
+    with open(file_path, 'r') as file:
+        for line in file:
+            if "TP:" in line:
+                # Extract the values from the line
+                parts = line.split(',')
+                TP = int(parts[0].split(':')[1].strip())
+                TN = int(parts[1].split(':')[1].strip())
+                FP = int(parts[2].split(':')[1].strip())
+                FN = int(parts[3].split(':')[1].strip())
+
+                # Calculate metrics
+                FPR = FP / (FP + TN) if (FP + TN) != 0 else 0
+                FNR = FN / (FN + TP) if (FN + TP) != 0 else 0
+                TNR = TN / (TN + FP) if (TN + FP) != 0 else 0
+                TPR = TP / (TP + FN) if (TP + FN) != 0 else 0
+                Precision = TP / (TP + FP) if (TP + FP) != 0 else 0
+                Recall = TPR  # Recall is the same as TPR
+                F1 = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) != 0 else 0
+                Accuracy = (TP + TN) / (TP + TN + FP + FN)
+                MCC = ((TP * TN) - (FP * FN)) / np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) if ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) != 0 else 0
+                
+                # Append the values to the respective lists
+                FPR_values.append(FPR)
+                FNR_values.append(FNR)
+                TNR_values.append(TNR)
+                TPR_values.append(TPR)
+                F1_values.append(F1)
+                accuracy_values.append(Accuracy)
+                precision_values.append(Precision)
+                MCC_values.append(MCC)
+                recall_values.append(Recall)
+    
+    return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values
+
+def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values):
+    # Create the plot
+    plt.figure(figsize=(12, 8))
+    
+    # Plot each metric
+    plt.plot(FPR_values, label='False Positive Rate (FPR)', marker='o')
+    plt.plot(FNR_values, label='False Negative Rate (FNR)', marker='o')
+    plt.plot(TNR_values, label='True Negative Rate (TNR)', marker='o')
+    plt.plot(TPR_values, label='True Positive Rate (TPR)', marker='o')
+    plt.plot(F1_values, label='F1 Score', marker='o')
+    plt.plot(accuracy_values, label='Accuracy', marker='o')
+    plt.plot(precision_values, label='Precision', marker='o')
+    plt.plot(MCC_values, label='Matthews Correlation Coefficient (MCC)', marker='o')
+    plt.plot(recall_values, label='Recall (TPR)', marker='o')
+    
+    # Add labels and title
+    plt.xlabel('Index')
+    plt.ylabel('Metric Value')
+    plt.title('Evaluation Metrics Over Time')
+    
+    # Add a legend
+    plt.legend()
+    
+    # Save the plot as a PNG file
+    plt.savefig('metrics_plot.png')
+    plt.close()
+
+def main():
+    if len(sys.argv) != 2:
+        print("Usage: python script.py <file_path>")
+        sys.exit(1)
+    
+    file_path = sys.argv[1]
+    FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path)
+    plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values)
+
+if __name__ == "__main__":
+    main()

From f146fbf84544323511db94d721e971b6da33ad0f Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:04:32 +0000
Subject: [PATCH 327/455] Fix the plot

---
 modules/flowmldetection/plot_testing_performance.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index a38c7f0598..fac0acd64a 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -64,16 +64,19 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu
     plt.plot(MCC_values, label='Matthews Correlation Coefficient (MCC)', marker='o')
     plt.plot(recall_values, label='Recall (TPR)', marker='o')
     
+    # Set logarithmic scale on the y-axis
+    plt.yscale('log')
+    
     # Add labels and title
     plt.xlabel('Index')
-    plt.ylabel('Metric Value')
-    plt.title('Evaluation Metrics Over Time')
+    plt.ylabel('Metric Value (Log Scale)')
+    plt.title('Evaluation Metrics Over Time (Log Scale)')
     
     # Add a legend
     plt.legend()
     
     # Save the plot as a PNG file
-    plt.savefig('metrics_plot.png')
+    plt.savefig('metrics_plot_log_scale.png')
     plt.close()
 
 def main():

From 37bf4f6a0c187b76a443c3f1f855f0278da65065 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:12:40 +0000
Subject: [PATCH 328/455] Fix the plots

---
 .../plot_testing_performance.py               | 76 ++++++++++++++-----
 1 file changed, 55 insertions(+), 21 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index fac0acd64a..5581c72cd4 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -50,33 +50,66 @@ def process_file(file_path):
     return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values
 
 def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values):
-    # Create the plot
-    plt.figure(figsize=(12, 8))
+    # Separate the values into two groups based on their proximity to 0 or 1
+    close_to_0 = {
+        'FPR': [], 'FNR': []
+    }
+    close_to_1 = {
+        'TNR': [], 'TPR': [], 'F1': [], 'accuracy': [], 'precision': [], 'MCC': [], 'recall': []
+    }
     
-    # Plot each metric
-    plt.plot(FPR_values, label='False Positive Rate (FPR)', marker='o')
-    plt.plot(FNR_values, label='False Negative Rate (FNR)', marker='o')
-    plt.plot(TNR_values, label='True Negative Rate (TNR)', marker='o')
-    plt.plot(TPR_values, label='True Positive Rate (TPR)', marker='o')
-    plt.plot(F1_values, label='F1 Score', marker='o')
-    plt.plot(accuracy_values, label='Accuracy', marker='o')
-    plt.plot(precision_values, label='Precision', marker='o')
-    plt.plot(MCC_values, label='Matthews Correlation Coefficient (MCC)', marker='o')
-    plt.plot(recall_values, label='Recall (TPR)', marker='o')
+    # Categorize the metrics into two groups
+    for i in range(len(FPR_values)):
+        close_to_0['FPR'].append(FPR_values[i])
+        close_to_0['FNR'].append(FNR_values[i])
+        
+        close_to_1['TNR'].append(TNR_values[i])
+        close_to_1['TPR'].append(TPR_values[i])
+        close_to_1['F1'].append(F1_values[i])
+        close_to_1['accuracy'].append(accuracy_values[i])
+        close_to_1['precision'].append(precision_values[i])
+        close_to_1['MCC'].append(MCC_values[i])
+        close_to_1['recall'].append(recall_values[i])
+
+    # Plot metrics for values close to 0
+    plot_single_group(close_to_0, 'metrics_plot_close_to_0.png')
     
-    # Set logarithmic scale on the y-axis
-    plt.yscale('log')
+    # Plot metrics for values close to 1
+    plot_single_group(close_to_1, 'metrics_plot_close_to_1.png')
+
+def plot_single_group(metrics_dict, output_filename):
+    plt.figure(figsize=(12, 8))
     
-    # Add labels and title
+    # Only plot the metrics that exist in the dictionary
+    if 'FPR' in metrics_dict:
+        plt.plot(metrics_dict['FPR'], label='False Positive Rate (FPR)', marker='o')
+    if 'FNR' in metrics_dict:
+        plt.plot(metrics_dict['FNR'], label='False Negative Rate (FNR)', marker='o')
+    if 'TNR' in metrics_dict:
+        plt.plot(metrics_dict['TNR'], label='True Negative Rate (TNR)', marker='o')
+    if 'TPR' in metrics_dict:
+        plt.plot(metrics_dict['TPR'], label='True Positive Rate (TPR)', marker='o')
+    if 'F1' in metrics_dict:
+        plt.plot(metrics_dict['F1'], label='F1 Score', marker='o')
+    if 'accuracy' in metrics_dict:
+        plt.plot(metrics_dict['accuracy'], label='Accuracy', marker='o')
+    if 'precision' in metrics_dict:
+        plt.plot(metrics_dict['precision'], label='Precision', marker='o')
+    if 'MCC' in metrics_dict:
+        plt.plot(metrics_dict['MCC'], label='Matthews Correlation Coefficient (MCC)', marker='o')
+    if 'recall' in metrics_dict:
+        plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o')
+
+    # Apply log scale by default
+    plt.yscale('log')
+
     plt.xlabel('Index')
-    plt.ylabel('Metric Value (Log Scale)')
-    plt.title('Evaluation Metrics Over Time (Log Scale)')
-    
-    # Add a legend
+    plt.ylabel('Metric Value')
+    plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})')
     plt.legend()
     
-    # Save the plot as a PNG file
-    plt.savefig('metrics_plot_log_scale.png')
+    # Save the plot
+    plt.savefig(output_filename)
     plt.close()
 
 def main():
@@ -85,6 +118,7 @@ def main():
         sys.exit(1)
     
     file_path = sys.argv[1]
+    
     FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path)
     plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values)
 

From 5936fc882ebfb7a8e82c4b8696891d6ead982194 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:16:50 +0000
Subject: [PATCH 329/455] Fix plot

---
 .../plot_testing_performance.py               | 22 +++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index 5581c72cd4..8f9e12cd86 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -72,12 +72,24 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu
         close_to_1['recall'].append(recall_values[i])
 
     # Plot metrics for values close to 0
-    plot_single_group(close_to_0, 'metrics_plot_close_to_0.png')
+    plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True)
     
     # Plot metrics for values close to 1
     plot_single_group(close_to_1, 'metrics_plot_close_to_1.png')
 
-def plot_single_group(metrics_dict, output_filename):
+    # Print the final values
+    print("\nFinal Metric Values:")
+    print(f"Final FPR: {FPR_values[-1]:.4f}")
+    print(f"Final FNR: {FNR_values[-1]:.4f}")
+    print(f"Final TNR: {TNR_values[-1]:.4f}")
+    print(f"Final TPR: {TPR_values[-1]:.4f}")
+    print(f"Final F1 Score: {F1_values[-1]:.4f}")
+    print(f"Final Accuracy: {accuracy_values[-1]:.4f}")
+    print(f"Final Precision: {precision_values[-1]:.4f}")
+    print(f"Final MCC: {MCC_values[-1]:.4f}")
+    print(f"Final Recall: {recall_values[-1]:.4f}")
+
+def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):
     plt.figure(figsize=(12, 8))
     
     # Only plot the metrics that exist in the dictionary
@@ -103,6 +115,12 @@ def plot_single_group(metrics_dict, output_filename):
     # Apply log scale by default
     plt.yscale('log')
 
+    # If the plot is close to 0, set custom ticks
+    if is_close_to_0:
+        # Manually set more Y-ticks for better visibility
+        plt.ylim(0.0001, 1)  # Set Y-axis limits between 0.0001 and 1
+        plt.yticks([0.0001, 0.001, 0.01, 0.1, 1], ['0.0001', '0.001', '0.01', '0.1', '1'])  # Adjust Y-ticks
+
     plt.xlabel('Index')
     plt.ylabel('Metric Value')
     plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})')

From bfc10bea2cf0ec9e6ce3f2a66484cd023f58e4ad Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:20:22 +0000
Subject: [PATCH 330/455] Fix plots

---
 modules/flowmldetection/flowmldetection.py | 709 +++++----------------
 1 file changed, 143 insertions(+), 566 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 6c3bfc1275..37f0761109 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -1,566 +1,143 @@
-# SPDX-FileCopyrightText: 2021 Sebastian Garcia <sebastian.garcia@agents.fel.cvut.cz>
-from typing import Optional
-
-# SPDX-License-Identifier: GPL-2.0-only
-import numpy
-from sklearn.linear_model import SGDClassifier
-from sklearn.preprocessing import StandardScaler
-import pickle
-import pandas as pd
-import json
-import traceback
-import warnings
-import os
-
-from slips_files.common.parsers.config_parser import ConfigParser
-from slips_files.common.slips_utils import utils
-from slips_files.common.abstracts.module import IModule
-from slips_files.core.structures.evidence import (
-    Evidence,
-    ProfileID,
-    TimeWindow,
-    Attacker,
-    ThreatLevel,
-    EvidenceType,
-    IoCType,
-    Direction,
-    Victim,
-    Method,
-)
-
-# This horrible hack is only to stop sklearn from printing those warnings
-def warn(*args, **kwargs):
-    pass
-
-
-warnings.warn = warn
-
-
-class FlowMLDetection(IModule):
-    # Name: short name of the module. Do not use spaces
-    name = "Flow ML Detection"
-    description = (
-        "Train or test a Machine Learning model to detect malicious flows"
-    )
-    authors = ["Sebastian Garcia"]
-
-    def init(self):
-        # Subscribe to the channel
-        self.c1 = self.db.subscribe("new_flow")
-        self.channels = {"new_flow": self.c1}
-        self.fieldseparator = self.db.get_field_separator()
-        # Set the output queue of our database instance
-        # Read the configuration
-        self.read_configuration()
-        # Minum amount of new labels needed to start the train
-        self.minimum_labels_to_start_train = 50
-        # Minum amount of new labels needed to retrain
-        self.minimum_labels_to_retrain = 50
-        # The number of flows when last trained. Used internally only to know
-        # when to retrain
-        self.last_number_of_flows_when_trained = 0
-        # The scaler trained during training and to use during testing
-        self.scaler = StandardScaler()
-        self.model_path = "./modules/flowmldetection/model.bin"
-        self.scaler_path = "./modules/flowmldetection/scaler.bin"
-
-        # Initialize the training log file
-        self.training_log_path = "./modules/flowmldetection/training.log"
-        with open(self.training_log_path, "w") as log_file:
-            log_file.write("Training Log Initialized\n")
-
-    def read_configuration(self):
-        conf = ConfigParser()
-        self.mode = conf.get_ml_mode()
-        # This is the global label in the configuration,
-        # in case the flows do not have a label themselves
-        self.label = conf.label()
-
-    def write_to_training_log(self, message: str):
-        """
-        Write a message to the training log file.
-        """
-        try:
-            with open(self.training_log_path, "a") as log_file:
-                log_file.write(message + "\n")
-        except Exception as e:
-            self.print(f"Error writing to training log: {e}", 0, 1)
-
-    def train(self, sum_labeled_flows):
-        """
-        Train a model based on the flows we receive and the labels
-        """
-        try:
-            # Create X_flow with the current flows minus the label
-            X_flow = self.flows.drop("ground_truth_label", axis=1)
-            # Drop the detailed labels
-            X_flow = X_flow.drop("detailed_ground_truth_label", axis=1)
-            # Drop the module_labels
-            X_flow = X_flow.drop("module_labels", axis=1)
-            # Create y_flow with the label
-            y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label)
-
-            # Normalize this batch of data so far. This can get progressivle slow
-            X_flow = self.scaler.fit_transform(X_flow)
-
-            # Count the number of labels of each type in this epoc
-            epoch_label_counts = {
-                "Background": (y_flow == "Background").sum(),
-                "Malicious": (y_flow == "Malicious").sum(),
-                "Benign": (y_flow == "Benign").sum(),
-            }
-
-            # Train
-            try:
-                # Online incremental learning
-                self.clf.partial_fit(
-                    X_flow, y_flow, classes=["Background", "Malicious", "Benign"]
-                )
-            except Exception:
-                self.print("Error while calling clf.train()")
-                self.print(traceback.format_exc(), 0, 1)
-
-            # See score so far in training
-            score = self.clf.score(X_flow, y_flow)
-
-            #self.print(f"	Training Score: {score}", 1, 0)
-            #self.print(f'    Model Parameters: {self.clf.coef_}', 1, 0)
-
-            # Store the models on disk
-            self.store_model()
-
-            # Log training information
-            self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}")
-            #self.write_to_training_log(f"Model parameters: {self.clf.coef_}")
-        except Exception:
-            self.print("Error in train().", 0, 1)
-            self.print(traceback.format_exc(), 0, 1)
-            self.write_to_training_log("Error occurred during training.")
-
-    def process_features(self, dataset):
-        """
-        Discards some features of the dataset and can create new.
-        Clean the dataset
-        """
-        try:
-            # Discard some type of flows that dont have ports
-            to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""]
-            for proto in to_discard:
-                dataset = dataset[dataset.proto != proto]
-
-            # If te proto is in the list to delete and there is only one flow, then the dataset will be empty
-            if dataset.empty:
-                # DataFrame is empty now, so return empty
-                return dataset
-
-            # For now, discard these
-            to_drop = [
-                "appproto",
-                "daddr",
-                "saddr",
-                "starttime",
-                "type_",
-                "smac",
-                "dmac",
-                "history",
-                "uid",
-                "dir_",
-                "endtime",
-                "flow_source",
-            ]
-            for field in to_drop:
-                try:
-                    dataset = dataset.drop(field, axis=1)
-                except (ValueError, KeyError):
-                    pass
-
-            # When flows are read from Slips sqlite,
-            # the state is not transformed to 'Established' or
-            # 'Not Established', it is still 'S0' and others
-            # So transform here
-            dataset["state"] = dataset.apply(
-                lambda row: self.db.get_final_state_from_flags(
-                    row["state"], (row["spkts"] + row["dpkts"])
-                ),
-                axis=1,
-            )
-
-            # Convert state to categorical
-            dataset.state = dataset.state.str.replace(
-                r"(^.*Not Established.*$)", "0", regex=True
-            )
-            dataset.state = dataset.state.str.replace(
-                r"(^.*Established.*$)", "1", regex=True
-            )
-
-            # Convert categories to floats
-            dataset.state = dataset.state.astype("float64")
-
-            # Convert proto to categorical. For now we only have few states, so we can hardcode...
-            # We dont use the data to create categories because in testing mode
-            # we dont see all the protocols
-            # Also we dont store the Categorizer because the user can retrain
-            # with its own data.
-            dataset.proto = dataset.proto.str.lower()
-            dataset.proto = dataset.proto.str.replace(
-                r"(^.*tcp.*$)", "0", regex=True
-            )
-            dataset.proto = dataset.proto.str.replace(
-                r"(^.*udp.*$)", "1", regex=True
-            )
-            dataset.proto = dataset.proto.str.replace(
-                r"(^.*icmp.*$)", "2", regex=True
-            )
-            dataset.proto = dataset.proto.str.replace(
-                r"(^.*icmp-ipv6.*$)", "3", regex=True
-            )
-            dataset.proto = dataset.proto.str.replace(
-                r"(^.*arp.*$)", "4", regex=True
-            )
-
-            dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"]
-            dataset["pkts"] = dataset["spkts"] + dataset["dpkts"]
-
-            fields_to_convert_to_float = [
-                dataset.proto,
-                dataset.dport,
-                dataset.sport,
-                dataset.dur,
-                dataset.pkts,
-                dataset.spkts,
-                dataset.allbytes,
-                dataset.sbytes,
-                dataset.state,
-            ]
-            for field in fields_to_convert_to_float:
-                try:
-                    field = field.astype("float64")
-                except (ValueError, AttributeError):
-                    pass
-
-            return dataset
-        except Exception:
-            # Stop the timer
-            self.print("Error in process_features()")
-            self.print(traceback.format_exc(), 0, 1)
-
-    def process_training_flows(self, last_number_of_flows_when_trained):
-        """
-        Process only the new flows in the DB since the last training.
-        Store the pandas df in self.flows
-        """
-        try:
-            # Ensure the index is an integer
-            if last_number_of_flows_when_trained is None:
-                last_number_of_flows_when_trained = 0
-            else:
-                last_number_of_flows_when_trained = int(last_number_of_flows_when_trained)
-
-            # We get all the flows so far
-            flows = self.db.get_all_flows()
-            # Only process new flows since last training
-            new_flows = flows[last_number_of_flows_when_trained:]
-
-            # Check how many **different** labels are in the DB
-            labels = self.db.get_labels()
-            if len(labels) == 1:
-                # Insert fake flows for both classes if needed
-                new_flows.append(
-                    {
-                        "starttime": 1594417039.029793,
-                        "dur": "1.9424750804901123",
-                        "saddr": "10.7.10.101",
-                        "sport": "49733",
-                        "daddr": "40.70.224.145",
-                        "dport": "443",
-                        "proto": "tcp",
-                        "state": "SF",
-                        "spkts": 17,
-                        "dpkts": 27,
-                        "sbytes": 25517,
-                        "dbytes": 17247,
-                        "appproto": "ssl",
-                        "ground_truth_label": "Malicious",
-                        "module_labels": {
-                            "flowalerts-long-connection": "Malicious"
-                        },
-                    }
-                )
-                new_flows.append(
-                    {
-                        "starttime": 1382355032.706468,
-                        "dur": "10.896695",
-                        "saddr": "147.32.83.52",
-                        "sport": "47956",
-                        "daddr": "80.242.138.72",
-                        "dport": "80",
-                        "proto": "tcp",
-                        "state": "SF",
-                        "spkts": 1,
-                        "dpkts": 0,
-                        "sbytes": 100,
-                        "dbytes": 67596,
-                        "appproto": "http",
-                        "ground_truth_label": "Benign",
-                        "module_labels": {
-                            "flowalerts-long-connection": "Benign"
-                        },
-                    }
-                )
-
-            # Convert to pandas df
-            df_flows = pd.DataFrame(new_flows)
-
-            # Process features
-            df_flows = self.process_features(df_flows)
-
-            # Update the flow to the processed version
-            self.flows = df_flows
-        except Exception:
-            self.print("Error in process_flows()")
-            self.print(traceback.format_exc(), 0, 1)
-
-    def process_flow(self, flow_to_process: dict):
-        """
-        Process one flow. Only used during detection in testing
-        returns the pandas df with the processed flow
-        """
-        try:
-            # Convert the flow to a pandas dataframe
-            raw_flow = pd.DataFrame(flow_to_process, index=[0])
-            dflow = self.process_features(raw_flow)
-            if dflow.empty:
-                return None
-            # Update the flow to the processed version
-            return dflow
-        except Exception:
-            # Stop the timer
-            self.print("Error in process_flow()")
-            self.print(traceback.format_exc(), 0, 1)
-
-    def detect(self, x_flow) -> Optional[numpy.ndarray]:
-        """
-        Detects the given flow with the current model stored
-        and returns the predection array
-        """
-        try:
-            # clean the flow
-            fields_to_drop = [
-                "label",
-                "module_labels",
-                "uid",
-                "history",
-                "dir_",
-                "endtime",
-                "flow_source",
-                "ground_truth_label",
-                "detailed_ground_truth_label",
-            ]
-            # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes.
-            # Error
-            ''' [Flow ML Detection] Error in detect() while processing                                                                                                                                                                                                                         
-            dur proto  sport dport  state  pkts  spkts  dpkts  bytes  sbytes  dbytes  allbytes                                                                                                                                                                                    
-            0  63.822830     0  56119   981    0.0    15     15      0   8764    1887       0      1887                                                                                                                                                                                    
-            The feature names should match those that were passed during fit.                                                                                                                                                                                                              
-            Feature names unseen at fit time:                                                                                                                                                                                                                                              
-            - bytes     
-            '''
-
-            # IF we delete here the filed bytes the error is
-            # [Flow ML Detection] Error in detect() while processing 
-            # dur proto sport dport  state  pkts  spkts  dpkts  sbytes  dbytes allbytes                                                                                                                                                             
-            # 0  63.822830     0  56120   980    0.0    15     15      0    1887       0      1887                                                                                                                                                             
-            # The feature names should match those that were passed during fit.                                                                                                                                                                                
-            # Feature names must be in the same order as they were in fit.                                                                                                                                                                                     
-                                                                                                                                                                                                                                                 
-            for field in fields_to_drop:
-                try:
-                    x_flow = x_flow.drop(field, axis=1)
-                except (KeyError, ValueError):
-                    pass
-            # Scale the flow
-            x_flow: numpy.ndarray = self.scaler.transform(x_flow)
-            pred: numpy.ndarray = self.clf.predict(x_flow)
-            return pred
-        except Exception as e:
-            self.print(
-                f"Error in detect() while processing " f"\n{x_flow}\n{e}"
-            )
-            self.print(traceback.format_exc(), 0, 1)
-
-    def store_model(self):
-        """
-        Store the trained model on disk
-        """
-        self.print("Storing the trained model and scaler on disk.", 0, 2)
-        with open(self.model_path, "wb") as f:
-            data = pickle.dumps(self.clf)
-            f.write(data)
-        with open(self.scaler_path, "wb") as g:
-            data = pickle.dumps(self.scaler)
-            g.write(data)
-
-    def read_model(self):
-        """
-        Read the trained model from disk
-        """
-        try:
-            self.print("Reading the trained model from disk.", 0, 2)
-            with open(self.model_path, "rb") as f:
-                self.clf = pickle.load(f)
-            self.print("Reading the trained scaler from disk.", 0, 2)
-            with open(self.scaler_path, "rb") as g:
-                self.scaler = pickle.load(g)
-        except FileNotFoundError:
-            # If there is no model, create one empty
-            self.print(
-                "There was no model. " "Creating a new empty model.", 0, 2
-            )
-            self.clf = SGDClassifier(
-                warm_start=True, loss="hinge", penalty="l1"
-            )
-        except EOFError:
-            self.print(
-                "Error reading model from disk. "
-                "Creating a new empty model.",
-                0,
-                2,
-            )
-            self.clf = SGDClassifier(
-                warm_start=True, loss="hinge", penalty="l1"
-            )
-
-    def set_evidence_malicious_flow(self, flow: dict, twid: str):
-        confidence: float = 0.1
-        description = (
-            f"Flow with malicious characteristics by ML. Src IP"
-            f" {flow['saddr']}:{flow['sport']} to "
-            f"{flow['daddr']}:{flow['dport']}"
-        )
-        twid_number = int(twid.replace("timewindow", ""))
-        evidence: Evidence = Evidence(
-            evidence_type=EvidenceType.MALICIOUS_FLOW,
-            attacker=Attacker(
-                direction=Direction.SRC,
-                ioc_type=IoCType.IP,
-                value=flow["saddr"],
-            ),
-            victim=Victim(
-                direction=Direction.DST,
-                ioc_type=IoCType.IP,
-                value=flow["daddr"],
-            ),
-            threat_level=ThreatLevel.LOW,
-            confidence=confidence,
-            description=description,
-            profile=ProfileID(ip=flow["saddr"]),
-            timewindow=TimeWindow(twid_number),
-            uid=[flow["uid"]],
-            timestamp=flow["starttime"],
-            method=Method.AI,
-            src_port=flow["sport"],
-            dst_port=flow["dport"],
-        )
-
-        self.db.set_evidence(evidence)
-
-    def shutdown_gracefully(self):
-        # Confirm that the module is done processing
-        if self.mode == "train":
-            self.store_model()
-
-    def pre_main(self):
-        utils.drop_root_privs()
-        # Load the model
-        self.read_model()
-
-    def main(self):
-        if msg := self.get_msg("new_flow"):
-            # When a new flow arrives
-            msg = json.loads(msg["data"])
-            self.twid = msg["twid"]
-            self.profileid = msg["profileid"]
-            self.flow = msg["flow"]
-            # These following extra fields are expected in testing. update the original
-            # flow dict to have them
-            self.flow.update(
-                {
-                    "state": msg["interpreted_state"],
-                    "label": msg["label"],
-                    "module_labels": msg["module_labels"],
-                }
-            )
-
-            if self.mode == "train":
-                # We are training
-
-                # Is the amount in the DB of labels enough to retrain?
-                # Use labeled flows
-                labels = self.db.get_labels()
-                sum_labeled_flows = sum(i[1] for i in labels)
-
-                # The min labels to retrain is the min number of flows 
-                # we should have seen so far in this capture to start training
-                # This is so we dont _start_ training with only 1 flow
-
-                # Once we are over the start minimum, the second condition is 
-                # to force to retrain every a minimum_labels_to_retrain number
-                # of flows. So we dont retrain every 1 flow.
-                if (
-                    sum_labeled_flows >= self.minimum_labels_to_start_train
-                ):
-                    if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain):
-                        # So for example we retrain every 50 labels and only when
-                        # we have at least 50 labels
-                        self.print(
-                            f"Training the model with the last group of "
-                            f"flows and labels. Total flows: {sum_labeled_flows}."
-                        )
-                        # Process all flows in the DB and make them ready
-                        # for pandas
-                        self.process_training_flows(self.last_number_of_flows_when_trained)
-                        # Train an algorithm
-                        self.train(sum_labeled_flows)
-                        self.last_number_of_flows_when_trained = sum_labeled_flows
-
-            elif self.mode == "test":
-                # We are testing, which means using the model to detect
-                processed_flow = self.process_flow(self.flow)
-
-                # After processing the flow, it may happen that we
-                # delete icmp/arp/etc so the dataframe can be empty
-                if processed_flow is not None and not processed_flow.empty:
-                    # Predict
-                    pred: numpy.ndarray = self.detect(processed_flow)
-                    if not pred:
-                        # an error occurred
-                        return
-
-                    label = self.flow["label"]
-                    if label and label != "unknown" and label != pred[0]:
-                        # If the user specified a label in test mode,
-                        # and the label is diff from the prediction,
-                        # print in debug mode
-                        self.print(
-                            f"Predicted {pred[0]} for ground-truth label"
-                            f' {label}. Flow {self.flow["saddr"]}:'
-                            f'{self.flow["sport"]} ->'
-                            f' {self.flow["daddr"]}:'
-                            f'{self.flow["dport"]}/'
-                            f'{self.flow["proto"]}',
-                            0,
-                            3,
-                        )
-                    if pred[0] == "Malicious":
-                        # Generate an alert
-                        self.set_evidence_malicious_flow(self.flow, self.twid)
-                        self.print(
-                            f"Prediction {pred[0]} for label {label}"
-                            f' flow {self.flow["saddr"]}:'
-                            f'{self.flow["sport"]} -> '
-                            f'{self.flow["daddr"]}:'
-                            f'{self.flow["dport"]}/'
-                            f'{self.flow["proto"]}',
-                            0,
-                            2,
-                        )
+import matplotlib.pyplot as plt
+import sys
+import numpy as np
+
+def process_file(file_path):
+    # Initialize the counters for the values
+    FPR_values = []
+    FNR_values = []
+    TNR_values = []
+    TPR_values = []
+    F1_values = []
+    accuracy_values = []
+    precision_values = []
+    MCC_values = []
+    recall_values = []
+    
+    # Read the file and extract the data
+    with open(file_path, 'r') as file:
+        for line in file:
+            if "TP:" in line:
+                # Extract the values from the line
+                parts = line.split(',')
+                TP = int(parts[0].split(':')[1].strip())
+                TN = int(parts[1].split(':')[1].strip())
+                FP = int(parts[2].split(':')[1].strip())
+                FN = int(parts[3].split(':')[1].strip())
+
+                # Calculate metrics
+                FPR = FP / (FP + TN) if (FP + TN) != 0 else 0
+                FNR = FN / (FN + TP) if (FN + TP) != 0 else 0
+                TNR = TN / (TN + FP) if (TN + FP) != 0 else 0
+                TPR = TP / (TP + FN) if (TP + FN) != 0 else 0
+                Precision = TP / (TP + FP) if (TP + FP) != 0 else 0
+                Recall = TPR  # Recall is the same as TPR
+                F1 = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) != 0 else 0
+                Accuracy = (TP + TN) / (TP + TN + FP + FN)
+                MCC = ((TP * TN) - (FP * FN)) / np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) if ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) != 0 else 0
+                
+                # Append the values to the respective lists
+                FPR_values.append(FPR)
+                FNR_values.append(FNR)
+                TNR_values.append(TNR)
+                TPR_values.append(TPR)
+                F1_values.append(F1)
+                accuracy_values.append(Accuracy)
+                precision_values.append(Precision)
+                MCC_values.append(MCC)
+                recall_values.append(Recall)
+    
+    return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values
+
+def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values):
+    # Separate the values into two groups based on their proximity to 0 or 1
+    close_to_0 = {
+        'FPR': [], 'FNR': []
+    }
+    close_to_1 = {
+        'TNR': [], 'TPR': [], 'F1': [], 'accuracy': [], 'precision': [], 'MCC': [], 'recall': []
+    }
+    
+    # Categorize the metrics into two groups
+    for i in range(len(FPR_values)):
+        close_to_0['FPR'].append(FPR_values[i])
+        close_to_0['FNR'].append(FNR_values[i])
+        
+        close_to_1['TNR'].append(TNR_values[i])
+        close_to_1['TPR'].append(TPR_values[i])
+        close_to_1['F1'].append(F1_values[i])
+        close_to_1['accuracy'].append(accuracy_values[i])
+        close_to_1['precision'].append(precision_values[i])
+        close_to_1['MCC'].append(MCC_values[i])
+        close_to_1['recall'].append(recall_values[i])
+
+    # Plot metrics for values close to 0
+    plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True)
+    
+    # Plot metrics for values close to 1
+    plot_single_group(close_to_1, 'metrics_plot_close_to_1.png')
+
+    # Print the final values
+    print("\nFinal Metric Values:")
+    print(f"Final FPR: {FPR_values[-1]:.4f}")
+    print(f"Final FNR: {FNR_values[-1]:.4f}")
+    print(f"Final TNR: {TNR_values[-1]:.4f}")
+    print(f"Final TPR: {TPR_values[-1]:.4f}")
+    print(f"Final F1 Score: {F1_values[-1]:.4f}")
+    print(f"Final Accuracy: {accuracy_values[-1]:.4f}")
+    print(f"Final Precision: {precision_values[-1]:.4f}")
+    print(f"Final MCC: {MCC_values[-1]:.4f}")
+    print(f"Final Recall: {recall_values[-1]:.4f}")
+
+def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):
+    plt.figure(figsize=(12, 8))
+    
+    # Only plot the metrics that exist in the dictionary
+    if 'FPR' in metrics_dict:
+        plt.plot(metrics_dict['FPR'], label='False Positive Rate (FPR)', marker='o')
+    if 'FNR' in metrics_dict:
+        plt.plot(metrics_dict['FNR'], label='False Negative Rate (FNR)', marker='o')
+    if 'TNR' in metrics_dict:
+        plt.plot(metrics_dict['TNR'], label='True Negative Rate (TNR)', marker='o')
+    if 'TPR' in metrics_dict:
+        plt.plot(metrics_dict['TPR'], label='True Positive Rate (TPR)', marker='o')
+    if 'F1' in metrics_dict:
+        plt.plot(metrics_dict['F1'], label='F1 Score', marker='o')
+    if 'accuracy' in metrics_dict:
+        plt.plot(metrics_dict['accuracy'], label='Accuracy', marker='o')
+    if 'precision' in metrics_dict:
+        plt.plot(metrics_dict['precision'], label='Precision', marker='o')
+    if 'MCC' in metrics_dict:
+        plt.plot(metrics_dict['MCC'], label='Matthews Correlation Coefficient (MCC)', marker='o')
+    if 'recall' in metrics_dict:
+        plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o')
+
+    # Apply log scale by default
+    plt.yscale('log')
+
+    # If the plot is close to 0, set custom ticks
+    if is_close_to_0:
+        # Add more ticks between 0 and 1 (using a logarithmic scale)
+        plt.yticks([0.01, 0.1, 1, 10, 100], ['0.01', '0.1', '1', '10', '100'])
+
+    plt.xlabel('Index')
+    plt.ylabel('Metric Value')
+    plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})')
+    plt.legend()
+    
+    # Save the plot
+    plt.savefig(output_filename)
+    plt.close()
+
+def main():
+    if len(sys.argv) != 2:
+        print("Usage: python script.py <file_path>")
+        sys.exit(1)
+    
+    file_path = sys.argv[1]
+    
+    FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path)
+    plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values)
+
+if __name__ == "__main__":
+    main()

From 672a109958264697c25f80d7a25881c93752ce2e Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:20:52 +0000
Subject: [PATCH 331/455] Fix plots

---
 .../plot_testing_performance.py               | 25 ++++++++++++-------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index 8f9e12cd86..69b8c96a8c 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -71,11 +71,11 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu
         close_to_1['MCC'].append(MCC_values[i])
         close_to_1['recall'].append(recall_values[i])
 
-    # Plot metrics for values close to 0
+    # Plot metrics for values close to 0 (linear scale)
     plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True)
     
-    # Plot metrics for values close to 1
-    plot_single_group(close_to_1, 'metrics_plot_close_to_1.png')
+    # Plot metrics for values close to 1 (log scale)
+    plot_single_group(close_to_1, 'metrics_plot_close_to_1.png', is_close_to_0=False)
 
     # Print the final values
     print("\nFinal Metric Values:")
@@ -112,14 +112,21 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):
     if 'recall' in metrics_dict:
         plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o')
 
-    # Apply log scale by default
-    plt.yscale('log')
+    # If the plot is close to 1, apply log scale
+    if not is_close_to_0:
+        plt.yscale('log')
 
-    # If the plot is close to 0, set custom ticks
+    # If the plot is close to 0, set dynamic Y-ticks based on the min/max values of the series
     if is_close_to_0:
-        # Manually set more Y-ticks for better visibility
-        plt.ylim(0.0001, 1)  # Set Y-axis limits between 0.0001 and 1
-        plt.yticks([0.0001, 0.001, 0.01, 0.1, 1], ['0.0001', '0.001', '0.01', '0.1', '1'])  # Adjust Y-ticks
+        min_val = min(min(metrics_dict['FPR']), min(metrics_dict['FNR']))
+        max_val = max(max(metrics_dict['FPR']), max(metrics_dict['FNR']))
+        
+        # Avoid log(0), so set the minimum limit a little higher than zero
+        if min_val == 0:
+            min_val = 1e-4  # Avoid zero values on the logarithmic scale
+
+        plt.ylim(min_val, max_val)  # Set Y-axis limits based on the data range
+        plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6))  # Set ticks logarithmically
 
     plt.xlabel('Index')
     plt.ylabel('Metric Value')

From aa87ed17add17251345579b8963bda7230043c6b Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:23:02 +0000
Subject: [PATCH 332/455] Fix plots

---
 modules/flowmldetection/plot_testing_performance.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index 69b8c96a8c..de4ada38b3 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -123,10 +123,10 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):
         
         # Avoid log(0), so set the minimum limit a little higher than zero
         if min_val == 0:
-            min_val = 1e-4  # Avoid zero values on the logarithmic scale
+            min_val = 1e-8  # Avoid zero values on the logarithmic scale
 
         plt.ylim(min_val, max_val)  # Set Y-axis limits based on the data range
-        plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6))  # Set ticks logarithmically
+        plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=60))  # Set ticks logarithmically
 
     plt.xlabel('Index')
     plt.ylabel('Metric Value')

From 148181f2d4f0d08df508dc85b545c5a18f2a6c3b Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:25:58 +0000
Subject: [PATCH 333/455] Change plot names

---
 modules/flowmldetection/plot_testing_performance.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index de4ada38b3..1b4152c6eb 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -72,10 +72,10 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu
         close_to_1['recall'].append(recall_values[i])
 
     # Plot metrics for values close to 0 (linear scale)
-    plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True)
+    plot_single_group(close_to_0, 'performance_metrics_testing_close_to_0.png', is_close_to_0=True)
     
     # Plot metrics for values close to 1 (log scale)
-    plot_single_group(close_to_1, 'metrics_plot_close_to_1.png', is_close_to_0=False)
+    plot_single_group(close_to_1, 'performnace_metrics_teting_close_to_1.png', is_close_to_0=False)
 
     # Print the final values
     print("\nFinal Metric Values:")

From 057beb3ae401f31c605fe6845957090faec1e195 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:26:09 +0000
Subject: [PATCH 334/455] Rename file

---
 .../{plot_train_score.py => plot_train_performance.py}          | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 rename modules/flowmldetection/{plot_train_score.py => plot_train_performance.py} (97%)

diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_performance.py
similarity index 97%
rename from modules/flowmldetection/plot_train_score.py
rename to modules/flowmldetection/plot_train_performance.py
index 8437e968ac..80e13e9515 100644
--- a/modules/flowmldetection/plot_train_score.py
+++ b/modules/flowmldetection/plot_train_performance.py
@@ -28,7 +28,7 @@ def plot_log_data(file_path):
 
     # Get the directory of the log file to store the plot in the same folder
     dir_name = os.path.dirname(file_path)
-    plot_file = os.path.join(dir_name, 'log_data_plot_with_two_scales.png')
+    plot_file = os.path.join(dir_name, 'performance_metrics_training.png')
 
     # Plotting the values
     fig, ax1 = plt.subplots(figsize=(10, 6))

From f8aa2eb76ccca709d051497f7ca76b8316de4a47 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:31:32 +0000
Subject: [PATCH 335/455] Recover good flowmldetection deleted by mistake

---
 modules/flowmldetection/flowmldetection.py | 709 ++++++++++++++++-----
 1 file changed, 566 insertions(+), 143 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 37f0761109..5e4e9aa462 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -1,143 +1,566 @@
-import matplotlib.pyplot as plt
-import sys
-import numpy as np
-
-def process_file(file_path):
-    # Initialize the counters for the values
-    FPR_values = []
-    FNR_values = []
-    TNR_values = []
-    TPR_values = []
-    F1_values = []
-    accuracy_values = []
-    precision_values = []
-    MCC_values = []
-    recall_values = []
-    
-    # Read the file and extract the data
-    with open(file_path, 'r') as file:
-        for line in file:
-            if "TP:" in line:
-                # Extract the values from the line
-                parts = line.split(',')
-                TP = int(parts[0].split(':')[1].strip())
-                TN = int(parts[1].split(':')[1].strip())
-                FP = int(parts[2].split(':')[1].strip())
-                FN = int(parts[3].split(':')[1].strip())
-
-                # Calculate metrics
-                FPR = FP / (FP + TN) if (FP + TN) != 0 else 0
-                FNR = FN / (FN + TP) if (FN + TP) != 0 else 0
-                TNR = TN / (TN + FP) if (TN + FP) != 0 else 0
-                TPR = TP / (TP + FN) if (TP + FN) != 0 else 0
-                Precision = TP / (TP + FP) if (TP + FP) != 0 else 0
-                Recall = TPR  # Recall is the same as TPR
-                F1 = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) != 0 else 0
-                Accuracy = (TP + TN) / (TP + TN + FP + FN)
-                MCC = ((TP * TN) - (FP * FN)) / np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) if ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) != 0 else 0
-                
-                # Append the values to the respective lists
-                FPR_values.append(FPR)
-                FNR_values.append(FNR)
-                TNR_values.append(TNR)
-                TPR_values.append(TPR)
-                F1_values.append(F1)
-                accuracy_values.append(Accuracy)
-                precision_values.append(Precision)
-                MCC_values.append(MCC)
-                recall_values.append(Recall)
-    
-    return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values
-
-def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values):
-    # Separate the values into two groups based on their proximity to 0 or 1
-    close_to_0 = {
-        'FPR': [], 'FNR': []
-    }
-    close_to_1 = {
-        'TNR': [], 'TPR': [], 'F1': [], 'accuracy': [], 'precision': [], 'MCC': [], 'recall': []
-    }
-    
-    # Categorize the metrics into two groups
-    for i in range(len(FPR_values)):
-        close_to_0['FPR'].append(FPR_values[i])
-        close_to_0['FNR'].append(FNR_values[i])
-        
-        close_to_1['TNR'].append(TNR_values[i])
-        close_to_1['TPR'].append(TPR_values[i])
-        close_to_1['F1'].append(F1_values[i])
-        close_to_1['accuracy'].append(accuracy_values[i])
-        close_to_1['precision'].append(precision_values[i])
-        close_to_1['MCC'].append(MCC_values[i])
-        close_to_1['recall'].append(recall_values[i])
-
-    # Plot metrics for values close to 0
-    plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True)
-    
-    # Plot metrics for values close to 1
-    plot_single_group(close_to_1, 'metrics_plot_close_to_1.png')
-
-    # Print the final values
-    print("\nFinal Metric Values:")
-    print(f"Final FPR: {FPR_values[-1]:.4f}")
-    print(f"Final FNR: {FNR_values[-1]:.4f}")
-    print(f"Final TNR: {TNR_values[-1]:.4f}")
-    print(f"Final TPR: {TPR_values[-1]:.4f}")
-    print(f"Final F1 Score: {F1_values[-1]:.4f}")
-    print(f"Final Accuracy: {accuracy_values[-1]:.4f}")
-    print(f"Final Precision: {precision_values[-1]:.4f}")
-    print(f"Final MCC: {MCC_values[-1]:.4f}")
-    print(f"Final Recall: {recall_values[-1]:.4f}")
-
-def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):
-    plt.figure(figsize=(12, 8))
-    
-    # Only plot the metrics that exist in the dictionary
-    if 'FPR' in metrics_dict:
-        plt.plot(metrics_dict['FPR'], label='False Positive Rate (FPR)', marker='o')
-    if 'FNR' in metrics_dict:
-        plt.plot(metrics_dict['FNR'], label='False Negative Rate (FNR)', marker='o')
-    if 'TNR' in metrics_dict:
-        plt.plot(metrics_dict['TNR'], label='True Negative Rate (TNR)', marker='o')
-    if 'TPR' in metrics_dict:
-        plt.plot(metrics_dict['TPR'], label='True Positive Rate (TPR)', marker='o')
-    if 'F1' in metrics_dict:
-        plt.plot(metrics_dict['F1'], label='F1 Score', marker='o')
-    if 'accuracy' in metrics_dict:
-        plt.plot(metrics_dict['accuracy'], label='Accuracy', marker='o')
-    if 'precision' in metrics_dict:
-        plt.plot(metrics_dict['precision'], label='Precision', marker='o')
-    if 'MCC' in metrics_dict:
-        plt.plot(metrics_dict['MCC'], label='Matthews Correlation Coefficient (MCC)', marker='o')
-    if 'recall' in metrics_dict:
-        plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o')
-
-    # Apply log scale by default
-    plt.yscale('log')
-
-    # If the plot is close to 0, set custom ticks
-    if is_close_to_0:
-        # Add more ticks between 0 and 1 (using a logarithmic scale)
-        plt.yticks([0.01, 0.1, 1, 10, 100], ['0.01', '0.1', '1', '10', '100'])
-
-    plt.xlabel('Index')
-    plt.ylabel('Metric Value')
-    plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})')
-    plt.legend()
-    
-    # Save the plot
-    plt.savefig(output_filename)
-    plt.close()
-
-def main():
-    if len(sys.argv) != 2:
-        print("Usage: python script.py <file_path>")
-        sys.exit(1)
-    
-    file_path = sys.argv[1]
-    
-    FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path)
-    plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values)
-
-if __name__ == "__main__":
-    main()
+# SPDX-FileCopyrightText: 2021 Sebastian Garcia <sebastian.garcia@agents.fel.cvut.cz>
+from typing import Optional
+
+# SPDX-License-Identifier: GPL-2.0-only
+import numpy
+from sklearn.linear_model import SGDClassifier
+from sklearn.preprocessing import StandardScaler
+import pickle
+import pandas as pd
+import json
+import traceback
+import warnings
+import os
+
+from slips_files.common.parsers.config_parser import ConfigParser
+from slips_files.common.slips_utils import utils
+from slips_files.common.abstracts.module import IModule
+from slips_files.core.structures.evidence import (
+    Evidence,
+    ProfileID,
+    TimeWindow,
+    Attacker,
+    ThreatLevel,
+    EvidenceType,
+    IoCType,
+    Direction,
+    Victim,
+    Method,
+)
+
+# This horrible hack is only to stop sklearn from printing those warnings
+def warn(*args, **kwargs):
+    pass
+
+
+warnings.warn = warn
+
+
+class FlowMLDetection(IModule):
+    # Name: short name of the module. Do not use spaces
+    name = "Flow ML Detection"
+    description = (
+        "Train or test a Machine Learning model to detect malicious flows"
+    )
+    authors = ["Sebastian Garcia"]
+
+    def init(self):
+        # Subscribe to the channel
+        self.c1 = self.db.subscribe("new_flow")
+        self.channels = {"new_flow": self.c1}
+        self.fieldseparator = self.db.get_field_separator()
+        # Set the output queue of our database instance
+        # Read the configuration
+        self.read_configuration()
+        # Minum amount of new labels needed to start the train
+        self.minimum_labels_to_start_train = 50
+        # Minum amount of new labels needed to retrain
+        self.minimum_labels_to_retrain = 50
+        # The number of flows when last trained. Used internally only to know
+        # when to retrain
+        self.last_number_of_flows_when_trained = 0
+        # The scaler trained during training and to use during testing
+        self.scaler = StandardScaler()
+        self.model_path = "./modules/flowmldetection/model.bin"
+        self.scaler_path = "./modules/flowmldetection/scaler.bin"
+
+        # Initialize the training log file
+        self.training_log_path = "./modules/flowmldetection/training.log"
+        with open(self.training_log_path, "w") as log_file:
+            log_file.write("Training Log Initialized\n")
+
+    def read_configuration(self):
+        conf = ConfigParser()
+        self.mode = conf.get_ml_mode()
+        # This is the global label in the configuration,
+        # in case the flows do not have a label themselves
+        self.label = conf.label()
+
+    def write_to_training_log(self, message: str):
+        """
+        Write a message to the training log file.
+        """
+        try:
+            with open(self.training_log_path, "a") as log_file:
+                log_file.write(message + "\n")
+        except Exception as e:
+            self.print(f"Error writing to training log: {e}", 0, 1)
+
+    def train(self, sum_labeled_flows):
+        """
+        Train a model based on the flows we receive and the labels
+        """
+        try:
+            # Create X_flow with the current flows minus the label
+            X_flow = self.flows.drop("ground_truth_label", axis=1)
+            # Drop the detailed labels
+            X_flow = X_flow.drop("detailed_ground_truth_label", axis=1)
+            # Drop the module_labels
+            X_flow = X_flow.drop("module_labels", axis=1)
+            # Create y_flow with the label
+            y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label)
+
+            # Normalize this batch of data so far. This can get progressivle slow
+            X_flow = self.scaler.fit_transform(X_flow)
+
+            # Count the number of labels of each type in this epoc
+            epoch_label_counts = {
+                "Background": (y_flow == "Background").sum(),
+                "Malicious": (y_flow == "Malicious").sum(),
+                "Benign": (y_flow == "Benign").sum(),
+            }
+
+            # Train
+            try:
+                # Online incremental learning
+                self.clf.partial_fit(
+                    X_flow, y_flow, classes=["Background", "Malicious", "Benign"]
+                )
+            except Exception:
+                self.print("Error while calling clf.train()")
+                self.print(traceback.format_exc(), 0, 1)
+
+            # See score so far in training
+            score = self.clf.score(X_flow, y_flow)
+
+            #self.print(f"	Training Score: {score}", 1, 0)
+            #self.print(f'    Model Parameters: {self.clf.coef_}', 1, 0)
+
+            # Store the models on disk
+            self.store_model()
+
+            # Log training information
+            self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}")
+            #self.write_to_training_log(f"Model parameters: {self.clf.coef_}")
+        except Exception:
+            self.print("Error in train().", 0, 1)
+            self.print(traceback.format_exc(), 0, 1)
+            self.write_to_training_log("Error occurred during training.")
+
+    def process_features(self, dataset):
+        """
+        Discards some features of the dataset and can create new.
+        Clean the dataset
+        """
+        try:
+            # Discard some type of flows that dont have ports
+            to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""]
+            for proto in to_discard:
+                dataset = dataset[dataset.proto != proto]
+
+            # If te proto is in the list to delete and there is only one flow, then the dataset will be empty
+            if dataset.empty:
+                # DataFrame is empty now, so return empty
+                return dataset
+
+            # For now, discard these
+            to_drop = [
+                "appproto",
+                "daddr",
+                "saddr",
+                "starttime",
+                "type_",
+                "smac",
+                "dmac",
+                "history",
+                "uid",
+                "dir_",
+                "endtime",
+                "flow_source",
+            ]
+            for field in to_drop:
+                try:
+                    dataset = dataset.drop(field, axis=1)
+                except (ValueError, KeyError):
+                    pass
+
+            # When flows are read from Slips sqlite,
+            # the state is not transformed to 'Established' or
+            # 'Not Established', it is still 'S0' and others
+            # So transform here
+            dataset["state"] = dataset.apply(
+                lambda row: self.db.get_final_state_from_flags(
+                    row["state"], (row["spkts"] + row["dpkts"])
+                ),
+                axis=1,
+            )
+
+            # Convert state to categorical
+            dataset.state = dataset.state.str.replace(
+                r"(^.*Not Established.*$)", "0", regex=True
+            )
+            dataset.state = dataset.state.str.replace(
+                r"(^.*Established.*$)", "1", regex=True
+            )
+
+            # Convert categories to floats
+            dataset.state = dataset.state.astype("float64")
+
+            # Convert proto to categorical. For now we only have few states, so we can hardcode...
+            # We dont use the data to create categories because in testing mode
+            # we dont see all the protocols
+            # Also we dont store the Categorizer because the user can retrain
+            # with its own data.
+            dataset.proto = dataset.proto.str.lower()
+            dataset.proto = dataset.proto.str.replace(
+                r"(^.*tcp.*$)", "0", regex=True
+            )
+            dataset.proto = dataset.proto.str.replace(
+                r"(^.*udp.*$)", "1", regex=True
+            )
+            dataset.proto = dataset.proto.str.replace(
+                r"(^.*icmp.*$)", "2", regex=True
+            )
+            dataset.proto = dataset.proto.str.replace(
+                r"(^.*icmp-ipv6.*$)", "3", regex=True
+            )
+            dataset.proto = dataset.proto.str.replace(
+                r"(^.*arp.*$)", "4", regex=True
+            )
+
+            dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"]
+            dataset["pkts"] = dataset["spkts"] + dataset["dpkts"]
+
+            fields_to_convert_to_float = [
+                dataset.proto,
+                dataset.dport,
+                dataset.sport,
+                dataset.dur,
+                dataset.pkts,
+                dataset.spkts,
+                dataset.allbytes,
+                dataset.sbytes,
+                dataset.state,
+            ]
+            for field in fields_to_convert_to_float:
+                try:
+                    field = field.astype("float64")
+                except (ValueError, AttributeError):
+                    pass
+
+            return dataset
+        except Exception:
+            # Stop the timer
+            self.print("Error in process_features()")
+            self.print(traceback.format_exc(), 0, 1)
+
+    def process_training_flows(self, last_number_of_flows_when_trained):
+        """
+        Process only the new flows in the DB since the last training.
+        Store the pandas df in self.flows
+        """
+        try:
+            # Ensure the index is an integer
+            if last_number_of_flows_when_trained is None:
+                last_number_of_flows_when_trained = 0
+            else:
+                last_number_of_flows_when_trained = int(last_number_of_flows_when_trained)
+
+            # We get all the flows so far
+            flows = self.db.get_all_flows()
+            # Only process new flows since last training
+            new_flows = flows[last_number_of_flows_when_trained:]
+
+            # Check how many **different** labels are in the DB
+            labels = self.db.get_labels()
+            if len(labels) == 1:
+                # Insert fake flows for both classes if needed
+                new_flows.append(
+                    {
+                        "starttime": 1594417039.029793,
+                        "dur": "1.9424750804901123",
+                        "saddr": "10.7.10.101",
+                        "sport": "49733",
+                        "daddr": "40.70.224.145",
+                        "dport": "443",
+                        "proto": "tcp",
+                        "state": "SF",
+                        "spkts": 17,
+                        "dpkts": 27,
+                        "sbytes": 25517,
+                        "dbytes": 17247,
+                        "appproto": "ssl",
+                        "ground_truth_label": "Malicious",
+                        "module_labels": {
+                            "flowalerts-long-connection": "Malicious"
+                        },
+                    }
+                )
+                new_flows.append(
+                    {
+                        "starttime": 1382355032.706468,
+                        "dur": "10.896695",
+                        "saddr": "147.32.83.52",
+                        "sport": "47956",
+                        "daddr": "80.242.138.72",
+                        "dport": "80",
+                        "proto": "tcp",
+                        "state": "SF",
+                        "spkts": 1,
+                        "dpkts": 0,
+                        "sbytes": 100,
+                        "dbytes": 67596,
+                        "appproto": "http",
+                        "ground_truth_label": "Benign",
+                        "module_labels": {
+                            "flowalerts-long-connection": "Benign"
+                        },
+                    }
+                )
+
+            # Convert to pandas df
+            df_flows = pd.DataFrame(new_flows)
+
+            # Process features
+            df_flows = self.process_features(df_flows)
+
+            # Update the flow to the processed version
+            self.flows = df_flows
+        except Exception:
+            self.print("Error in process_flows()")
+            self.print(traceback.format_exc(), 0, 1)
+
+    def process_flow(self, flow_to_process: dict):
+        """
+        Process one flow. Only used during detection in testing
+        returns the pandas df with the processed flow
+        """
+        try:
+            # Convert the flow to a pandas dataframe
+            raw_flow = pd.DataFrame(flow_to_process, index=[0])
+            dflow = self.process_features(raw_flow)
+            if dflow.empty:
+                return None
+            # Update the flow to the processed version
+            return dflow
+        except Exception:
+            # Stop the timer
+            self.print("Error in process_flow()")
+            self.print(traceback.format_exc(), 0, 1)
+
+    def detect(self, x_flow) -> Optional[numpy.ndarray]:
+        """
+        Detects the given flow with the current model stored
+        and returns the predection array
+        """
+        try:
+            # clean the flow
+            fields_to_drop = [
+                "label",
+                "module_labels",
+                "uid",
+                "history",
+                "dir_",
+                "endtime",
+                "flow_source",
+                "ground_truth_label",
+                "detailed_ground_truth_label",
+            ]
+            # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes.
+            # Error
+            ''' [Flow ML Detection] Error in detect() while processing                                                                                                                                                                                                                         
+            dur proto  sport dport  state  pkts  spkts  dpkts  bytes  sbytes  dbytes  allbytes                                                                                                                                                                                    
+            0  63.822830     0  56119   981    0.0    15     15      0   8764    1887       0      1887                                                                                                                                                                                    
+            The feature names should match those that were passed during fit.                                                                                                                                                                                                              
+            Feature names unseen at fit time:                                                                                                                                                                                                                                              
+            - bytes     
+            '''
+
+            # IF we delete here the filed bytes the error is
+            # [Flow ML Detection] Error in detect() while processing 
+            # dur proto sport dport  state  pkts  spkts  dpkts  sbytes  dbytes allbytes                                                                                                                                                             
+            # 0  63.822830     0  56120   980    0.0    15     15      0    1887       0      1887                                                                                                                                                             
+            # The feature names should match those that were passed during fit.                                                                                                                                                                                
+            # Feature names must be in the same order as they were in fit.                                                                                                                                                                                     
+                                                                                                                                                                                                                                                 
+            for field in fields_to_drop:
+                try:
+                    x_flow = x_flow.drop(field, axis=1)
+                except (KeyError, ValueError):
+                    pass
+            # Scale the flow
+            x_flow: numpy.ndarray = self.scaler.transform(x_flow)
+            pred: numpy.ndarray = self.clf.predict(x_flow)
+            return pred
+        except Exception as e:
+            self.print(
+                f"Error in detect() while processing " f"\n{x_flow}\n{e}"
+            )
+            self.print(traceback.format_exc(), 0, 1)
+
+    def store_model(self):
+        """
+        Store the trained model on disk
+        """
+        self.print("Storing the trained model and scaler on disk.", 0, 2)
+        with open(self.model_path, "wb") as f:
+            data = pickle.dumps(self.clf)
+            f.write(data)
+        with open(self.scaler_path, "wb") as g:
+            data = pickle.dumps(self.scaler)
+            g.write(data)
+
+    def read_model(self):
+        """
+        Read the trained model from disk
+        """
+        try:
+            self.print("Reading the trained model from disk.", 0, 2)
+            with open(self.model_path, "rb") as f:
+                self.clf = pickle.load(f)
+            self.print("Reading the trained scaler from disk.", 0, 2)
+            with open(self.scaler_path, "rb") as g:
+                self.scaler = pickle.load(g)
+        except FileNotFoundError:
+            # If there is no model, create one empty
+            self.print(
+                "There was no model. " "Creating a new empty model.", 0, 2
+            )
+            self.clf = SGDClassifier(
+                warm_start=True, loss="hinge", penalty="l1"
+            )
+        except EOFError:
+            self.print(
+                "Error reading model from disk. "
+                "Creating a new empty model.",
+                0,
+                2,
+            )
+            self.clf = SGDClassifier(
+                warm_start=True, loss="hinge", penalty="l1"
+            )
+
+    def set_evidence_malicious_flow(self, flow: dict, twid: str):
+        confidence: float = 0.1
+        description = (
+            f"Flow with malicious characteristics by ML. Src IP"
+            f" {flow['saddr']}:{flow['sport']} to "
+            f"{flow['daddr']}:{flow['dport']}"
+        )
+        twid_number = int(twid.replace("timewindow", ""))
+        evidence: Evidence = Evidence(
+            evidence_type=EvidenceType.MALICIOUS_FLOW,
+            attacker=Attacker(
+                direction=Direction.SRC,
+                ioc_type=IoCType.IP,
+                value=flow["saddr"],
+            ),
+            victim=Victim(
+                direction=Direction.DST,
+                ioc_type=IoCType.IP,
+                value=flow["daddr"],
+            ),
+            threat_level=ThreatLevel.LOW,
+            confidence=confidence,
+            description=description,
+            profile=ProfileID(ip=flow["saddr"]),
+            timewindow=TimeWindow(twid_number),
+            uid=[flow["uid"]],
+            timestamp=flow["starttime"],
+            method=Method.AI,
+            src_port=flow["sport"],
+            dst_port=flow["dport"],
+        )
+
+        self.db.set_evidence(evidence)
+
+    def shutdown_gracefully(self):
+        # Confirm that the module is done processing
+        if self.mode == "train":
+            self.store_model()
+
+    def pre_main(self):
+        utils.drop_root_privs()
+        # Load the model
+        self.read_model()
+
+    def main(self):
+        if msg := self.get_msg("new_flow"):
+            # When a new flow arrives
+            msg = json.loads(msg["data"])
+            self.twid = msg["twid"]
+            self.profileid = msg["profileid"]
+            self.flow = msg["flow"]
+            # These following extra fields are expected in testing. update the original
+            # flow dict to have them
+            self.flow.update(
+                {
+                    "state": msg["interpreted_state"],
+                    "label": msg["label"],
+                    "module_labels": msg["module_labels"],
+                }
+            )
+
+            if self.mode == "train":
+                # We are training
+
+                # Is the amount in the DB of labels enough to retrain?
+                # Use labeled flows
+                labels = self.db.get_labels()
+                sum_labeled_flows = sum(i[1] for i in labels)
+
+                # The min labels to retrain is the min number of flows 
+                # we should have seen so far in this capture to start training
+                # This is so we dont _start_ training with only 1 flow
+
+                # Once we are over the start minimum, the second condition is 
+                # to force to retrain every a minimum_labels_to_retrain number
+                # of flows. So we dont retrain every 1 flow.
+                if (
+                    sum_labeled_flows >= self.minimum_labels_to_start_train
+                ):
+                    if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain):
+                        # So for example we retrain every 50 labels and only when
+                        # we have at least 50 labels
+                        self.print(
+                            f"Training the model with the last group of "
+                            f"flows and labels. Total flows: {sum_labeled_flows}."
+                        )
+                        # Process all flows in the DB and make them ready
+                        # for pandas
+                        self.process_training_flows(self.last_number_of_flows_when_trained)
+                        # Train an algorithm
+                        self.train(sum_labeled_flows)
+                        self.last_number_of_flows_when_trained = sum_labeled_flows
+
+            elif self.mode == "test":
+                # We are testing, which means using the model to detect
+                processed_flow = self.process_flow(self.flow)
+
+                # After processing the flow, it may happen that we
+                # delete icmp/arp/etc so the dataframe can be empty
+                if processed_flow is not None and not processed_flow.empty:
+                    # Predict
+                    pred: numpy.ndarray = self.detect(processed_flow)
+                    if not pred:
+                        # an error occurred
+                        return
+
+                    label = self.flow["label"]
+                    if label and label != "unknown" and label != pred[0]:
+                        # If the user specified a label in test mode,
+                        # and the label is diff from the prediction,
+                        # print in debug mode
+                        self.print(
+                            f"Predicted {pred[0]} for ground-truth label"
+                            f' {label}. Flow {self.flow["saddr"]}:'
+                            f'{self.flow["sport"]} ->'
+                            f' {self.flow["daddr"]}:'
+                            f'{self.flow["dport"]}/'
+                            f'{self.flow["proto"]}',
+                            0,
+                            3,
+                        )
+                    if pred[0] == "Malicious":
+                        # Generate an alert
+                        self.set_evidence_malicious_flow(self.flow, self.twid)
+                        self.print(
+                            f"Prediction {pred[0]} for label {label}"
+                            f' flow {self.flow["saddr"]}:'
+                            f'{self.flow["sport"]} -> '
+                            f'{self.flow["daddr"]}:'
+                            f'{self.flow["dport"]}/'
+                            f'{self.flow["proto"]}',
+                            0,
+                            2,
+                        )
\ No newline at end of file

From f53d7e6c8528af2bf011039e37324b1249bfbaa8 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:31:43 +0000
Subject: [PATCH 336/455] Fix plot test

---
 modules/flowmldetection/plot_testing_performance.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index 1b4152c6eb..977a68b2d5 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -75,7 +75,7 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu
     plot_single_group(close_to_0, 'performance_metrics_testing_close_to_0.png', is_close_to_0=True)
     
     # Plot metrics for values close to 1 (log scale)
-    plot_single_group(close_to_1, 'performnace_metrics_teting_close_to_1.png', is_close_to_0=False)
+    plot_single_group(close_to_1, 'performance_metrics_testing_close_to_1.png', is_close_to_0=False)
 
     # Print the final values
     print("\nFinal Metric Values:")
@@ -123,10 +123,10 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):
         
         # Avoid log(0), so set the minimum limit a little higher than zero
         if min_val == 0:
-            min_val = 1e-8  # Avoid zero values on the logarithmic scale
+            min_val = 1e-4  # Avoid zero values on the logarithmic scale
 
         plt.ylim(min_val, max_val)  # Set Y-axis limits based on the data range
-        plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=60))  # Set ticks logarithmically
+        plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6))  # Set ticks logarithmically
 
     plt.xlabel('Index')
     plt.ylabel('Metric Value')

From 6a2c1379d07b8d65b1e9fbbd3c6c64061723f8b7 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:50:33 +0000
Subject: [PATCH 337/455] Add testing code to evaluate performance. It is
 optional with a variable

---
 modules/flowmldetection/flowmldetection.py | 60 +++++++++++++++-------
 1 file changed, 42 insertions(+), 18 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 5e4e9aa462..b17a1baaf0 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -526,36 +526,21 @@ def main(self):
             elif self.mode == "test":
                 # We are testing, which means using the model to detect
                 processed_flow = self.process_flow(self.flow)
-
                 # After processing the flow, it may happen that we
                 # delete icmp/arp/etc so the dataframe can be empty
                 if processed_flow is not None and not processed_flow.empty:
+                    original_label = processed_flow["ground_truth_label"].iloc[0]
                     # Predict
                     pred: numpy.ndarray = self.detect(processed_flow)
                     if not pred:
                         # an error occurred
                         return
 
-                    label = self.flow["label"]
-                    if label and label != "unknown" and label != pred[0]:
-                        # If the user specified a label in test mode,
-                        # and the label is diff from the prediction,
-                        # print in debug mode
-                        self.print(
-                            f"Predicted {pred[0]} for ground-truth label"
-                            f' {label}. Flow {self.flow["saddr"]}:'
-                            f'{self.flow["sport"]} ->'
-                            f' {self.flow["daddr"]}:'
-                            f'{self.flow["dport"]}/'
-                            f'{self.flow["proto"]}',
-                            0,
-                            3,
-                        )
                     if pred[0] == "Malicious":
                         # Generate an alert
                         self.set_evidence_malicious_flow(self.flow, self.twid)
                         self.print(
-                            f"Prediction {pred[0]} for label {label}"
+                            f"Prediction {pred[0]} for label {original_label}"
                             f' flow {self.flow["saddr"]}:'
                             f'{self.flow["sport"]} -> '
                             f'{self.flow["daddr"]}:'
@@ -563,4 +548,43 @@ def main(self):
                             f'{self.flow["proto"]}',
                             0,
                             2,
-                        )
\ No newline at end of file
+                        )
+
+                    # So you can disable this code easily. Since it is used only for evaluating a testing
+                    log_testing_data = True
+                    if log_testing_data:
+                        # Initialize counters if not already done
+                        if not hasattr(self, 'tp'):
+                            self.tp = 0
+                        if not hasattr(self, 'tn'):
+                            self.tn = 0
+                        if not hasattr(self, 'fp'):
+                            self.fp = 0
+                        if not hasattr(self, 'fn'):
+                            self.fn = 0
+
+
+                        # Update counters based on predictions and labels
+                        if pred[0] == "Malicious" and original_label == "Malicious":
+                            self.tp += 1
+                        elif pred[0] == "Benign" and original_label == "Benign":
+                            self.tn += 1
+                        elif pred[0] == "Malicious" and original_label == "Benign":
+                            self.fp += 1
+                        elif pred[0] == "Benign" and original_label == "Malicious":
+                            self.fn += 1
+
+                        testing_log_path = "./modules/flowmldetection/testing_performance.log"
+                        try:
+                            with open(testing_log_path, "a") as log_file:
+                                log_file.write("Testing Performance Log Initialized\n")
+                                # Log the testing performance metrics
+                                log_file.write(f"TP: {self.tp}, TN: {self.tn}, FP: {self.fp}, FN: {self.fn}\n")
+
+                                # Log the original flow for false positives and false negatives
+                                if pred[0] == "Malicious" and original_label == "Benign":
+                                    log_file.write(f"False Positive Flow: {self.flow}\n")
+                                elif pred[0] == "Benign" and original_label == "Malicious":
+                                    log_file.write(f"False Negative Flow: {self.flow}\n")
+                        except Exception as e:
+                            self.print(f"Error initializing testing performance log: {e}", 0, 1)
\ No newline at end of file

From 9fd5cff376977d9a4d970033c5e824d80fed51a6 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 19:04:00 +0000
Subject: [PATCH 338/455] Fix plots

---
 .../plot_testing_performance.py               | 30 +++++++++++--------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index 977a68b2d5..6865415cdf 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -1,6 +1,7 @@
 import matplotlib.pyplot as plt
 import sys
 import numpy as np
+import argparse
 
 def process_file(file_path):
     # Initialize the counters for the values
@@ -49,7 +50,7 @@ def process_file(file_path):
     
     return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values
 
-def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values):
+def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, experiment_number):
     # Separate the values into two groups based on their proximity to 0 or 1
     close_to_0 = {
         'FPR': [], 'FNR': []
@@ -72,13 +73,13 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu
         close_to_1['recall'].append(recall_values[i])
 
     # Plot metrics for values close to 0 (linear scale)
-    plot_single_group(close_to_0, 'performance_metrics_testing_close_to_0.png', is_close_to_0=True)
+    plot_single_group(close_to_0, f'performance_metrics_testing_close_to_0_experiment_{experiment_number}.png', experiment_number, is_close_to_0=True)
     
     # Plot metrics for values close to 1 (log scale)
-    plot_single_group(close_to_1, 'performance_metrics_testing_close_to_1.png', is_close_to_0=False)
+    plot_single_group(close_to_1, f'performance_metrics_testing_close_to_1_experiment_{experiment_number}.png', experiment_number, is_close_to_0=False)
 
     # Print the final values
-    print("\nFinal Metric Values:")
+    print("\nFinal Metric Values for Experiment", experiment_number)
     print(f"Final FPR: {FPR_values[-1]:.4f}")
     print(f"Final FNR: {FNR_values[-1]:.4f}")
     print(f"Final TNR: {TNR_values[-1]:.4f}")
@@ -89,7 +90,7 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu
     print(f"Final MCC: {MCC_values[-1]:.4f}")
     print(f"Final Recall: {recall_values[-1]:.4f}")
 
-def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):
+def plot_single_group(metrics_dict, output_filename, experiment_number, is_close_to_0=False):
     plt.figure(figsize=(12, 8))
     
     # Only plot the metrics that exist in the dictionary
@@ -126,11 +127,12 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):
             min_val = 1e-4  # Avoid zero values on the logarithmic scale
 
         plt.ylim(min_val, max_val)  # Set Y-axis limits based on the data range
-        plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6))  # Set ticks logarithmically
+        plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=60))  # Set ticks logarithmically
 
+    # Add the experiment number to the plot title
     plt.xlabel('Index')
     plt.ylabel('Metric Value')
-    plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})')
+    plt.title(f'Experiment {experiment_number} - Evaluation Metrics Over Time')
     plt.legend()
     
     # Save the plot
@@ -138,14 +140,18 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):
     plt.close()
 
 def main():
-    if len(sys.argv) != 2:
-        print("Usage: python script.py <file_path>")
-        sys.exit(1)
+    # Set up argument parsing
+    parser = argparse.ArgumentParser(description='Plot testing performance metrics.')
+    parser.add_argument('-f', '--file', type=str, required=True, help='Path to the testing performance log file')
+    parser.add_argument('-e', '--experiment', type=str, required=True, help='Experiment number')
+
+    args = parser.parse_args()
     
-    file_path = sys.argv[1]
+    file_path = args.file
+    experiment_number = args.experiment
     
     FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path)
-    plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values)
+    plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, experiment_number)
 
 if __name__ == "__main__":
     main()

From 3b88f410d4eebf2c8bc5cc7fc8056756d18d5e73 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 19:14:51 +0000
Subject: [PATCH 339/455] Fix train plot

---
 .../flowmldetection/plot_train_performance.py | 22 ++++++++++---------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/modules/flowmldetection/plot_train_performance.py b/modules/flowmldetection/plot_train_performance.py
index 80e13e9515..244df13d28 100644
--- a/modules/flowmldetection/plot_train_performance.py
+++ b/modules/flowmldetection/plot_train_performance.py
@@ -5,7 +5,7 @@
 import argparse
 import os
 
-def plot_log_data(file_path):
+def plot_log_data(file_path, experiment_number):
     # Read the log data from the file
     with open(file_path, 'r') as file:
         log_data = file.read()
@@ -28,7 +28,8 @@ def plot_log_data(file_path):
 
     # Get the directory of the log file to store the plot in the same folder
     dir_name = os.path.dirname(file_path)
-    plot_file = os.path.join(dir_name, 'performance_metrics_training.png')
+    # Append experiment number to the filename
+    plot_file = os.path.join(dir_name, f'performance_metrics_training_{experiment_number}.png')
 
     # Plotting the values
     fig, ax1 = plt.subplots(figsize=(10, 6))
@@ -55,18 +56,18 @@ def plot_log_data(file_path):
     for i, value in enumerate(df["Total labels"]):
         ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom')
 
-    # Adding title and legend
-    plt.title('Training performance')
+    # Adding title and legend with experiment number in title
+    plt.title(f'Training performance - Experiment {experiment_number}')
     fig.tight_layout()
 
     # Move both legends further to the right
-    ax1.legend(loc='upper right', bbox_to_anchor=(1.26, 1), fontsize='small', ncol=1)
-    ax2.legend(loc='upper right', bbox_to_anchor=(1.4, 0.95), fontsize='small', ncol=1)
+    ax1.legend(loc='upper right', bbox_to_anchor=(1.3, 1), fontsize='small', ncol=1)
+    ax2.legend(loc='upper right', bbox_to_anchor=(1.3, 0.85), fontsize='small', ncol=1)
 
     # Increase right margin for better readability of legend
-    plt.subplots_adjust(right=0.7)
+    plt.subplots_adjust(right=0.75)
 
-    # Save plot to the same folder as the log file
+    # Save plot to the same folder as the log file with experiment number in filename
     plt.savefig(plot_file)
 
     # Display the plot
@@ -75,13 +76,14 @@ def plot_log_data(file_path):
 def main():
     # Parse command-line arguments
     parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.")
-    parser.add_argument('log_file', metavar='log_file', type=str, help="Path to the log file")
+    parser.add_argument('-f', '--file', metavar='log_file', type=str, required=True, help="Path to the log file")
+    parser.add_argument('-e', '--experiment', metavar='experiment_number', type=str, required=True, help="Experiment number to add to the filename")
     
     # Handle -h / --help
     args = parser.parse_args()
 
     # Call the function to process the log file
-    plot_log_data(args.log_file)
+    plot_log_data(args.file, args.experiment)
 
 if __name__ == "__main__":
     main()

From 9e683fa5a09f6e25d7bc4cd09a382c999400e85b Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 21:14:48 +0000
Subject: [PATCH 340/455] Fix plots

---
 .../flowmldetection/plot_train_performance.py | 122 ++++++++++--------
 1 file changed, 71 insertions(+), 51 deletions(-)

diff --git a/modules/flowmldetection/plot_train_performance.py b/modules/flowmldetection/plot_train_performance.py
index 244df13d28..5212dfeeaf 100644
--- a/modules/flowmldetection/plot_train_performance.py
+++ b/modules/flowmldetection/plot_train_performance.py
@@ -4,85 +4,105 @@
 import sys
 import argparse
 import os
+import matplotlib.ticker as ticker
 
 def plot_log_data(file_path, experiment_number):
     # Read the log data from the file
     with open(file_path, 'r') as file:
         log_data = file.read()
 
-    # Define regex pattern to extract relevant data from each line
-    pattern = r"Background: (\d+). Benign: (\d+). Malicious: (\d+). Total labels: (\d+\.\d+). Score: (\d+\.\d+)"
+    # Regex pattern for the new log format
+    pattern = (
+        r"Total labels: ([\d\.]+), Background: (\d+). Benign: (\d+). Malicious: (\d+). Metrics: "
+        r"FPR=([\d\.]+), TNR=([\d\.]+), TPR=([\d\.]+), FNR=([\d\.]+), "
+        r"F1=([\d\.]+), Precision=([\d\.]+), Accuracy=([\d\.]+), MCC=([\d\.]+), Recall=([\d\.]+)\."
+    )
 
     # Parse the log file
     data = re.findall(pattern, log_data)
 
     # Convert data to a DataFrame
-    df = pd.DataFrame(data, columns=["Background", "Benign", "Malicious", "Total labels", "Score"])
+    columns = [
+        "Total labels", "Background", "Benign", "Malicious",
+        "FPR", "TNR", "TPR", "FNR", "F1", "Precision", "Accuracy", "MCC", "Recall"
+    ]
+    df = pd.DataFrame(data, columns=columns)
     df = df.astype({
+        "Total labels": float,
         "Background": int,
         "Benign": int,
         "Malicious": int,
-        "Total labels": float,
-        "Score": float
+        "FPR": float,
+        "TNR": float,
+        "TPR": float,
+        "FNR": float,
+        "F1": float,
+        "Precision": float,
+        "Accuracy": float,
+        "MCC": float,
+        "Recall": float,
     })
 
-    # Get the directory of the log file to store the plot in the same folder
     dir_name = os.path.dirname(file_path)
-    # Append experiment number to the filename
-    plot_file = os.path.join(dir_name, f'performance_metrics_training_{experiment_number}.png')
-
-    # Plotting the values
-    fig, ax1 = plt.subplots(figsize=(10, 6))
 
-    # Plotting Score on the left y-axis (with proper scaling from 0 to 1)
-    ax1.plot(df.index, df["Score"], label="Score", color='tab:blue')
+    # --- Plot 1: Number of labels (linear scale, no total labels) ---
+    fig1, ax1 = plt.subplots(figsize=(10, 6))
+    ax1.plot(df.index, df["Background"], label="Background", color='black')
+    ax1.plot(df.index, df["Benign"], label="Benign", color='cyan')
+    ax1.plot(df.index, df["Malicious"], label="Malicious", color='magenta')
     ax1.set_xlabel('Index')
-    ax1.set_ylabel('Score', color='tab:blue')
-    ax1.set_ylim(0, 1)  # Set y-axis for Score from 0 to 1
-    ax1.tick_params(axis='y', labelcolor='tab:blue')
-
-    # Create the second y-axis for the Background, Benign, Malicious
-    ax2 = ax1.twinx()
-    ax2.plot(df.index, df["Background"], label="Background Labels", color='tab:green', linestyle='--')
-    ax2.plot(df.index, df["Benign"], label="Benign Labels", color='tab:orange', linestyle='--')
-    ax2.plot(df.index, df["Malicious"], label="Malicious Labels", color='tab:pink', linestyle='--')
-    ax2.set_ylabel('Background, Benign, Malicious Labels', color='tab:red')
-    
-    # Set appropriate scale for right y-axis based on the data
-    ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max())
-    ax2.tick_params(axis='y', labelcolor='tab:red')
-
-    # Annotating Total labels as text on the plot
-    for i, value in enumerate(df["Total labels"]):
-        ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom')
-
-    # Adding title and legend with experiment number in title
-    plt.title(f'Training performance - Experiment {experiment_number}')
-    fig.tight_layout()
+    ax1.set_ylabel('Label Counts')
+    # No log scale here
+    ax1.set_title(f'Label Counts - Experiment {experiment_number}')
+    ax1.legend()
+    ax1.yaxis.set_major_locator(ticker.MaxNLocator(70))
+    plt.tight_layout()
+    plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_labels.png'))
+
+    # --- Plot 2: FNR and FPR (log scale) ---
+    fig2, ax2 = plt.subplots(figsize=(10, 6))
+    ax2.plot(df.index, df["FNR"], label="FNR", color='red')
+    ax2.plot(df.index, df["FPR"], label="FPR", color='blue')
+    ax2.set_xlabel('Index')
+    ax2.set_ylabel('Rate')
+    ax2.set_yscale('log')
+    ax2.set_title(f'FNR and FPR - Experiment {experiment_number}')
+    ax2.legend()
+    ax2.yaxis.set_major_locator(ticker.MaxNLocator(100))
+    plt.tight_layout()
+    plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_fnr_fpr.png'))
+
+    # --- Plot 3: Other metrics (log scale) ---
+    fig3, ax3 = plt.subplots(figsize=(12, 7))
+    metrics_rest = ["TNR", "TPR", "F1", "Precision", "Accuracy", "MCC", "Recall"]
+    colors_rest = [
+        'tab:blue', 'tab:green', 'tab:purple', 'tab:brown',
+        'tab:gray', 'tab:pink', 'tab:olive'
+    ]
+    for metric, color in zip(metrics_rest, colors_rest):
+        ax3.plot(df.index, df[metric], label=metric, color=color)
+    ax3.set_xlabel('Index')
+    ax3.set_ylabel('Metric Value')
+    ax3.set_yscale('log')
+    ax3.set_title(f'Performance Metrics (except FNR/FPR) - Experiment {experiment_number}')
+    ax3.legend()
+    ax3.yaxis.set_major_locator(ticker.MaxNLocator(50))
+    plt.tight_layout()
+    plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_other_metrics.png'))
 
-    # Move both legends further to the right
-    ax1.legend(loc='upper right', bbox_to_anchor=(1.3, 1), fontsize='small', ncol=1)
-    ax2.legend(loc='upper right', bbox_to_anchor=(1.3, 0.85), fontsize='small', ncol=1)
-
-    # Increase right margin for better readability of legend
-    plt.subplots_adjust(right=0.75)
-
-    # Save plot to the same folder as the log file with experiment number in filename
-    plt.savefig(plot_file)
-
-    # Display the plot
     plt.show()
 
+    # --- Print final values in terminal ---
+    print("\nFinal values at last training step:")
+    for col in ["Total labels", "Background", "Benign", "Malicious",
+                "FPR", "TNR", "TPR", "FNR", "F1", "Precision", "Accuracy", "MCC", "Recall"]:
+        print(f"{col}: {df[col].iloc[-1]}")
+
 def main():
-    # Parse command-line arguments
     parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.")
     parser.add_argument('-f', '--file', metavar='log_file', type=str, required=True, help="Path to the log file")
     parser.add_argument('-e', '--experiment', metavar='experiment_number', type=str, required=True, help="Experiment number to add to the filename")
-    
-    # Handle -h / --help
     args = parser.parse_args()
-
-    # Call the function to process the log file
     plot_log_data(args.file, args.experiment)
 
 if __name__ == "__main__":

From 632ddbcd650375a5b6a41d0bb724c20fd3766e4f Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 21:16:01 +0000
Subject: [PATCH 341/455] Add performance metrics to the training evaluation

---
 modules/flowmldetection/flowmldetection.py | 58 +++++++++++++++++-----
 1 file changed, 46 insertions(+), 12 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index b17a1baaf0..2c60cd4034 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -10,7 +10,16 @@
 import json
 import traceback
 import warnings
-import os
+from sklearn.metrics import classification_report, confusion_matrix
+from sklearn.metrics import (
+    confusion_matrix,
+    f1_score,
+    precision_score,
+    accuracy_score,
+    matthews_corrcoef,
+    recall_score,
+)
+
 
 from slips_files.common.parsers.config_parser import ConfigParser
 from slips_files.common.slips_utils import utils
@@ -86,21 +95,21 @@ def write_to_training_log(self, message: str):
         except Exception as e:
             self.print(f"Error writing to training log: {e}", 0, 1)
 
-    def train(self, sum_labeled_flows):
+    def train(self, sum_labeled_flows, last_number_of_flows_when_trained):
         """
         Train a model based on the flows we receive and the labels
         """
         try:
+            # Create y_flow with the label
+            y_flow = numpy.full(self.flows.shape[0], self.flows.ground_truth_label)
             # Create X_flow with the current flows minus the label
             X_flow = self.flows.drop("ground_truth_label", axis=1)
             # Drop the detailed labels
             X_flow = X_flow.drop("detailed_ground_truth_label", axis=1)
             # Drop the module_labels
             X_flow = X_flow.drop("module_labels", axis=1)
-            # Create y_flow with the label
-            y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label)
 
-            # Normalize this batch of data so far. This can get progressivle slow
+            # Normalize this batch of data so far. This can get progressively slow
             X_flow = self.scaler.fit_transform(X_flow)
 
             # Count the number of labels of each type in this epoc
@@ -120,18 +129,43 @@ def train(self, sum_labeled_flows):
                 self.print("Error while calling clf.train()")
                 self.print(traceback.format_exc(), 0, 1)
 
-            # See score so far in training
-            score = self.clf.score(X_flow, y_flow)
+            # Predict on the training data
+            y_pred = self.clf.predict(X_flow)
 
-            #self.print(f"	Training Score: {score}", 1, 0)
-            #self.print(f'    Model Parameters: {self.clf.coef_}', 1, 0)
+            # For metrics, let's focus on Malicious vs Benign (ignore Background)
+            mask = (y_flow == "Malicious") | (y_flow == "Benign")
+            y_true_bin = y_flow[mask]
+            y_pred_bin = y_pred[mask]
+
+            # Map to binary: Malicious=1, Benign=0
+            y_true_bin = numpy.where(y_true_bin == "Malicious", 1, 0)
+            y_pred_bin = numpy.where(y_pred_bin == "Malicious", 1, 0)
+
+            # Compute confusion matrix: tn, fp, fn, tp
+            tn, fp, fn, tp = confusion_matrix(y_true_bin, y_pred_bin, labels=[0,1]).ravel() if len(set(y_true_bin)) > 1 else (0,0,0,0)
+
+            # Compute metrics
+            FPR = fp / (fp + tn) if (fp + tn) > 0 else 0
+            TNR = tn / (tn + fp) if (tn + fp) > 0 else 0
+            TPR = tp / (tp + fn) if (tp + fn) > 0 else 0
+            FNR = fn / (fn + tp) if (fn + tp) > 0 else 0
+            F1 = f1_score(y_true_bin, y_pred_bin, zero_division=0)
+            PREC = precision_score(y_true_bin, y_pred_bin, zero_division=0)
+            ACCU = accuracy_score(y_true_bin, y_pred_bin)
+            MCC = matthews_corrcoef(y_true_bin, y_pred_bin) if len(set(y_true_bin)) > 1 else 0
+            RECALL = recall_score(y_true_bin, y_pred_bin, zero_division=0)
 
             # Store the models on disk
             self.store_model()
 
             # Log training information
-            self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}")
-            #self.write_to_training_log(f"Model parameters: {self.clf.coef_}")
+            self.write_to_training_log(
+                f"Total labels: {sum_labeled_flows}, "
+                f"Background: {epoch_label_counts['Background']}. "
+                f"Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. "
+                f"Metrics: FPR={FPR:.4f}, TNR={TNR:.4f}, TPR={TPR:.4f}, FNR={FNR:.4f}, "
+                f"F1={F1:.4f}, Precision={PREC:.4f}, Accuracy={ACCU:.4f}, MCC={MCC:.4f}, Recall={RECALL:.4f}."
+            )
         except Exception:
             self.print("Error in train().", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
@@ -520,7 +554,7 @@ def main(self):
                         # for pandas
                         self.process_training_flows(self.last_number_of_flows_when_trained)
                         # Train an algorithm
-                        self.train(sum_labeled_flows)
+                        self.train(sum_labeled_flows, self.last_number_of_flows_when_trained)
                         self.last_number_of_flows_when_trained = sum_labeled_flows
 
             elif self.mode == "test":

From 1d3346dbeb3653238427b291b9b8d90e01a2f578 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sun, 4 May 2025 12:50:46 +0000
Subject: [PATCH 342/455] Fix experiment names

---
 modules/flowmldetection/plot_train_performance.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/modules/flowmldetection/plot_train_performance.py b/modules/flowmldetection/plot_train_performance.py
index 5212dfeeaf..304f0f4ead 100644
--- a/modules/flowmldetection/plot_train_performance.py
+++ b/modules/flowmldetection/plot_train_performance.py
@@ -52,12 +52,12 @@ def plot_log_data(file_path, experiment_number):
     ax1.plot(df.index, df["Malicious"], label="Malicious", color='magenta')
     ax1.set_xlabel('Index')
     ax1.set_ylabel('Label Counts')
-    # No log scale here
     ax1.set_title(f'Label Counts - Experiment {experiment_number}')
     ax1.legend()
     ax1.yaxis.set_major_locator(ticker.MaxNLocator(70))
+    ax1.xaxis.set_major_locator(ticker.MaxNLocator(50))
     plt.tight_layout()
-    plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_labels.png'))
+    plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_labels.png'))
 
     # --- Plot 2: FNR and FPR (log scale) ---
     fig2, ax2 = plt.subplots(figsize=(10, 6))
@@ -69,8 +69,9 @@ def plot_log_data(file_path, experiment_number):
     ax2.set_title(f'FNR and FPR - Experiment {experiment_number}')
     ax2.legend()
     ax2.yaxis.set_major_locator(ticker.MaxNLocator(100))
+    ax2.xaxis.set_major_locator(ticker.MaxNLocator(50))
     plt.tight_layout()
-    plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_fnr_fpr.png'))
+    plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_fnr_fpr.png'))
 
     # --- Plot 3: Other metrics (log scale) ---
     fig3, ax3 = plt.subplots(figsize=(12, 7))
@@ -87,8 +88,9 @@ def plot_log_data(file_path, experiment_number):
     ax3.set_title(f'Performance Metrics (except FNR/FPR) - Experiment {experiment_number}')
     ax3.legend()
     ax3.yaxis.set_major_locator(ticker.MaxNLocator(50))
+    ax3.xaxis.set_major_locator(ticker.MaxNLocator(50))
     plt.tight_layout()
-    plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_other_metrics.png'))
+    plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_other_metrics.png'))
 
     plt.show()
 

From 36129e51da4879ee590f2c76ad502372fb6954e7 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 343/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 480 +++++++++++----------
 1 file changed, 254 insertions(+), 226 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 2c60cd4034..16b67e9038 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -10,16 +10,7 @@
 import json
 import traceback
 import warnings
-from sklearn.metrics import classification_report, confusion_matrix
-from sklearn.metrics import (
-    confusion_matrix,
-    f1_score,
-    precision_score,
-    accuracy_score,
-    matthews_corrcoef,
-    recall_score,
-)
-
+import sys
 
 from slips_files.common.parsers.config_parser import ConfigParser
 from slips_files.common.slips_utils import utils
@@ -37,6 +28,10 @@
     Method,
 )
 
+# Only for debbuging
+# from matplotlib import pyplot as plt
+
+
 # This horrible hack is only to stop sklearn from printing those warnings
 def warn(*args, **kwargs):
     pass
@@ -61,115 +56,206 @@ def init(self):
         # Set the output queue of our database instance
         # Read the configuration
         self.read_configuration()
-        # Minum amount of new labels needed to start the train
-        self.minimum_labels_to_start_train = 50
-        # Minum amount of new labels needed to retrain
-        self.minimum_labels_to_retrain = 50
-        # The number of flows when last trained. Used internally only to know
-        # when to retrain
-        self.last_number_of_flows_when_trained = 0
+        # Minimum number of new labels needed to trigger training
+        self.minimum_lables_to_retrain = 50
+        # To plot the scores of training
+        # self.scores = []
         # The scaler trained during training and to use during testing
         self.scaler = StandardScaler()
         self.model_path = "./modules/flowmldetection/model.bin"
         self.scaler_path = "./modules/flowmldetection/scaler.bin"
 
-        # Initialize the training log file
-        self.training_log_path = "./modules/flowmldetection/training.log"
-        with open(self.training_log_path, "w") as log_file:
-            log_file.write("Training Log Initialized\n")
-
     def read_configuration(self):
         conf = ConfigParser()
         self.mode = conf.get_ml_mode()
-        # This is the global label in the configuration,
-        # in case the flows do not have a label themselves
-        self.label = conf.label()
-
-    def write_to_training_log(self, message: str):
-        """
-        Write a message to the training log file.
-        """
-        try:
-            with open(self.training_log_path, "a") as log_file:
-                log_file.write(message + "\n")
-        except Exception as e:
-            self.print(f"Error writing to training log: {e}", 0, 1)
 
-    def train(self, sum_labeled_flows, last_number_of_flows_when_trained):
+    def train(self):
         """
         Train a model based on the flows we receive and the labels
         """
         try:
-            # Create y_flow with the label
-            y_flow = numpy.full(self.flows.shape[0], self.flows.ground_truth_label)
-            # Create X_flow with the current flows minus the label
-            X_flow = self.flows.drop("ground_truth_label", axis=1)
-            # Drop the detailed labels
-            X_flow = X_flow.drop("detailed_ground_truth_label", axis=1)
-            # Drop the module_labels
+            # Process the labels to have only Normal and Malware
+            self.flows.label = self.flows.label.str.replace(
+                r"(^.*ormal.*$)", "Normal", regex=True
+            )
+            self.flows.label = self.flows.label.str.replace(
+                r"(^.*alware.*$)", "Malware", regex=True
+            )
+            self.flows.label = self.flows.label.str.replace(
+                r"(^.*alicious.*$)", "Malware", regex=True
+            )
+
+            # Separate
+            y_flow = self.flows["label"]
+            X_flow = self.flows.drop("label", axis=1)
             X_flow = X_flow.drop("module_labels", axis=1)
 
-            # Normalize this batch of data so far. This can get progressively slow
+            # Normalize this batch of data so far. This can get progressively slow
             X_flow = self.scaler.fit_transform(X_flow)
 
-            # Count the number of labels of each type in this epoc
-            epoch_label_counts = {
-                "Background": (y_flow == "Background").sum(),
-                "Malicious": (y_flow == "Malicious").sum(),
-                "Benign": (y_flow == "Benign").sum(),
-            }
-
             # Train
             try:
-                # Online incremental learning
                 self.clf.partial_fit(
-                    X_flow, y_flow, classes=["Background", "Malicious", "Benign"]
+                    X_flow, y_flow, classes=["Malware", "Normal"]
                 )
             except Exception:
                 self.print("Error while calling clf.train()")
                 self.print(traceback.format_exc(), 0, 1)
 
-            # Predict on the training data
-            y_pred = self.clf.predict(X_flow)
+            # See score so far in training
+            score = self.clf.score(X_flow, y_flow)
 
-            # For metrics, let's focus on Malicious vs Benign (ignore Background)
-            mask = (y_flow == "Malicious") | (y_flow == "Benign")
-            y_true_bin = y_flow[mask]
-            y_pred_bin = y_pred[mask]
+            # To debug the training score
+            # self.scores.append(score)
 
-            # Map to binary: Malicious=1, Benign=0
-            y_true_bin = numpy.where(y_true_bin == "Malicious", 1, 0)
-            y_pred_bin = numpy.where(y_pred_bin == "Malicious", 1, 0)
+            self.print(f"	Training Score: {score}", 0, 1)
+            # self.print(f'    Model Parameters: {self.clf.coef_}')
 
-            # Compute confusion matrix: tn, fp, fn, tp
-            tn, fp, fn, tp = confusion_matrix(y_true_bin, y_pred_bin, labels=[0,1]).ravel() if len(set(y_true_bin)) > 1 else (0,0,0,0)
-
-            # Compute metrics
-            FPR = fp / (fp + tn) if (fp + tn) > 0 else 0
-            TNR = tn / (tn + fp) if (tn + fp) > 0 else 0
-            TPR = tp / (tp + fn) if (tp + fn) > 0 else 0
-            FNR = fn / (fn + tp) if (fn + tp) > 0 else 0
-            F1 = f1_score(y_true_bin, y_pred_bin, zero_division=0)
-            PREC = precision_score(y_true_bin, y_pred_bin, zero_division=0)
-            ACCU = accuracy_score(y_true_bin, y_pred_bin)
-            MCC = matthews_corrcoef(y_true_bin, y_pred_bin) if len(set(y_true_bin)) > 1 else 0
-            RECALL = recall_score(y_true_bin, y_pred_bin, zero_division=0)
+            # Debug code to store a plot in a png of the scores
+            # plt.plot(self.scores)
+            # plt.savefig('train-scores.png')
 
             # Store the models on disk
             self.store_model()
 
-            # Log training information
-            self.write_to_training_log(
-                f"Total labels: {sum_labeled_flows}, "
-                f"Background: {epoch_label_counts['Background']}. "
-                f"Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. "
-                f"Metrics: FPR={FPR:.4f}, TNR={TNR:.4f}, TPR={TPR:.4f}, FNR={FNR:.4f}, "
-                f"F1={F1:.4f}, Precision={PREC:.4f}, Accuracy={ACCU:.4f}, MCC={MCC:.4f}, Recall={RECALL:.4f}."
-            )
         except Exception:
-            self.print("Error in train().", 0, 1)
+            self.print("Error in train()", 0, 1)
+            self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the packet count to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have various types of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Typical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect: not precisely 'Not Established', but also not 'Established'. We consider it not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reset when finished and therefore are established.
+                    # A connection can also be reset without being established, but we can't tell without -z b.
+                    # So we use the number of packets as a heuristic. If <=3, it is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are closed with FIN when finished and therefore are established.
+                    # A connection can also be finished without being established, but we can't tell without -z b.
+                    # So we use the number of packets as a heuristic. If <=3, it is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
             self.print(traceback.format_exc(), 0, 1)
-            self.write_to_training_log("Error occurred during training.")
 
     def process_features(self, dataset):
         """
@@ -182,11 +268,6 @@ def process_features(self, dataset):
             for proto in to_discard:
                 dataset = dataset[dataset.proto != proto]
 
-            # If te proto is in the list to delete and there is only one flow, then the dataset will be empty
-            if dataset.empty:
-                # DataFrame is empty now, so return empty
-                return dataset
-
             # For now, discard these
             to_drop = [
                 "appproto",
@@ -199,7 +280,9 @@ def process_features(self, dataset):
                 "history",
                 "uid",
                 "dir_",
+                "dbytes",
                 "endtime",
+                "bytes",
                 "flow_source",
             ]
             for field in to_drop:
@@ -208,16 +291,12 @@ def process_features(self, dataset):
                 except (ValueError, KeyError):
                     pass
 
-            # When flows are read from Slips sqlite,
-            # the state is not transformed to 'Established' or
-            # 'Not Established', it is still 'S0' and others
+            # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
             # So transform here
-            dataset["state"] = dataset.apply(
-                lambda row: self.db.get_final_state_from_flags(
-                    row["state"], (row["spkts"] + row["dpkts"])
-                ),
-                axis=1,
-            )
+            #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
+            dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
+
+            #dataset.state = new_state_column
 
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(
@@ -251,11 +330,7 @@ def process_features(self, dataset):
             dataset.proto = dataset.proto.str.replace(
                 r"(^.*arp.*$)", "4", regex=True
             )
-
-            dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"]
-            dataset["pkts"] = dataset["spkts"] + dataset["dpkts"]
-
-            fields_to_convert_to_float = [
+            fields_to_convert_to_flow = [
                 dataset.proto,
                 dataset.dport,
                 dataset.sport,
@@ -266,10 +341,10 @@ def process_features(self, dataset):
                 dataset.sbytes,
                 dataset.state,
             ]
-            for field in fields_to_convert_to_float:
+            for field in fields_to_convert_to_flow:
                 try:
                     field = field.astype("float64")
-                except (ValueError, AttributeError):
+                except ValueError:
                     pass
 
             return dataset
@@ -278,72 +353,69 @@ def process_features(self, dataset):
             self.print("Error in process_features()")
             self.print(traceback.format_exc(), 0, 1)
 
-    def process_training_flows(self, last_number_of_flows_when_trained):
+    def process_flows(self):
         """
-        Process only the new flows in the DB since the last training.
+        Process all the flows in the DB
         Store the pandas df in self.flows
         """
         try:
-            # Ensure the index is an integer
-            if last_number_of_flows_when_trained is None:
-                last_number_of_flows_when_trained = 0
-            else:
-                last_number_of_flows_when_trained = int(last_number_of_flows_when_trained)
-
             # We get all the flows so far
+            # because this retraining happens in batches
             flows = self.db.get_all_flows()
-            # Only process new flows since last training
-            new_flows = flows[last_number_of_flows_when_trained:]
-
-            # Check how many **different** labels are in the DB
+            # Check how many different labels are in the DB
+            # We need both normal and malware
             labels = self.db.get_labels()
             if len(labels) == 1:
-                # Insert fake flows for both classes if needed
-                new_flows.append(
+                # Only 1 label has flows
+                # There are not enough different labels, so insert two flows
+                # that are fake but representative of a normal and malware flow
+                # they are only for the training process
+                # At least 1 flow of each label is required
+                # self.print(f'Amount of labeled flows: {labels}', 0, 1)
+                flows.append(
                     {
-                        "starttime": 1594417039.029793,
+                        "ts": 1594417039.029793,
                         "dur": "1.9424750804901123",
                         "saddr": "10.7.10.101",
                         "sport": "49733",
                         "daddr": "40.70.224.145",
                         "dport": "443",
                         "proto": "tcp",
-                        "state": "SF",
-                        "spkts": 17,
-                        "dpkts": 27,
+                        "state": "Established",
+                        "allbytes": 42764,
+                        "spkts": 37,
                         "sbytes": 25517,
-                        "dbytes": 17247,
                         "appproto": "ssl",
-                        "ground_truth_label": "Malicious",
+                        "label": "Malware",
                         "module_labels": {
-                            "flowalerts-long-connection": "Malicious"
+                            "flowalerts-long-connection": "Malware"
                         },
                     }
                 )
-                new_flows.append(
+                flows.append(
                     {
-                        "starttime": 1382355032.706468,
+                        "ts": 1382355032.706468,
                         "dur": "10.896695",
                         "saddr": "147.32.83.52",
                         "sport": "47956",
                         "daddr": "80.242.138.72",
                         "dport": "80",
                         "proto": "tcp",
-                        "state": "SF",
+                        "state": "Established",
+                        "allbytes": 67696,
                         "spkts": 1,
-                        "dpkts": 0,
                         "sbytes": 100,
-                        "dbytes": 67596,
                         "appproto": "http",
-                        "ground_truth_label": "Benign",
+                        "label": "Normal",
                         "module_labels": {
-                            "flowalerts-long-connection": "Benign"
+                            "flowalerts-long-connection": "Normal"
                         },
                     }
                 )
+                # If there are enough flows, we dont insert them anymore
 
             # Convert to pandas df
-            df_flows = pd.DataFrame(new_flows)
+            df_flows = pd.DataFrame(flows)
 
             # Process features
             df_flows = self.process_features(df_flows)
@@ -351,6 +423,7 @@ def process_training_flows(self, last_number_of_flows_when_trained):
             # Update the flow to the processed version
             self.flows = df_flows
         except Exception:
+            # Stop the timer
             self.print("Error in process_flows()")
             self.print(traceback.format_exc(), 0, 1)
 
@@ -363,8 +436,6 @@ def process_flow(self, flow_to_process: dict):
             # Convert the flow to a pandas dataframe
             raw_flow = pd.DataFrame(flow_to_process, index=[0])
             dflow = self.process_features(raw_flow)
-            if dflow.empty:
-                return None
             # Update the flow to the processed version
             return dflow
         except Exception:
@@ -378,6 +449,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
         and returns the predection array
         """
         try:
+            given_x_flow = x_flow
             # clean the flow
             fields_to_drop = [
                 "label",
@@ -385,28 +457,14 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
                 "uid",
                 "history",
                 "dir_",
+                "dbytes",
+                "dpkts",
                 "endtime",
+                "bytes",
                 "flow_source",
-                "ground_truth_label",
+                "ground_truth_label",  # todo now we can use them
                 "detailed_ground_truth_label",
             ]
-            # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes.
-            # Error
-            ''' [Flow ML Detection] Error in detect() while processing                                                                                                                                                                                                                         
-            dur proto  sport dport  state  pkts  spkts  dpkts  bytes  sbytes  dbytes  allbytes                                                                                                                                                                                    
-            0  63.822830     0  56119   981    0.0    15     15      0   8764    1887       0      1887                                                                                                                                                                                    
-            The feature names should match those that were passed during fit.                                                                                                                                                                                                              
-            Feature names unseen at fit time:                                                                                                                                                                                                                                              
-            - bytes     
-            '''
-
-            # IF we delete here the filed bytes the error is
-            # [Flow ML Detection] Error in detect() while processing 
-            # dur proto sport dport  state  pkts  spkts  dpkts  sbytes  dbytes allbytes                                                                                                                                                             
-            # 0  63.822830     0  56120   980    0.0    15     15      0    1887       0      1887                                                                                                                                                             
-            # The feature names should match those that were passed during fit.                                                                                                                                                                                
-            # Feature names must be in the same order as they were in fit.                                                                                                                                                                                     
-                                                                                                                                                                                                                                                 
             for field in fields_to_drop:
                 try:
                     x_flow = x_flow.drop(field, axis=1)
@@ -418,7 +476,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
             return pred
         except Exception as e:
             self.print(
-                f"Error in detect() while processing " f"\n{x_flow}\n{e}"
+                f"Error in detect() while processing " f"\n{given_x_flow}\n{e}"
             )
             self.print(traceback.format_exc(), 0, 1)
 
@@ -510,16 +568,18 @@ def pre_main(self):
 
     def main(self):
         if msg := self.get_msg("new_flow"):
-            # When a new flow arrives
             msg = json.loads(msg["data"])
-            self.twid = msg["twid"]
-            self.profileid = msg["profileid"]
+            twid = msg["twid"]
             self.flow = msg["flow"]
-            # These following extra fields are expected in testing. update the original
+            # these fields are expected in testing. update the original
             # flow dict to have them
             self.flow.update(
                 {
+                    "allbytes": (self.flow["sbytes"] + self.flow["dbytes"]),
+                    # the flow["state"] is the origstate, we dont need that here
+                    # we need the interpreted state
                     "state": msg["interpreted_state"],
+                    "pkts": self.flow["spkts"] + self.flow["dpkts"],
                     "label": msg["label"],
                     "module_labels": msg["module_labels"],
                 }
@@ -532,49 +592,56 @@ def main(self):
                 # Use labeled flows
                 labels = self.db.get_labels()
                 sum_labeled_flows = sum(i[1] for i in labels)
-
-                # The min labels to retrain is the min number of flows 
-                # we should have seen so far in this capture to start training
-                # This is so we dont _start_ training with only 1 flow
-
-                # Once we are over the start minimum, the second condition is 
-                # to force to retrain every a minimum_labels_to_retrain number
-                # of flows. So we dont retrain every 1 flow.
                 if (
-                    sum_labeled_flows >= self.minimum_labels_to_start_train
+                    sum_labeled_flows >= self.minimum_lables_to_retrain
+                    and sum_labeled_flows % self.minimum_lables_to_retrain == 1
                 ):
-                    if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain):
-                        # So for example we retrain every 50 labels and only when
-                        # we have at least 50 labels
-                        self.print(
-                            f"Training the model with the last group of "
-                            f"flows and labels. Total flows: {sum_labeled_flows}."
-                        )
-                        # Process all flows in the DB and make them ready
-                        # for pandas
-                        self.process_training_flows(self.last_number_of_flows_when_trained)
-                        # Train an algorithm
-                        self.train(sum_labeled_flows, self.last_number_of_flows_when_trained)
-                        self.last_number_of_flows_when_trained = sum_labeled_flows
-
+                    # We get here every 'self.minimum_lables_to_retrain'
+                    # amount of labels
+                    # So for example we retrain every 100 labels and only when
+                    # we have at least 100 labels
+                    self.print(
+                        f"Training the model with the last group of "
+                        f"flows and labels. Total flows: {sum_labeled_flows}."
+                    )
+                    # Process all flows in the DB and make them ready
+                    # for pandas
+                    self.process_flows()
+                    # Train an algorithm
+                    self.train()
             elif self.mode == "test":
                 # We are testing, which means using the model to detect
                 processed_flow = self.process_flow(self.flow)
+
                 # After processing the flow, it may happen that we
                 # delete icmp/arp/etc so the dataframe can be empty
                 if processed_flow is not None and not processed_flow.empty:
-                    original_label = processed_flow["ground_truth_label"].iloc[0]
                     # Predict
                     pred: numpy.ndarray = self.detect(processed_flow)
                     if not pred:
                         # an error occurred
                         return
 
-                    if pred[0] == "Malicious":
+                    label = self.flow["label"]
+                    if label and label != "unknown" and label != pred[0]:
+                        # If the user specified a label in test mode,
+                        # and the label is diff from the prediction,
+                        # print in debug mode
+                        self.print(
+                            f"Report Prediction {pred[0]} for label"
+                            f' {label} flow {self.flow["saddr"]}:'
+                            f'{self.flow["sport"]} ->'
+                            f' {self.flow["daddr"]}:'
+                            f'{self.flow["dport"]}/'
+                            f'{self.flow["proto"]}',
+                            0,
+                            3,
+                        )
+                    if pred[0] == "Malware":
                         # Generate an alert
-                        self.set_evidence_malicious_flow(self.flow, self.twid)
+                        self.set_evidence_malicious_flow(self.flow, twid)
                         self.print(
-                            f"Prediction {pred[0]} for label {original_label}"
+                            f"Prediction {pred[0]} for label {label}"
                             f' flow {self.flow["saddr"]}:'
                             f'{self.flow["sport"]} -> '
                             f'{self.flow["daddr"]}:'
@@ -583,42 +650,3 @@ def main(self):
                             0,
                             2,
                         )
-
-                    # So you can disable this code easily. Since it is used only for evaluating a testing
-                    log_testing_data = True
-                    if log_testing_data:
-                        # Initialize counters if not already done
-                        if not hasattr(self, 'tp'):
-                            self.tp = 0
-                        if not hasattr(self, 'tn'):
-                            self.tn = 0
-                        if not hasattr(self, 'fp'):
-                            self.fp = 0
-                        if not hasattr(self, 'fn'):
-                            self.fn = 0
-
-
-                        # Update counters based on predictions and labels
-                        if pred[0] == "Malicious" and original_label == "Malicious":
-                            self.tp += 1
-                        elif pred[0] == "Benign" and original_label == "Benign":
-                            self.tn += 1
-                        elif pred[0] == "Malicious" and original_label == "Benign":
-                            self.fp += 1
-                        elif pred[0] == "Benign" and original_label == "Malicious":
-                            self.fn += 1
-
-                        testing_log_path = "./modules/flowmldetection/testing_performance.log"
-                        try:
-                            with open(testing_log_path, "a") as log_file:
-                                log_file.write("Testing Performance Log Initialized\n")
-                                # Log the testing performance metrics
-                                log_file.write(f"TP: {self.tp}, TN: {self.tn}, FP: {self.fp}, FN: {self.fn}\n")
-
-                                # Log the original flow for false positives and false negatives
-                                if pred[0] == "Malicious" and original_label == "Benign":
-                                    log_file.write(f"False Positive Flow: {self.flow}\n")
-                                elif pred[0] == "Benign" and original_label == "Malicious":
-                                    log_file.write(f"False Negative Flow: {self.flow}\n")
-                        except Exception as e:
-                            self.print(f"Error initializing testing performance log: {e}", 0, 1)
\ No newline at end of file

From a9a38be1d23ebb45330d8bc616c9701c5181db61 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:35:21 +0300
Subject: [PATCH 344/455] state_handler: split get_final_state_from_flags()
 into smaller functions

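---
Notes (review):
    get_final_state_from_flags() reuses its own argument as the walrus
    target: `if state := check_suricata_states(state)`. When the Suricata
    check does not match it returns None, so `state` is None for the
    Zeek check (which then returns None as well) and
    check_argus_states(None) fails on `state.split`. That AttributeError
    is caught by the broad `except Exception`, logged, and the function
    ends without a return value, so every non-Suricata state yields None.
    PATCH 345/455 switches to a separate `interpreted_state` name.
    The function is module-level here but still takes `self` and calls
    self.print().
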
 slips_files/common/state_handler.py | 179 ++++++++++++++++++++++++++++
 1 file changed, 179 insertions(+)
 create mode 100644 slips_files/common/state_handler.py

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
new file mode 100644
index 0000000000..b671a09a28
--- /dev/null
+++ b/slips_files/common/state_handler.py
@@ -0,0 +1,179 @@
+from typing import Optional
+import sys
+import traceback
+
+
+def check_suricata_states(state) -> Optional[str]:
+    """
+    There are different states in which a flow can be.
+    Suricata distinguishes three flow-states for TCP and two for
+     UDP. For TCP,
+    these are: New, Established and Closed,for UDP only new and
+    established.
+    For each of these states Suricata can employ different timeouts.
+    """
+    if "new" in state or "established" in state:
+        return "Established"
+    elif "closed" in state:
+        return "Not Established"
+
+
+def check_zeek_states(state) -> Optional[str]:
+    # We have varius type of states depending on the type of flow.
+    # For Zeek
+    if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+        return "Not Established"
+    elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+        return "Established"
+
+
+def check_argus_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    suf = state.split("_")[1]
+    if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+        """
+        Examples:
+        SA_SA
+        SR_SA
+        FSRA_SA
+        SPA_SPA
+        SRA_SPA
+        FSA_FSA
+        FSA_FSPA
+        SAEC_SPA
+        SRPA_SPA
+        FSPA_SPA
+        FSRPA_SPA
+        FSPA_FSPA
+        FSRA_FSPA
+        SRAEC_SPA
+        FSPA_FSRPA
+        FSAEC_FSPA
+        FSRPA_FSPA
+        SRPAEC_SPA
+        FSPAEC_FSPA
+        SRPAEC_FSRPA
+        """
+        return "Established"
+    elif "PA" in pre and "PA" in suf:
+        # Tipical flow that was reported in the middle
+        """
+        Examples:
+        PA_PA
+        FPA_FPA
+        """
+        return "Established"
+    elif "ECO" in pre:
+        return "ICMP Echo"
+    elif "ECR" in pre:
+        return "ICMP Reply"
+    elif "URH" in pre:
+        return "ICMP Host Unreachable"
+    elif "URP" in pre:
+        return "ICMP Port Unreachable"
+    else:
+        """
+        Examples:
+        S_RA
+        S_R
+        A_R
+        S_SA
+        SR_SA
+        FA_FA
+        SR_RA
+        SEC_RA
+        """
+        return "Not Established"
+
+
+def check_tcp_states(state, pkts) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "EST" in pre:
+        # TCP
+        return "Established"
+    elif "RST" in pre:
+        # TCP. When -z B is not used in argus, states are single words.
+        # Most connections are reseted when finished and therefore are
+        # established
+        # It can happen that is reseted being not established, but we
+        # can't tell without -z b.
+        # So we use as heuristic the amount of packets. If <=3, then is
+        # not established because the OS retries 3 times.
+        return "Not Established" if int(pkts) <= 3 else "Established"
+    elif "FIN" in pre:
+        # TCP. When -z B is not used in argus, states are single words.
+        # Most connections are finished with FIN when finished and
+        # therefore are established
+        # It can happen that is finished being not established, but we
+        # can't tell without -z b.
+        # So we use as heuristic the amount of packets. If <=3, then is
+        # not established because the OS retries 3 times.
+        return "Not Established" if int(pkts) <= 3 else "Established"
+    else:
+        """
+        Examples:
+        S_
+        FA_
+        PA_
+        FSA_
+        SEC_
+        SRPA_
+        """
+        return "Not Established"
+
+
+def check_udp_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "CON" in pre:
+        # UDP
+        return "Established"
+    elif "INT" in pre:
+        # UDP trying to connect, NOT preciselly not established but also
+        # NOT 'Established'. So we considered not established because there
+        # is no confirmation of what happened.
+        return "Not Established"
+
+
+def check_icmp_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "ECO" in pre:
+        # ICMP
+        return "Established"
+    elif "UNK" in pre:
+        # ICMP6 unknown upper layer
+        return "Established"
+
+
+def get_final_state_from_flags(self, state, pkts) -> str:
+    """
+    Analyze the flags given and return a summary of the state.
+    Should work with Argus and Bro flags
+    We receive the pakets to distinguish some Reset connections
+    """
+    try:
+        if state := check_suricata_states(state):
+            return state
+        if state := check_zeek_states(state):
+            return state
+        if state := check_argus_states(state):
+            return state
+    except IndexError:
+        # suf does not exist, which means that this is some ICMP or
+        # no response was sent for UDP or TCP
+        if state := check_icmp_states(state):
+            return state
+        if state := check_udp_states(state):
+            return state
+        if state := check_tcp_states(state, pkts):
+            return state
+
+        return "Not Established"
+
+    except Exception:
+        exception_line = sys.exc_info()[2].tb_lineno
+        self.print(
+            f"Error in get_final_state_from_flags() " f"line {exception_line}",
+            0,
+            1,
+        )
+        self.print(traceback.format_exc(), 0, 1)

From 96e0e65f772b4d7542b762fb500b73aff90b262b Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:48:16 +0300
Subject: [PATCH 345/455] state_handler: refactor get_final_state_from_flags()

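---
Notes (review):
    interpret_tcp_states() returns a value on every branch, so the final
    `return "Not Established"` in get_final_state_from_flags() cannot be
    reached.
    The try/except wrapper is removed: a non-string state, or a pkts
    value that int() rejects in the RST/FIN branches, now raises to the
    caller instead of being logged.
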
 slips_files/common/state_handler.py | 67 +++++++++++++----------------
 1 file changed, 29 insertions(+), 38 deletions(-)

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
index b671a09a28..d0a05115bd 100644
--- a/slips_files/common/state_handler.py
+++ b/slips_files/common/state_handler.py
@@ -1,9 +1,7 @@
 from typing import Optional
-import sys
-import traceback
 
 
-def check_suricata_states(state) -> Optional[str]:
+def interpret_suricata_states(state) -> Optional[str]:
     """
     There are different states in which a flow can be.
     Suricata distinguishes three flow-states for TCP and two for
@@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_zeek_states(state) -> Optional[str]:
+def interpret_zeek_states(state) -> Optional[str]:
     # We have varius type of states depending on the type of flow.
     # For Zeek
     if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
@@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]:
         return "Established"
 
 
-def check_argus_states(state) -> Optional[str]:
+def interpret_argus_states(state) -> Optional[str]:
     pre = state.split("_")[0]
-    suf = state.split("_")[1]
+    try:
+        suf = state.split("_")[1]
+    except IndexError:
+        return
+
     if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
         """
         Examples:
@@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_tcp_states(state, pkts) -> Optional[str]:
+def interpret_tcp_states(state, pkts) -> Optional[str]:
     pre = state.split("_")[0]
     if "EST" in pre:
         # TCP
@@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]:
         return "Not Established"
 
 
-def check_udp_states(state) -> Optional[str]:
+def interpret_udp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "CON" in pre:
         # UDP
@@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_icmp_states(state) -> Optional[str]:
+def interpret_icmp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "ECO" in pre:
         # ICMP
@@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]:
         return "Established"
 
 
-def get_final_state_from_flags(self, state, pkts) -> str:
+def get_final_state_from_flags(state, pkts) -> str:
     """
-    Analyze the flags given and return a summary of the state.
-    Should work with Argus and Bro flags
-    We receive the pakets to distinguish some Reset connections
+    Converts the original flags from the flow, to a state that slips
+    understands
+    Works with Argus, suricata, and Bro flags
+    We receive the packets to distinguish some Reset connections
     """
-    try:
-        if state := check_suricata_states(state):
-            return state
-        if state := check_zeek_states(state):
-            return state
-        if state := check_argus_states(state):
-            return state
-    except IndexError:
-        # suf does not exist, which means that this is some ICMP or
-        # no response was sent for UDP or TCP
-        if state := check_icmp_states(state):
-            return state
-        if state := check_udp_states(state):
-            return state
-        if state := check_tcp_states(state, pkts):
-            return state
 
-        return "Not Established"
+    for interpreter in (
+        interpret_suricata_states,
+        interpret_zeek_states,
+        interpret_argus_states,
+        interpret_icmp_states,
+        interpret_udp_states,
+    ):
+        if interpreted_state := interpreter(state):
+            return interpreted_state
+
+    if interpreted_state := interpret_tcp_states(state, pkts):
+        return interpreted_state
 
-    except Exception:
-        exception_line = sys.exc_info()[2].tb_lineno
-        self.print(
-            f"Error in get_final_state_from_flags() " f"line {exception_line}",
-            0,
-            1,
-        )
-        self.print(traceback.format_exc(), 0, 1)
+    return "Not Established"

From 5d655d2d2d16440bc9bf6eb07262cbbba7bddb3d Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 346/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

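---
Notes (review):
    The subject mentions the profiler, but the diffstat below lists only
    flowmldetection.py and database_manager.py.
    process_features() now calls the bare name
    get_final_state_from_flags(), yet no hunk of this patch adds an
    import for it to flowmldetection.py (the first hunk starts at line
    121 on both sides, so nothing is inserted above it). Confirm the
    name is already in scope through an existing import.
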
 modules/flowmldetection/flowmldetection.py    | 150 ++----------------
 slips_files/core/database/database_manager.py |   3 -
 2 files changed, 10 insertions(+), 143 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 16b67e9038..3379f5077f 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """
@@ -291,12 +156,17 @@ def process_features(self, dataset):
                 except (ValueError, KeyError):
                     pass
 
-            # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
+            # When flows are read from Slips sqlite,
+            # the state is not transformed to 'Established' or
+            # 'Not Established', it is still 'S0' and others
             # So transform here
-            #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
-            dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
-
-            #dataset.state = new_state_column
+            dataset["state"] = dataset.apply(
+                lambda row: get_final_state_from_flags(
+                    row["state"], row["pkts"]
+                ),
+                axis=1,
+            )
+            # dataset.state = new_state_column
 
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(
diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py
index 1d339685f8..568e78ff45 100644
--- a/slips_files/core/database/database_manager.py
+++ b/slips_files/core/database/database_manager.py
@@ -613,9 +613,6 @@ def add_out_dns(self, *args, **kwargs):
     def add_port(self, *args, **kwargs):
         return self.rdb.add_port(*args, **kwargs)
 
-    def get_final_state_from_flags(self, *args, **kwargs):
-        return self.rdb.get_final_state_from_flags(*args, **kwargs)
-
     def add_ips(self, *args, **kwargs):
         return self.rdb.add_ips(*args, **kwargs)
 

From 8cd019f174817eee464c90c05ba2a3d60365a852 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 347/455] mlflow. Add a function to convert the state again

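---
Notes (review):
    Re-adds to flowmldetection.py the get_final_state_from_flags()
    method that PATCH 346/455 removed (blob 3379f5077f -> f052931c89).
    PATCH 348/455 reverts exactly this change (f052931c89 -> 3379f5077f),
    so the pair leaves the tree unchanged. process_features(), as left
    by 346, calls the module-level function rather than this method.
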
 modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 3379f5077f..f052931c89 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,6 +121,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From fdfd7fa0e06079e258530995ee65436f0f56bbf9 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 348/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

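---
Notes (review):
    Exact inverse of PATCH 347/455 (blob f052931c89 -> 3379f5077f).
    The subject is reused from PATCH 346/455, but this patch touches
    only flowmldetection.py.
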
 modules/flowmldetection/flowmldetection.py | 135 ---------------------
 1 file changed, 135 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index f052931c89..3379f5077f 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 5a5b751e2a4491b5cac57dfe3be26643d9d19b26 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 349/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 3379f5077f..f052931c89 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,6 +121,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 2400ee226cf7d7678e06570988af29782c1eec10 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 350/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 135 ---------------------
 1 file changed, 135 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index f052931c89..3379f5077f 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 457cf59da0f4e4be130f661a5eefb01b01c238d4 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 351/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 3379f5077f..f052931c89 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,6 +121,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From c35018ef7db18a9cb3b8facaee69b1dc3ec58479 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 352/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 135 ---------------------
 1 file changed, 135 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index f052931c89..3379f5077f 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 311e8de82f933c87f1d079613ebf2c8fd5e1a5c9 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 353/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 3379f5077f..f052931c89 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,6 +121,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 75bb4ea33838004df0241d5c68561b77f642e3de Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 354/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 135 ---------------------
 1 file changed, 135 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index f052931c89..3379f5077f 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 6be900429ac675632a0d35e137f45bcb025a12f1 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 355/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 3379f5077f..f052931c89 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,6 +121,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From e08f2903f4a43ae0ccdbce860e8e0639525ad2f7 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 356/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 135 ---------------------
 1 file changed, 135 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index f052931c89..3379f5077f 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 5de25cdb8e5b0d027fbc3df2f8f0467c2a53d489 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Sat, 15 Mar 2025 19:23:29 +0100
Subject: [PATCH 357/455] Re add function that alya added

---
 slips_files/core/database/database_manager.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py
index 568e78ff45..1d339685f8 100644
--- a/slips_files/core/database/database_manager.py
+++ b/slips_files/core/database/database_manager.py
@@ -613,6 +613,9 @@ def add_out_dns(self, *args, **kwargs):
     def add_port(self, *args, **kwargs):
         return self.rdb.add_port(*args, **kwargs)
 
+    def get_final_state_from_flags(self, *args, **kwargs):
+        return self.rdb.get_final_state_from_flags(*args, **kwargs)
+
     def add_ips(self, *args, **kwargs):
         return self.rdb.add_ips(*args, **kwargs)
 

From 2b614c84fb077b37ecff4613981bc5e7bc031574 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Sat, 15 Mar 2025 19:25:03 +0100
Subject: [PATCH 358/455] delete sys

---
 modules/flowmldetection/flowmldetection.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 3379f5077f..c06755a599 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -10,7 +10,6 @@
 import json
 import traceback
 import warnings
-import sys
 
 from slips_files.common.parsers.config_parser import ConfigParser
 from slips_files.common.slips_utils import utils

From 7bce2ca4fc01178dddafb04b4dcb64a8295e142c Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Sat, 15 Mar 2025 19:27:23 +0100
Subject: [PATCH 359/455] Delete file that was deleted from develop

---
 slips_files/common/state_handler.py | 170 ----------------------------
 1 file changed, 170 deletions(-)
 delete mode 100644 slips_files/common/state_handler.py

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
deleted file mode 100644
index d0a05115bd..0000000000
--- a/slips_files/common/state_handler.py
+++ /dev/null
@@ -1,170 +0,0 @@
-from typing import Optional
-
-
-def interpret_suricata_states(state) -> Optional[str]:
-    """
-    There are different states in which a flow can be.
-    Suricata distinguishes three flow-states for TCP and two for
-     UDP. For TCP,
-    these are: New, Established and Closed,for UDP only new and
-    established.
-    For each of these states Suricata can employ different timeouts.
-    """
-    if "new" in state or "established" in state:
-        return "Established"
-    elif "closed" in state:
-        return "Not Established"
-
-
-def interpret_zeek_states(state) -> Optional[str]:
-    # We have varius type of states depending on the type of flow.
-    # For Zeek
-    if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-        return "Not Established"
-    elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-        return "Established"
-
-
-def interpret_argus_states(state) -> Optional[str]:
-    pre = state.split("_")[0]
-    try:
-        suf = state.split("_")[1]
-    except IndexError:
-        return
-
-    if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-        """
-        Examples:
-        SA_SA
-        SR_SA
-        FSRA_SA
-        SPA_SPA
-        SRA_SPA
-        FSA_FSA
-        FSA_FSPA
-        SAEC_SPA
-        SRPA_SPA
-        FSPA_SPA
-        FSRPA_SPA
-        FSPA_FSPA
-        FSRA_FSPA
-        SRAEC_SPA
-        FSPA_FSRPA
-        FSAEC_FSPA
-        FSRPA_FSPA
-        SRPAEC_SPA
-        FSPAEC_FSPA
-        SRPAEC_FSRPA
-        """
-        return "Established"
-    elif "PA" in pre and "PA" in suf:
-        # Tipical flow that was reported in the middle
-        """
-        Examples:
-        PA_PA
-        FPA_FPA
-        """
-        return "Established"
-    elif "ECO" in pre:
-        return "ICMP Echo"
-    elif "ECR" in pre:
-        return "ICMP Reply"
-    elif "URH" in pre:
-        return "ICMP Host Unreachable"
-    elif "URP" in pre:
-        return "ICMP Port Unreachable"
-    else:
-        """
-        Examples:
-        S_RA
-        S_R
-        A_R
-        S_SA
-        SR_SA
-        FA_FA
-        SR_RA
-        SEC_RA
-        """
-        return "Not Established"
-
-
-def interpret_tcp_states(state, pkts) -> Optional[str]:
-    pre = state.split("_")[0]
-    if "EST" in pre:
-        # TCP
-        return "Established"
-    elif "RST" in pre:
-        # TCP. When -z B is not used in argus, states are single words.
-        # Most connections are reseted when finished and therefore are
-        # established
-        # It can happen that is reseted being not established, but we
-        # can't tell without -z b.
-        # So we use as heuristic the amount of packets. If <=3, then is
-        # not established because the OS retries 3 times.
-        return "Not Established" if int(pkts) <= 3 else "Established"
-    elif "FIN" in pre:
-        # TCP. When -z B is not used in argus, states are single words.
-        # Most connections are finished with FIN when finished and
-        # therefore are established
-        # It can happen that is finished being not established, but we
-        # can't tell without -z b.
-        # So we use as heuristic the amount of packets. If <=3, then is
-        # not established because the OS retries 3 times.
-        return "Not Established" if int(pkts) <= 3 else "Established"
-    else:
-        """
-        Examples:
-        S_
-        FA_
-        PA_
-        FSA_
-        SEC_
-        SRPA_
-        """
-        return "Not Established"
-
-
-def interpret_udp_states(state) -> Optional[str]:
-    pre = state.split("_")[0]
-    if "CON" in pre:
-        # UDP
-        return "Established"
-    elif "INT" in pre:
-        # UDP trying to connect, NOT preciselly not established but also
-        # NOT 'Established'. So we considered not established because there
-        # is no confirmation of what happened.
-        return "Not Established"
-
-
-def interpret_icmp_states(state) -> Optional[str]:
-    pre = state.split("_")[0]
-    if "ECO" in pre:
-        # ICMP
-        return "Established"
-    elif "UNK" in pre:
-        # ICMP6 unknown upper layer
-        return "Established"
-
-
-def get_final_state_from_flags(state, pkts) -> str:
-    """
-    Converts the original flags from the flow, to a state that slips
-    understands
-    Works with Argus, suricata, and Bro flags
-    We receive the packets to distinguish some Reset connections
-    """
-
-    for interpreter in (
-        interpret_suricata_states,
-        interpret_zeek_states,
-        interpret_argus_states,
-        interpret_icmp_states,
-        interpret_udp_states,
-    ):
-        if interpreted_state := interpreter(state):
-            return interpreted_state
-
-    if interpreted_state := interpret_tcp_states(state, pkts):
-        return interpreted_state
-
-    return "Not Established"

From 62cf6cd7fd287ff669faa225e315eed8ef045b73 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Sat, 15 Mar 2025 19:32:01 +0100
Subject: [PATCH 360/455] Flowmldetection. Fix missing db reference

---
 modules/flowmldetection/flowmldetection.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index c06755a599..87e07c7592 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -160,7 +160,7 @@ def process_features(self, dataset):
             # 'Not Established', it is still 'S0' and others
             # So transform here
             dataset["state"] = dataset.apply(
-                lambda row: get_final_state_from_flags(
+                lambda row: self.db.get_final_state_from_flags(
                     row["state"], row["pkts"]
                 ),
                 axis=1,

From 4c8f42673eac97e521e16d94d3bbbe03138d3e4f Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Tue, 18 Mar 2025 12:08:08 +0100
Subject: [PATCH 361/455] Fix the training of flows with ML in new version

---
 modules/flowmldetection/flowmldetection.py | 144 +++++++++++----------
 1 file changed, 77 insertions(+), 67 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 87e07c7592..e91495d649 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -55,8 +55,12 @@ def init(self):
         # Set the output queue of our database instance
         # Read the configuration
         self.read_configuration()
-        # Minum amount of new lables needed to trigger the train
-        self.minimum_lables_to_retrain = 50
+        # Minum amount of new labels needed to start the train
+        self.minimum_labels_to_start_train = 50
+        # Minum amount of new labels needed to retrain
+        self.minimum_labels_to_retrain = 50
+        # The number of flows when last trained
+        self.last_number_of_flows_when_trained = 0
         # To plot the scores of training
         # self.scores = []
         # The scaler trained during training and to use during testing
@@ -67,26 +71,25 @@ def init(self):
     def read_configuration(self):
         conf = ConfigParser()
         self.mode = conf.get_ml_mode()
+        self.label = conf.label()
 
     def train(self):
         """
         Train a model based on the flows we receive and the labels
         """
         try:
-            # Process the labels to have only Normal and Malware
-            self.flows.label = self.flows.label.str.replace(
-                r"(^.*ormal.*$)", "Normal", regex=True
-            )
-            self.flows.label = self.flows.label.str.replace(
-                r"(^.*alware.*$)", "Malware", regex=True
-            )
-            self.flows.label = self.flows.label.str.replace(
-                r"(^.*alicious.*$)", "Malware", regex=True
-            )
+            # Get the flows from the DB
+            # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid)
+            # Convert to pandas df
+            # self.flows = pd.DataFrame(self.flows)
+            # Process the features
+            # X_flow = self.process_features(self.flows)
 
-            # Separate
-            y_flow = self.flows["label"]
+            # Create X_flow with the current flows minus the label
             X_flow = self.flows.drop("label", axis=1)
+            # Create y_flow with the label
+            y_flow = numpy.full(X_flow.shape[0], self.label)
+            # Drop the module_labels
             X_flow = X_flow.drop("module_labels", axis=1)
 
             # Normalize this batch of data so far. This can get progressivle slow
@@ -95,7 +98,7 @@ def train(self):
             # Train
             try:
                 self.clf.partial_fit(
-                    X_flow, y_flow, classes=["Malware", "Normal"]
+                    X_flow, y_flow, classes=["Malicious", "Benign"]
                 )
             except Exception:
                 self.print("Error while calling clf.train()")
@@ -118,7 +121,7 @@ def train(self):
             self.store_model()
 
         except Exception:
-            self.print("Error in train()", 0, 1)
+            self.print("Error in train().", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
@@ -144,9 +147,7 @@ def process_features(self, dataset):
                 "history",
                 "uid",
                 "dir_",
-                "dbytes",
                 "endtime",
-                "bytes",
                 "flow_source",
             ]
             for field in to_drop:
@@ -161,11 +162,10 @@ def process_features(self, dataset):
             # So transform here
             dataset["state"] = dataset.apply(
                 lambda row: self.db.get_final_state_from_flags(
-                    row["state"], row["pkts"]
+                    row["state"], (row["spkts"] + row["dpkts"])
                 ),
                 axis=1,
             )
-            # dataset.state = new_state_column
 
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(
@@ -199,7 +199,11 @@ def process_features(self, dataset):
             dataset.proto = dataset.proto.str.replace(
                 r"(^.*arp.*$)", "4", regex=True
             )
-            fields_to_convert_to_flow = [
+
+            dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"]
+            dataset["pkts"] = dataset["spkts"] + dataset["dpkts"]
+
+            fields_to_convert_to_float = [
                 dataset.proto,
                 dataset.dport,
                 dataset.sport,
@@ -210,10 +214,10 @@ def process_features(self, dataset):
                 dataset.sbytes,
                 dataset.state,
             ]
-            for field in fields_to_convert_to_flow:
+            for field in fields_to_convert_to_float:
                 try:
                     field = field.astype("float64")
-                except ValueError:
+                except (ValueError, AttributeError):
                     pass
 
             return dataset
@@ -222,9 +226,9 @@ def process_features(self, dataset):
             self.print("Error in process_features()")
             self.print(traceback.format_exc(), 0, 1)
 
-    def process_flows(self):
+    def process_training_flows(self):
         """
-        Process all the flwos in the DB
+        Process all the flows in the DB
         Store the pandas df in self.flows
         """
         try:
@@ -240,44 +244,48 @@ def process_flows(self):
                 # that are fake but representative of a normal and malware flow
                 # they are only for the training process
                 # At least 1 flow of each label is required
-                # self.print(f'Amount of labeled flows: {labels}', 0, 1)
+
+                # These flows should be in the same format as the ones in the DB. 
+                # Which means the satate is still SF, S0, etc.
                 flows.append(
                     {
-                        "ts": 1594417039.029793,
+                        "starttime": 1594417039.029793,
                         "dur": "1.9424750804901123",
                         "saddr": "10.7.10.101",
                         "sport": "49733",
                         "daddr": "40.70.224.145",
                         "dport": "443",
                         "proto": "tcp",
-                        "state": "Established",
-                        "allbytes": 42764,
-                        "spkts": 37,
+                        "state": "SF",
+                        "spkts": 17,
+                        "dpkts": 27,
                         "sbytes": 25517,
+                        "dbytes": 17247,
                         "appproto": "ssl",
-                        "label": "Malware",
+                        "label": "Malicious",
                         "module_labels": {
-                            "flowalerts-long-connection": "Malware"
+                            "flowalerts-long-connection": "Malicious"
                         },
                     }
                 )
                 flows.append(
                     {
-                        "ts": 1382355032.706468,
+                        "starttime": 1382355032.706468,
                         "dur": "10.896695",
                         "saddr": "147.32.83.52",
                         "sport": "47956",
                         "daddr": "80.242.138.72",
                         "dport": "80",
                         "proto": "tcp",
-                        "state": "Established",
-                        "allbytes": 67696,
+                        "state": "SF",
                         "spkts": 1,
+                        "dpkts": 0,
                         "sbytes": 100,
+                        "dbytes": 67596,
                         "appproto": "http",
-                        "label": "Normal",
+                        "label": "Benign",
                         "module_labels": {
-                            "flowalerts-long-connection": "Normal"
+                            "flowalerts-long-connection": "Benign"
                         },
                     }
                 )
@@ -318,7 +326,6 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
         and returns the predection array
         """
         try:
-            given_x_flow = x_flow
             # clean the flow
             fields_to_drop = [
                 "label",
@@ -326,10 +333,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
                 "uid",
                 "history",
                 "dir_",
-                "dbytes",
-                "dpkts",
                 "endtime",
-                "bytes",
                 "flow_source",
                 "ground_truth_label",  # todo now we can use them
                 "detailed_ground_truth_label",
@@ -345,7 +349,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
             return pred
         except Exception as e:
             self.print(
-                f"Error in detect() while processing " f"\n{given_x_flow}\n{e}"
+                f"Error in detect() while processing " f"\n{x_flow}\n{e}"
             )
             self.print(traceback.format_exc(), 0, 1)
 
@@ -437,18 +441,16 @@ def pre_main(self):
 
     def main(self):
         if msg := self.get_msg("new_flow"):
+            # When a new flow arrives
             msg = json.loads(msg["data"])
-            twid = msg["twid"]
+            self.twid = msg["twid"]
+            self.profileid = msg["profileid"]
             self.flow = msg["flow"]
-            # these fields are expected in testing. update the original
+            # These following extra fields are expected in testing. update the original
             # flow dict to have them
             self.flow.update(
                 {
-                    "allbytes": (self.flow["sbytes"] + self.flow["dbytes"]),
-                    # the flow["state"] is the origstate, we dont need that here
-                    # we need the interpreted state
                     "state": msg["interpreted_state"],
-                    "pkts": self.flow["spkts"] + self.flow["dpkts"],
                     "label": msg["label"],
                     "module_labels": msg["module_labels"],
                 }
@@ -461,23 +463,31 @@ def main(self):
                 # Use labeled flows
                 labels = self.db.get_labels()
                 sum_labeled_flows = sum(i[1] for i in labels)
+
+                # The min labels to retrain is the min number of flows 
+                # we should have seen so far in this capture to start training
+                # This is so we dont _start_ training with only 1 flow
+
+                # Once we are over the start minimum, the second condition is 
+                # to force to retrain every a minimum_labels_to_retrain number
+                # of flows. So we dont retrain every 1 flow.
                 if (
-                    sum_labeled_flows >= self.minimum_lables_to_retrain
-                    and sum_labeled_flows % self.minimum_lables_to_retrain == 1
+                    sum_labeled_flows >= self.minimum_labels_to_start_train
                 ):
-                    # We get here every 'self.minimum_lables_to_retrain'
-                    # amount of labels
-                    # So for example we retrain every 100 labels and only when
-                    # we have at least 100 labels
-                    self.print(
-                        f"Training the model with the last group of "
-                        f"flows and labels. Total flows: {sum_labeled_flows}."
-                    )
-                    # Process all flows in the DB and make them ready
-                    # for pandas
-                    self.process_flows()
-                    # Train an algorithm
-                    self.train()
+                    if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain):
+                        # So for example we retrain every 50 labels and only when
+                        # we have at least 50 labels
+                        self.print(
+                            f"Training the model with the last group of "
+                            f"flows and labels. Total flows: {sum_labeled_flows}."
+                        )
+                        # Process all flows in the DB and make them ready
+                        # for pandas
+                        self.process_training_flows()
+                        # Train an algorithm
+                        self.train()
+                        self.last_number_of_flows_when_trained = sum_labeled_flows
+
             elif self.mode == "test":
                 # We are testing, which means using the model to detect
                 processed_flow = self.process_flow(self.flow)
@@ -497,8 +507,8 @@ def main(self):
                         # and the label is diff from the prediction,
                         # print in debug mode
                         self.print(
-                            f"Report Prediction {pred[0]} for label"
-                            f' {label} flow {self.flow["saddr"]}:'
+                            f"Predicted {pred[0]} for ground-truth label"
+                            f' {label}. Flow {self.flow["saddr"]}:'
                             f'{self.flow["sport"]} ->'
                             f' {self.flow["daddr"]}:'
                             f'{self.flow["dport"]}/'
@@ -506,9 +516,9 @@ def main(self):
                             0,
                             3,
                         )
-                    if pred[0] == "Malware":
+                    if pred[0] == "Malicious":
                         # Generate an alert
-                        self.set_evidence_malicious_flow(self.flow, twid)
+                        self.set_evidence_malicious_flow(self.flow, self.twid)
                         self.print(
                             f"Prediction {pred[0]} for label {label}"
                             f' flow {self.flow["saddr"]}:'

From 7a1e10fb8a2e19c8a158e05aa9c9fda0157cdbd6 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Tue, 18 Mar 2025 12:08:29 +0100
Subject: [PATCH 362/455] Fix the profiler handler for cases of nan in state

---
 slips_files/core/database/redis_db/profile_handler.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/slips_files/core/database/redis_db/profile_handler.py b/slips_files/core/database/redis_db/profile_handler.py
index a6669c92a9..ab53cc4ab0 100644
--- a/slips_files/core/database/redis_db/profile_handler.py
+++ b/slips_files/core/database/redis_db/profile_handler.py
@@ -423,6 +423,7 @@ def get_final_state_from_flags(self, state, pkts):
                     return "Established"
 
                 # For Argus
+                # In some flows the state is a nan
                 try:
                     suf = state.split("_")[1]
                 except AttributeError:

From c76c96344d42a17d3c3e5d51c868abe3896e5d76 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 19 Mar 2025 14:22:38 +0100
Subject: [PATCH 363/455] flowml. If the dataset has one flow and that is
 deleted, then return empty fast.

---
 modules/flowmldetection/flowmldetection.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index e91495d649..58b4ce1e4c 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -135,6 +135,11 @@ def process_features(self, dataset):
             for proto in to_discard:
                 dataset = dataset[dataset.proto != proto]
 
+            # If te proto is in the list to delete and there is only one flow, then the dataset will be empty
+            if dataset.empty:
+                # DataFrame is empty now, so return empty
+                return dataset
+
             # For now, discard these
             to_drop = [
                 "appproto",

From 74007e82690dbbd14787bd237f37e5507ca62b90 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 19 Mar 2025 14:23:05 +0100
Subject: [PATCH 364/455] flowml. If the dataset is empty. Return none

---
 modules/flowmldetection/flowmldetection.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 58b4ce1e4c..4a4d46e376 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -318,6 +318,8 @@ def process_flow(self, flow_to_process: dict):
             # Convert the flow to a pandas dataframe
             raw_flow = pd.DataFrame(flow_to_process, index=[0])
             dflow = self.process_features(raw_flow)
+            if dflow.empty:
+                return None
             # Update the flow to the processed version
             return dflow
         except Exception:

From deefde05178f98f7b1ef9ee9c7b54c6b549b0f5b Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 19 Mar 2025 14:27:16 +0100
Subject: [PATCH 365/455] First new version of the model and scaler. Not good
 yet, but working.

---
 modules/flowmldetection/model.bin  | Bin 1124 -> 1090 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index a6648cf72179520975b0e9ad1164f7d574e87140..7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f 100644
GIT binary patch
delta 130
zcmV-|0Db@D2*L;ifCQB{u>>gtTsw(FScJ$7xocP<K++%n9U}QQz-#d@sqf+^z>4|5
zn$$WZz^$}67u&zYKfDP4vYKo~K<PaAW)J{HKp5zr*LvX7zXT;Xi&zvCK#B|cdeNaM
kK-{i#qAdnGK$EKjD+pMWFI7TSSU{8B162Zxn3FXGFXLA@Pyhe`

delta 131
zcmV-}0DS+#2;>L^fCQCUu>>gtlhXnvSy-jcOc_je6TteGguV+y7r>O2=ii6N`@aE8
z4q<hm<v;E**(1biC&1;CKfROVA3%9ur;SK<>_05T(v)g91VHfmFeIMvRKFpJJ~89v
lBES;fQwQSX>_3x<11kt29Z*;7CnUg=yaQDN?(vfo1TSe6JFWl#

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
index 17115724b9536f6093f9d72f3b58a5c22c562a9a..bfba4d107224e5e6e5a1e8c8f4d463b48131d111 100644
GIT binary patch
delta 290
zcmV+-0p0%k2KolDvjGBX0h6@>Pgl=^Tj5WVAwXGOhP~3C<3D)c8d<%4*gy{6w?@|U
z%s@r+gj&Fu5J2_kY{!Qn^*`?T5Tx7$dq73=gj&E@c0d))6KVlt!ao`IN`KC-h(JOu
zU`*oR6_Z;6CRfo>BRz3ectB?@J_D&+&OhD+nN>A{w?PryCGc<cwm~#B-^|8$dO)gh
z=RSTUXFv*d?raebbV0J?)%52KY(b1c9N@Tpzdv9l8J2bKf<dkRFcK&rl9Sj1C|3#7
zOux9hGeC5AQV0+$-#-%10%-4K(?Bp(=SPMr(?B|Mf3GT(G(g2@E?by3D?owf0UW{U
ou|UJ?4^Yqbtw7cLsJ&lu)<1f`V88dAxIjbzY_OOIL6cVlBeWBRwg3PC

delta 290
zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG
zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa
zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r-wJEWTtnm`Ac
zf^|*Gc0elJHxDoXo<X&kLG{%4&p`{bHVfI|sz2Uv$hvzr(m{gu&Jr}1sgu|OC|8h9
z&D_B~i9nx5TWFn3t3OFIt}1{vt3Z{LgvGX?u|R!8|2|A?L_o(e@1J}aGeCBvuU^0C
o#Xz85;5X{~-$0+hirZ#X%|CGFfPfMb;Xu)C9icy+Op{jwBjCh|6aWAK


From 8112079459716d801fa5e59ba60eeb86b79c9631 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Thu, 20 Mar 2025 13:16:06 +0100
Subject: [PATCH 366/455] model and scaler with 1 malicious and 1 benign

---
 modules/flowmldetection/model.bin  | Bin 1090 -> 1124 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index 7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f..0fac693b39f8e2f0e826471e72a52010709a2a4a 100644
GIT binary patch
delta 132
zcmX@a@q~k=fn{psMix!x$(NZ_BO`RCb>5jSY<wU!LGn&}gUNx$3tLYnGl@A&e4S}J
zBSY*!uI-OaTr&&~Tw#uuJJhD?&}O=|yn5R|`<q--?Rc;A9e69=F=gK7d-lipbZtG?
j>K$N!g2|oC+8oRs(l<p`Iv<$4hdGq_%F%|&A}o3UHQY43

delta 131
zcmV-}0DS-C2*L;ifCQB{u>>gtTsw(FScJ$7xocP<K++%n9U}QQz-#d@sqf+^z>4|5
zn$$WZz^$}67u&zYKfDP4vYKo~K<PaAW)J{HKp5zr*LvX7zXT;Xi&zvCK#B|cdeNaM
lK-{i#qAdnGK$EKjD+pMWFI7TSSU{8B162cym;jSD1TGrII8guq

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
index bfba4d107224e5e6e5a1e8c8f4d463b48131d111..758909b289238ff282b2e056a9b3e83768b8472a 100644
GIT binary patch
delta 290
zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG
zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa
zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r!tJEWTtnm`Ac
zf^|*Gc0elJHxDoXo<X#jLG{%4&p`{bHVfI|sz2Ot$hvzr(m{gu&Jr}1sgu|OC|8h9
z&D_B~i9nx5TWFn3t3OFIt}1{vt3Z{LgvGX?u|Rx7|2|A?L_o(e@1J}aGeCBvuU^0C
o#Xz54;5X{~-$0+hirZ#X%|CDEfPfMb;Xu)C9icy+Op{jwBi%8H3jhEB

delta 290
zcmV+-0p0%k2KolDvjGBX0h6@>Pgl=^Tj5WVAwXGOhP~3C<3D)c8d<%4*gy{6w?@|U
z%s@r+gj&Fu5J2_kY{!Qn^*`?T5Tx7$dq73=gj&E@c0d))6KVlt!ao`IN`KC-h(JOu
zU`*oR6_Z;6CRfo>BRz3ectB?@J_D&+&OhD+nN>A{w?PryCGc<cwm~#B-^|8$dO)gh
z=RSTUXFv*d?raebbV0J?)%52KY(b1c9N@Tpzdv9l8J2bKf<dkRFcK&rl9Sj1C|3#7
zOux9hGeC5AQV0+$-#-%10%-4K(?Bp(=SPMr(?B|Mf3GT(G(g2@E?by3D?owf0UW{U
ou|UJ?4^Yqbtw7cLsJ&lu)<1f`V88dAxIjbzY_OOIL6cVlBeWBRwg3PC


From af5bc46e9034d75d51472d09db456d58e7403e1f Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Thu, 20 Mar 2025 13:16:27 +0100
Subject: [PATCH 367/455] cleaner jupyter

---
 modules/flowmldetection/flowmldetection.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 4a4d46e376..d8e9ada27c 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -345,6 +345,23 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
                 "ground_truth_label",  # todo now we can use them
                 "detailed_ground_truth_label",
             ]
+            # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes.
+            # Error
+            ''' [Flow ML Detection] Error in detect() while processing                                                                                                                                                                                                                         
+            dur proto  sport dport  state  pkts  spkts  dpkts  bytes  sbytes  dbytes  allbytes                                                                                                                                                                                    
+            0  63.822830     0  56119   981    0.0    15     15      0   8764    1887       0      1887                                                                                                                                                                                    
+            The feature names should match those that were passed during fit.                                                                                                                                                                                                              
+            Feature names unseen at fit time:                                                                                                                                                                                                                                              
+            - bytes     
+            '''
+
+            # IF we delete here the filed bytes the error is
+            # [Flow ML Detection] Error in detect() while processing 
+            # dur proto sport dport  state  pkts  spkts  dpkts  sbytes  dbytes allbytes                                                                                                                                                             
+            # 0  63.822830     0  56120   980    0.0    15     15      0    1887       0      1887                                                                                                                                                             
+            # The feature names should match those that were passed during fit.                                                                                                                                                                                
+            # Feature names must be in the same order as they were in fit.                                                                                                                                                                                     
+                                                                                                                                                                                                                                                 
             for field in fields_to_drop:
                 try:
                     x_flow = x_flow.drop(field, axis=1)

From b558c05d455ee9651e29e7eef3d4045ad1241ade Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Thu, 20 Mar 2025 22:26:27 +0100
Subject: [PATCH 368/455] New models after 3rd train

---
 modules/flowmldetection/model.bin  | Bin 1124 -> 1124 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index 0fac693b39f8e2f0e826471e72a52010709a2a4a..5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8 100644
GIT binary patch
delta 99
zcmaFD@q}YTFtfkevYaFSxkd-_3;rCozh!)2lYmpEbEdGvA?<Hh%FhcQcr$C8Y3pPC
z18<ZS-x*q|IJlZU`}#BRi@kOC+S)TqI1eyD!9BgtJUbI~Cof{2$owSaz+^KPJplbX
BD~|vG

delta 99
zcmV-p0G$8i2;>N`Qv**dmWkY#6EnbNU>V$pfHc5?(6z~90v14#>uEKZXcoY2J@~Q;
zm@vT916v-#h9y9THMMtpw*Eia3Y9($*ABq#7l@Rev)(_(4lO-NtuB+I1CayN#ekDG
F1TN2UERg^J

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
index 758909b289238ff282b2e056a9b3e83768b8472a..821344a0c69d116622b02e2a0daa1554cb5d308e 100644
GIT binary patch
delta 43
zcmV+`0M!5b2KolDfdU!c4zU>a=p#W80?-ZtN@qVmt(3Z<B4<G|lbQl%0g;md19^W6
B5HkP(

delta 43
zcmV+`0M!5b2KolDfdU!a4zU>a=p#W80?-ZtN@qVmt(3Z<B4<G}lbQl%0g#gc19^VR
B5HSD%


From f8b36d65f2c942c3ed4abf5ce3a8cdbadcfd8ffb Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 26 Mar 2025 00:08:50 +0100
Subject: [PATCH 369/455] Models after 4th train

---
 modules/flowmldetection/model.bin  | Bin 1124 -> 1124 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index 5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8..3ab5a240bb45f88d026d1d9d1959cfa384e2473b 100644
GIT binary patch
delta 120
zcmV-;0EhqN2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv*
zM<l=vT(CiV7%o61L>A;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P
as{<<t5IC<!D(NIZlimYX0{noJHUuueR5UOE

delta 120
zcmaFD@q}YTFmr_2vYaFSxkd-_3;rCozh!)2lYmpEbEdGvA?<Hh%FhcQcr$C8Y3pPC
z18<ZS-x*q|IJlZU`}#BRi@kOC+S)TqI1eyD!9BgtJUbI~;rz*~nYB5XJEU)ltaLsw
P`5to^Q^<kIW-NLDlQb|V

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
index 821344a0c69d116622b02e2a0daa1554cb5d308e..29df65342047c5a499ee3f8e602d1f47cb7e9fca 100644
GIT binary patch
delta 290
zcmV+-0p0%k2KolDvjGB90h6@>PghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB
z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m
z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A
za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o
zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9
ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}Dxj<tAT~->ML6cVlBO2|D9RL6T

delta 290
zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG
zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa
zjm~^LK$BYnCRg4Lu^9H~BS8=X&<+7gXFor!l)9lJXF)RTPqWf%b3r!tJEWTtnm`Ac
zf^|*Gc0elJHxDoXo<X#jLG{%4&p`{bHVfI|sz2Ot$hvzr(m{gu&Jr}1sgu|OC|8kA
z&D_B~i9nx5TWFn3t3OFIt}1{vt3Z{LgvGX?u|Rx7|2|A?L_o(e@1J}aGeCBvuU^0C
o#Xz54;5X{~-$0+hirZ#X%|CDEfPfMb;Xu)C9icy+Op{jwBi>ty4FCWD


From 4a448bc3b8ece80ad6b783d0809e6c93ad0c452e Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 26 Mar 2025 08:28:59 +0100
Subject: [PATCH 370/455] Models of ml flow with the first good performance in
 small tests

---
 modules/flowmldetection/model.bin  | Bin 1124 -> 1124 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index 3ab5a240bb45f88d026d1d9d1959cfa384e2473b..a6648cf72179520975b0e9ad1164f7d574e87140 100644
GIT binary patch
delta 121
zcmV-<0EYkM2;>N`Qv+C~&P*9hb`!w*mV~|wLl?l5mFM4w$NRqlOAcXmpyfaAG1();
zYA3+ulRv$a;~zkIU#E>ocI-ba#L|>%Hv~ZN`Y<G(@KnDch(0mn<s!fm-ctwS<Lp0^
bs{<<tAstXx>?b6^limYW1McwvlQsk{8#y@u

delta 121
zcmV-<0EYkM2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv*
zM<l=vT(CiV7%o61L>A;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P
bs{<<t5IC<!D(NIZlimYW1LXXGlQsk{_eeDq

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
index 29df65342047c5a499ee3f8e602d1f47cb7e9fca..17115724b9536f6093f9d72f3b58a5c22c562a9a 100644
GIT binary patch
delta 290
zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG
zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa
zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r-wJEWTtnm`Ac
zf^|*Gc0elJHxDoXo<X&kLG{%4&p`{bHVfI|sz2Uv$hvzr(m{gu&Jr}1sgu|OC|8h9
z&D_B~i9nx5TWFn3t3OFIt}1{vt3Z{LgvGX?u|R!8|2|A?L_o(e@1J}aGeCBvuU^0C
o#Xz85;5X{~-$0+hirZ#X%|CGFfPfMb;Xu)C9icy+Op{jwBjCh|6aWAK

delta 290
zcmV+-0p0%k2KolDvjGB90h6@>PghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB
z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m
z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A
za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o
zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9
ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}Dxj<tAT~->ML6cVlBO2|D9RL6T


From a2b5b9917a802f3810fa3c7b4719e69dfbb1b37c Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 371/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 319 +++++++++++++--------
 1 file changed, 207 insertions(+), 112 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index d8e9ada27c..1fa77de01c 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -10,6 +10,7 @@
 import json
 import traceback
 import warnings
+import sys
 
 from slips_files.common.parsers.config_parser import ConfigParser
 from slips_files.common.slips_utils import utils
@@ -55,12 +56,8 @@ def init(self):
         # Set the output queue of our database instance
         # Read the configuration
         self.read_configuration()
-        # Minum amount of new labels needed to start the train
-        self.minimum_labels_to_start_train = 50
-        # Minum amount of new labels needed to retrain
-        self.minimum_labels_to_retrain = 50
-        # The number of flows when last trained
-        self.last_number_of_flows_when_trained = 0
+        # Minum amount of new lables needed to trigger the train
+        self.minimum_lables_to_retrain = 50
         # To plot the scores of training
         # self.scores = []
         # The scaler trained during training and to use during testing
@@ -71,25 +68,26 @@ def init(self):
     def read_configuration(self):
         conf = ConfigParser()
         self.mode = conf.get_ml_mode()
-        self.label = conf.label()
 
     def train(self):
         """
         Train a model based on the flows we receive and the labels
         """
         try:
-            # Get the flows from the DB
-            # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid)
-            # Convert to pandas df
-            # self.flows = pd.DataFrame(self.flows)
-            # Process the features
-            # X_flow = self.process_features(self.flows)
+            # Process the labels to have only Normal and Malware
+            self.flows.label = self.flows.label.str.replace(
+                r"(^.*ormal.*$)", "Normal", regex=True
+            )
+            self.flows.label = self.flows.label.str.replace(
+                r"(^.*alware.*$)", "Malware", regex=True
+            )
+            self.flows.label = self.flows.label.str.replace(
+                r"(^.*alicious.*$)", "Malware", regex=True
+            )
 
-            # Create X_flow with the current flows minus the label
+            # Separate
+            y_flow = self.flows["label"]
             X_flow = self.flows.drop("label", axis=1)
-            # Create y_flow with the label
-            y_flow = numpy.full(X_flow.shape[0], self.label)
-            # Drop the module_labels
             X_flow = X_flow.drop("module_labels", axis=1)
 
             # Normalize this batch of data so far. This can get progressivle slow
@@ -98,7 +96,7 @@ def train(self):
             # Train
             try:
                 self.clf.partial_fit(
-                    X_flow, y_flow, classes=["Malicious", "Benign"]
+                    X_flow, y_flow, classes=["Malware", "Normal"]
                 )
             except Exception:
                 self.print("Error while calling clf.train()")
@@ -121,7 +119,142 @@ def train(self):
             self.store_model()
 
         except Exception:
-            self.print("Error in train().", 0, 1)
+            self.print("Error in train()", 0, 1)
+            self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
             self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
@@ -135,12 +268,7 @@ def process_features(self, dataset):
             for proto in to_discard:
                 dataset = dataset[dataset.proto != proto]
 
-            # If te proto is in the list to delete and there is only one flow, then the dataset will be empty
-            if dataset.empty:
-                # DataFrame is empty now, so return empty
-                return dataset
-
-            # For now, discard these
+            # For now, discard the ports
             to_drop = [
                 "appproto",
                 "daddr",
@@ -152,7 +280,9 @@ def process_features(self, dataset):
                 "history",
                 "uid",
                 "dir_",
+                "dbytes",
                 "endtime",
+                "bytes",
                 "flow_source",
             ]
             for field in to_drop:
@@ -161,16 +291,12 @@ def process_features(self, dataset):
                 except (ValueError, KeyError):
                     pass
 
-            # When flows are read from Slips sqlite,
-            # the state is not transformed to 'Established' or
-            # 'Not Established', it is still 'S0' and others
+            # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
             # So transform here
-            dataset["state"] = dataset.apply(
-                lambda row: self.db.get_final_state_from_flags(
-                    row["state"], (row["spkts"] + row["dpkts"])
-                ),
-                axis=1,
-            )
+            #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
+            dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
+
+            #dataset.state = new_state_column
 
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(
@@ -204,11 +330,7 @@ def process_features(self, dataset):
             dataset.proto = dataset.proto.str.replace(
                 r"(^.*arp.*$)", "4", regex=True
             )
-
-            dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"]
-            dataset["pkts"] = dataset["spkts"] + dataset["dpkts"]
-
-            fields_to_convert_to_float = [
+            fields_to_convert_to_flow = [
                 dataset.proto,
                 dataset.dport,
                 dataset.sport,
@@ -219,10 +341,10 @@ def process_features(self, dataset):
                 dataset.sbytes,
                 dataset.state,
             ]
-            for field in fields_to_convert_to_float:
+            for field in fields_to_convert_to_flow:
                 try:
                     field = field.astype("float64")
-                except (ValueError, AttributeError):
+                except ValueError:
                     pass
 
             return dataset
@@ -231,9 +353,9 @@ def process_features(self, dataset):
             self.print("Error in process_features()")
             self.print(traceback.format_exc(), 0, 1)
 
-    def process_training_flows(self):
+    def process_flows(self):
         """
-        Process all the flows in the DB
+        Process all the flwos in the DB
         Store the pandas df in self.flows
         """
         try:
@@ -249,48 +371,44 @@ def process_training_flows(self):
                 # that are fake but representative of a normal and malware flow
                 # they are only for the training process
                 # At least 1 flow of each label is required
-
-                # These flows should be in the same format as the ones in the DB. 
-                # Which means the satate is still SF, S0, etc.
+                # self.print(f'Amount of labeled flows: {labels}', 0, 1)
                 flows.append(
                     {
-                        "starttime": 1594417039.029793,
+                        "ts": 1594417039.029793,
                         "dur": "1.9424750804901123",
                         "saddr": "10.7.10.101",
                         "sport": "49733",
                         "daddr": "40.70.224.145",
                         "dport": "443",
                         "proto": "tcp",
-                        "state": "SF",
-                        "spkts": 17,
-                        "dpkts": 27,
+                        "state": "Established",
+                        "allbytes": 42764,
+                        "spkts": 37,
                         "sbytes": 25517,
-                        "dbytes": 17247,
                         "appproto": "ssl",
-                        "label": "Malicious",
+                        "label": "Malware",
                         "module_labels": {
-                            "flowalerts-long-connection": "Malicious"
+                            "flowalerts-long-connection": "Malware"
                         },
                     }
                 )
                 flows.append(
                     {
-                        "starttime": 1382355032.706468,
+                        "ts": 1382355032.706468,
                         "dur": "10.896695",
                         "saddr": "147.32.83.52",
                         "sport": "47956",
                         "daddr": "80.242.138.72",
                         "dport": "80",
                         "proto": "tcp",
-                        "state": "SF",
+                        "state": "Established",
+                        "allbytes": 67696,
                         "spkts": 1,
-                        "dpkts": 0,
                         "sbytes": 100,
-                        "dbytes": 67596,
                         "appproto": "http",
-                        "label": "Benign",
+                        "label": "Normal",
                         "module_labels": {
-                            "flowalerts-long-connection": "Benign"
+                            "flowalerts-long-connection": "Normal"
                         },
                     }
                 )
@@ -318,8 +436,6 @@ def process_flow(self, flow_to_process: dict):
             # Convert the flow to a pandas dataframe
             raw_flow = pd.DataFrame(flow_to_process, index=[0])
             dflow = self.process_features(raw_flow)
-            if dflow.empty:
-                return None
             # Update the flow to the processed version
             return dflow
         except Exception:
@@ -333,6 +449,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
         and returns the predection array
         """
         try:
+            given_x_flow = x_flow
             # clean the flow
             fields_to_drop = [
                 "label",
@@ -340,28 +457,12 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
                 "uid",
                 "history",
                 "dir_",
+                "dbytes",
+                "dpkts",
                 "endtime",
+                "bytes",
                 "flow_source",
-                "ground_truth_label",  # todo now we can use them
-                "detailed_ground_truth_label",
             ]
-            # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes.
-            # Error
-            ''' [Flow ML Detection] Error in detect() while processing                                                                                                                                                                                                                         
-            dur proto  sport dport  state  pkts  spkts  dpkts  bytes  sbytes  dbytes  allbytes                                                                                                                                                                                    
-            0  63.822830     0  56119   981    0.0    15     15      0   8764    1887       0      1887                                                                                                                                                                                    
-            The feature names should match those that were passed during fit.                                                                                                                                                                                                              
-            Feature names unseen at fit time:                                                                                                                                                                                                                                              
-            - bytes     
-            '''
-
-            # IF we delete here the filed bytes the error is
-            # [Flow ML Detection] Error in detect() while processing 
-            # dur proto sport dport  state  pkts  spkts  dpkts  sbytes  dbytes allbytes                                                                                                                                                             
-            # 0  63.822830     0  56120   980    0.0    15     15      0    1887       0      1887                                                                                                                                                             
-            # The feature names should match those that were passed during fit.                                                                                                                                                                                
-            # Feature names must be in the same order as they were in fit.                                                                                                                                                                                     
-                                                                                                                                                                                                                                                 
             for field in fields_to_drop:
                 try:
                     x_flow = x_flow.drop(field, axis=1)
@@ -373,7 +474,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
             return pred
         except Exception as e:
             self.print(
-                f"Error in detect() while processing " f"\n{x_flow}\n{e}"
+                f"Error in detect() while processing " f"\n{given_x_flow}\n{e}"
             )
             self.print(traceback.format_exc(), 0, 1)
 
@@ -465,16 +566,18 @@ def pre_main(self):
 
     def main(self):
         if msg := self.get_msg("new_flow"):
-            # When a new flow arrives
             msg = json.loads(msg["data"])
-            self.twid = msg["twid"]
-            self.profileid = msg["profileid"]
+            twid = msg["twid"]
             self.flow = msg["flow"]
-            # These following extra fields are expected in testing. update the original
+            # these fields are expected in testing. update the original
             # flow dict to have them
             self.flow.update(
                 {
+                    "allbytes": (self.flow["sbytes"] + self.flow["dbytes"]),
+                    # the flow["state"] is the origstate, we dont need that here
+                    # we need the interpreted state
                     "state": msg["interpreted_state"],
+                    "pkts": self.flow["spkts"] + self.flow["dpkts"],
                     "label": msg["label"],
                     "module_labels": msg["module_labels"],
                 }
@@ -487,31 +590,23 @@ def main(self):
                 # Use labeled flows
                 labels = self.db.get_labels()
                 sum_labeled_flows = sum(i[1] for i in labels)
-
-                # The min labels to retrain is the min number of flows 
-                # we should have seen so far in this capture to start training
-                # This is so we dont _start_ training with only 1 flow
-
-                # Once we are over the start minimum, the second condition is 
-                # to force to retrain every a minimum_labels_to_retrain number
-                # of flows. So we dont retrain every 1 flow.
                 if (
-                    sum_labeled_flows >= self.minimum_labels_to_start_train
+                    sum_labeled_flows >= self.minimum_lables_to_retrain
+                    and sum_labeled_flows % self.minimum_lables_to_retrain == 1
                 ):
-                    if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain):
-                        # So for example we retrain every 50 labels and only when
-                        # we have at least 50 labels
-                        self.print(
-                            f"Training the model with the last group of "
-                            f"flows and labels. Total flows: {sum_labeled_flows}."
-                        )
-                        # Process all flows in the DB and make them ready
-                        # for pandas
-                        self.process_training_flows()
-                        # Train an algorithm
-                        self.train()
-                        self.last_number_of_flows_when_trained = sum_labeled_flows
-
+                    # We get here every 'self.minimum_lables_to_retrain'
+                    # amount of labels
+                    # So for example we retrain every 100 labels and only when
+                    # we have at least 100 labels
+                    self.print(
+                        f"Training the model with the last group of "
+                        f"flows and labels. Total flows: {sum_labeled_flows}."
+                    )
+                    # Process all flows in the DB and make them ready
+                    # for pandas
+                    self.process_flows()
+                    # Train an algorithm
+                    self.train()
             elif self.mode == "test":
                 # We are testing, which means using the model to detect
                 processed_flow = self.process_flow(self.flow)
@@ -531,8 +626,8 @@ def main(self):
                         # and the label is diff from the prediction,
                         # print in debug mode
                         self.print(
-                            f"Predicted {pred[0]} for ground-truth label"
-                            f' {label}. Flow {self.flow["saddr"]}:'
+                            f"Report Prediction {pred[0]} for label"
+                            f' {label} flow {self.flow["saddr"]}:'
                             f'{self.flow["sport"]} ->'
                             f' {self.flow["daddr"]}:'
                             f'{self.flow["dport"]}/'
@@ -540,9 +635,9 @@ def main(self):
                             0,
                             3,
                         )
-                    if pred[0] == "Malicious":
+                    if pred[0] == "Malware":
                         # Generate an alert
-                        self.set_evidence_malicious_flow(self.flow, self.twid)
+                        self.set_evidence_malicious_flow(self.flow, twid)
                         self.print(
                             f"Prediction {pred[0]} for label {label}"
                             f' flow {self.flow["saddr"]}:'

From 5df2e70c0ea96004493eca3423768d6ab4347cab Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:35:21 +0300
Subject: [PATCH 372/455] state_handler: split get_final_state_from_flags()
 into smaller functions

---
 slips_files/common/state_handler.py | 179 ++++++++++++++++++++++++++++
 1 file changed, 179 insertions(+)
 create mode 100644 slips_files/common/state_handler.py

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
new file mode 100644
index 0000000000..b671a09a28
--- /dev/null
+++ b/slips_files/common/state_handler.py
@@ -0,0 +1,179 @@
+from typing import Optional
+import sys
+import traceback
+
+
+def check_suricata_states(state) -> Optional[str]:
+    """
+    There are different states in which a flow can be.
+    Suricata distinguishes three flow-states for TCP and two for
+     UDP. For TCP,
+    these are: New, Established and Closed,for UDP only new and
+    established.
+    For each of these states Suricata can employ different timeouts.
+    """
+    if "new" in state or "established" in state:
+        return "Established"
+    elif "closed" in state:
+        return "Not Established"
+
+
+def check_zeek_states(state) -> Optional[str]:
+    # We have varius type of states depending on the type of flow.
+    # For Zeek
+    if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+        return "Not Established"
+    elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+        return "Established"
+
+
+def check_argus_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    suf = state.split("_")[1]
+    if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+        """
+        Examples:
+        SA_SA
+        SR_SA
+        FSRA_SA
+        SPA_SPA
+        SRA_SPA
+        FSA_FSA
+        FSA_FSPA
+        SAEC_SPA
+        SRPA_SPA
+        FSPA_SPA
+        FSRPA_SPA
+        FSPA_FSPA
+        FSRA_FSPA
+        SRAEC_SPA
+        FSPA_FSRPA
+        FSAEC_FSPA
+        FSRPA_FSPA
+        SRPAEC_SPA
+        FSPAEC_FSPA
+        SRPAEC_FSRPA
+        """
+        return "Established"
+    elif "PA" in pre and "PA" in suf:
+        # Tipical flow that was reported in the middle
+        """
+        Examples:
+        PA_PA
+        FPA_FPA
+        """
+        return "Established"
+    elif "ECO" in pre:
+        return "ICMP Echo"
+    elif "ECR" in pre:
+        return "ICMP Reply"
+    elif "URH" in pre:
+        return "ICMP Host Unreachable"
+    elif "URP" in pre:
+        return "ICMP Port Unreachable"
+    else:
+        """
+        Examples:
+        S_RA
+        S_R
+        A_R
+        S_SA
+        SR_SA
+        FA_FA
+        SR_RA
+        SEC_RA
+        """
+        return "Not Established"
+
+
+def check_tcp_states(state, pkts) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "EST" in pre:
+        # TCP
+        return "Established"
+    elif "RST" in pre:
+        # TCP. When -z B is not used in argus, states are single words.
+        # Most connections are reseted when finished and therefore are
+        # established
+        # It can happen that is reseted being not established, but we
+        # can't tell without -z b.
+        # So we use as heuristic the amount of packets. If <=3, then is
+        # not established because the OS retries 3 times.
+        return "Not Established" if int(pkts) <= 3 else "Established"
+    elif "FIN" in pre:
+        # TCP. When -z B is not used in argus, states are single words.
+        # Most connections are finished with FIN when finished and
+        # therefore are established
+        # It can happen that is finished being not established, but we
+        # can't tell without -z b.
+        # So we use as heuristic the amount of packets. If <=3, then is
+        # not established because the OS retries 3 times.
+        return "Not Established" if int(pkts) <= 3 else "Established"
+    else:
+        """
+        Examples:
+        S_
+        FA_
+        PA_
+        FSA_
+        SEC_
+        SRPA_
+        """
+        return "Not Established"
+
+
+def check_udp_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "CON" in pre:
+        # UDP
+        return "Established"
+    elif "INT" in pre:
+        # UDP trying to connect, NOT preciselly not established but also
+        # NOT 'Established'. So we considered not established because there
+        # is no confirmation of what happened.
+        return "Not Established"
+
+
+def check_icmp_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "ECO" in pre:
+        # ICMP
+        return "Established"
+    elif "UNK" in pre:
+        # ICMP6 unknown upper layer
+        return "Established"
+
+
+def get_final_state_from_flags(self, state, pkts) -> str:
+    """
+    Analyze the flags given and return a summary of the state.
+    Should work with Argus and Bro flags
+    We receive the pakets to distinguish some Reset connections
+    """
+    try:
+        if state := check_suricata_states(state):
+            return state
+        if state := check_zeek_states(state):
+            return state
+        if state := check_argus_states(state):
+            return state
+    except IndexError:
+        # suf does not exist, which means that this is some ICMP or
+        # no response was sent for UDP or TCP
+        if state := check_icmp_states(state):
+            return state
+        if state := check_udp_states(state):
+            return state
+        if state := check_tcp_states(state, pkts):
+            return state
+
+        return "Not Established"
+
+    except Exception:
+        exception_line = sys.exc_info()[2].tb_lineno
+        self.print(
+            f"Error in get_final_state_from_flags() " f"line {exception_line}",
+            0,
+            1,
+        )
+        self.print(traceback.format_exc(), 0, 1)

From 92316cf2520fa980dcc14d808a1393e7e0968eb5 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:48:16 +0300
Subject: [PATCH 373/455] state_handler: refactor get_final_state_from_flags()

---
 slips_files/common/state_handler.py | 67 +++++++++++++----------------
 1 file changed, 29 insertions(+), 38 deletions(-)

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
index b671a09a28..d0a05115bd 100644
--- a/slips_files/common/state_handler.py
+++ b/slips_files/common/state_handler.py
@@ -1,9 +1,7 @@
 from typing import Optional
-import sys
-import traceback
 
 
-def check_suricata_states(state) -> Optional[str]:
+def interpret_suricata_states(state) -> Optional[str]:
     """
     There are different states in which a flow can be.
     Suricata distinguishes three flow-states for TCP and two for
@@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_zeek_states(state) -> Optional[str]:
+def interpret_zeek_states(state) -> Optional[str]:
     # We have varius type of states depending on the type of flow.
     # For Zeek
     if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
@@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]:
         return "Established"
 
 
-def check_argus_states(state) -> Optional[str]:
+def interpret_argus_states(state) -> Optional[str]:
     pre = state.split("_")[0]
-    suf = state.split("_")[1]
+    try:
+        suf = state.split("_")[1]
+    except IndexError:
+        return
+
     if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
         """
         Examples:
@@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_tcp_states(state, pkts) -> Optional[str]:
+def interpret_tcp_states(state, pkts) -> Optional[str]:
     pre = state.split("_")[0]
     if "EST" in pre:
         # TCP
@@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]:
         return "Not Established"
 
 
-def check_udp_states(state) -> Optional[str]:
+def interpret_udp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "CON" in pre:
         # UDP
@@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_icmp_states(state) -> Optional[str]:
+def interpret_icmp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "ECO" in pre:
         # ICMP
@@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]:
         return "Established"
 
 
-def get_final_state_from_flags(self, state, pkts) -> str:
+def get_final_state_from_flags(state, pkts) -> str:
     """
-    Analyze the flags given and return a summary of the state.
-    Should work with Argus and Bro flags
-    We receive the pakets to distinguish some Reset connections
+    Converts the original flags from the flow, to a state that slips
+    understands
+    Works with Argus, suricata, and Bro flags
+    We receive the packets to distinguish some Reset connections
     """
-    try:
-        if state := check_suricata_states(state):
-            return state
-        if state := check_zeek_states(state):
-            return state
-        if state := check_argus_states(state):
-            return state
-    except IndexError:
-        # suf does not exist, which means that this is some ICMP or
-        # no response was sent for UDP or TCP
-        if state := check_icmp_states(state):
-            return state
-        if state := check_udp_states(state):
-            return state
-        if state := check_tcp_states(state, pkts):
-            return state
 
-        return "Not Established"
+    for interpreter in (
+        interpret_suricata_states,
+        interpret_zeek_states,
+        interpret_argus_states,
+        interpret_icmp_states,
+        interpret_udp_states,
+    ):
+        if interpreted_state := interpreter(state):
+            return interpreted_state
+
+    if interpreted_state := interpret_tcp_states(state, pkts):
+        return interpreted_state
 
-    except Exception:
-        exception_line = sys.exc_info()[2].tb_lineno
-        self.print(
-            f"Error in get_final_state_from_flags() " f"line {exception_line}",
-            0,
-            1,
-        )
-        self.print(traceback.format_exc(), 0, 1)
+    return "Not Established"

From eb778265b8d6f98c27489081a478a2b0ae744da0 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 374/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py    | 150 ++----------------
 slips_files/core/database/database_manager.py |   3 -
 2 files changed, 10 insertions(+), 143 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 1fa77de01c..0e7c4b78e5 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """
@@ -291,12 +156,17 @@ def process_features(self, dataset):
                 except (ValueError, KeyError):
                     pass
 
-            # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
+            # When flows are read from Slips sqlite,
+            # the state is not transformed to 'Established' or
+            # 'Not Established', it is still 'S0' and others
             # So transform here
-            #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
-            dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
-
-            #dataset.state = new_state_column
+            dataset["state"] = dataset.apply(
+                lambda row: get_final_state_from_flags(
+                    row["state"], row["pkts"]
+                ),
+                axis=1,
+            )
+            # dataset.state = new_state_column
 
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(
diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py
index 1d339685f8..568e78ff45 100644
--- a/slips_files/core/database/database_manager.py
+++ b/slips_files/core/database/database_manager.py
@@ -613,9 +613,6 @@ def add_out_dns(self, *args, **kwargs):
     def add_port(self, *args, **kwargs):
         return self.rdb.add_port(*args, **kwargs)
 
-    def get_final_state_from_flags(self, *args, **kwargs):
-        return self.rdb.get_final_state_from_flags(*args, **kwargs)
-
     def add_ips(self, *args, **kwargs):
         return self.rdb.add_ips(*args, **kwargs)
 

From 28d2199e094edbaab33620c6cd8c56252d67c0be Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 375/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 0e7c4b78e5..19e829e11b 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,6 +121,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From cbe80f8e80d05d147a1e54544f01ee4b2ab18cab Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 376/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 135 ---------------------
 1 file changed, 135 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 19e829e11b..0e7c4b78e5 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From aa68a909bb8309e70b15ca70958076a368dbe0c7 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 377/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 0e7c4b78e5..19e829e11b 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,6 +121,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From aee1e13912d8bf414b5f924e6684187b7c114a68 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 378/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 135 ---------------------
 1 file changed, 135 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 19e829e11b..0e7c4b78e5 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From fc14125fe16615de2e29e40fc98e215bd4648bbd Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 379/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 0e7c4b78e5..19e829e11b 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,6 +121,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 9c95c76b54f429f9eaf2c8035d60b98f5bf8dffe Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 380/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 135 ---------------------
 1 file changed, 135 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 19e829e11b..0e7c4b78e5 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 1b20f2ab937725762ca307dee70a3cb517d8d579 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 381/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 165 +++++++++++++++++++--
 1 file changed, 150 insertions(+), 15 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 0e7c4b78e5..c8226368c7 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -8,6 +8,7 @@
 import pickle
 import pandas as pd
 import json
+import datetime
 import traceback
 import warnings
 import sys
@@ -121,6 +122,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """
@@ -133,7 +269,7 @@ def process_features(self, dataset):
             for proto in to_discard:
                 dataset = dataset[dataset.proto != proto]
 
-            # For now, discard the ports
+            # For now, discard these
             to_drop = [
                 "appproto",
                 "daddr",
@@ -156,17 +292,12 @@ def process_features(self, dataset):
                 except (ValueError, KeyError):
                     pass
 
-            # When flows are read from Slips sqlite,
-            # the state is not transformed to 'Established' or
-            # 'Not Established', it is still 'S0' and others
+            # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
             # So transform here
-            dataset["state"] = dataset.apply(
-                lambda row: get_final_state_from_flags(
-                    row["state"], row["pkts"]
-                ),
-                axis=1,
-            )
-            # dataset.state = new_state_column
+            #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
+            dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
+
+            #dataset.state = new_state_column
 
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(
@@ -393,21 +524,25 @@ def read_model(self):
     def set_evidence_malicious_flow(self, flow: dict, twid: str):
         confidence: float = 0.1
         description = (
-            f"Flow with malicious characteristics by ML. Src IP"
+            f"Malicious flow by ML. Src IP"
             f" {flow['saddr']}:{flow['sport']} to "
             f"{flow['daddr']}:{flow['dport']}"
         )
+
+        timestamp = utils.convert_format(
+            datetime.datetime.now(), utils.alerts_format
+        )
         twid_number = int(twid.replace("timewindow", ""))
         evidence: Evidence = Evidence(
             evidence_type=EvidenceType.MALICIOUS_FLOW,
             attacker=Attacker(
                 direction=Direction.SRC,
-                ioc_type=IoCType.IP,
+                attacker_type=IoCType.IP,
                 value=flow["saddr"],
             ),
             victim=Victim(
                 direction=Direction.DST,
-                ioc_type=IoCType.IP,
+                victim_type=IoCType.IP,
                 value=flow["daddr"],
             ),
             threat_level=ThreatLevel.LOW,
@@ -416,7 +551,7 @@ def set_evidence_malicious_flow(self, flow: dict, twid: str):
             profile=ProfileID(ip=flow["saddr"]),
             timewindow=TimeWindow(twid_number),
             uid=[flow["uid"]],
-            timestamp=flow["starttime"],
+            timestamp=timestamp,
             method=Method.AI,
             src_port=flow["sport"],
             dst_port=flow["dport"],

From 2b9ed84a6a2bdbe9a2ec8a109da92df4d627b994 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:35:21 +0300
Subject: [PATCH 382/455] state_handler: split get_final_state_from_flags()
 into smaller functions

---
 slips_files/common/state_handler.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
index d0a05115bd..43d9b5461e 100644
--- a/slips_files/common/state_handler.py
+++ b/slips_files/common/state_handler.py
@@ -1,4 +1,6 @@
 from typing import Optional
+import sys
+import traceback
 
 
 def interpret_suricata_states(state) -> Optional[str]:

From 736cf0b76411e510c34b586f644895cbf9250e75 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:48:16 +0300
Subject: [PATCH 383/455] state_handler: refactor get_final_state_from_flags()

---
 slips_files/common/state_handler.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
index 43d9b5461e..d0a05115bd 100644
--- a/slips_files/common/state_handler.py
+++ b/slips_files/common/state_handler.py
@@ -1,6 +1,4 @@
 from typing import Optional
-import sys
-import traceback
 
 
 def interpret_suricata_states(state) -> Optional[str]:

From 2b576c42258e49f2bdcc008964e04e35b7aeb972 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 384/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 150 ++-------------------
 1 file changed, 10 insertions(+), 140 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index c8226368c7..9af514a709 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -122,141 +122,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """
@@ -292,12 +157,17 @@ def process_features(self, dataset):
                 except (ValueError, KeyError):
                     pass
 
-            # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
+            # When flows are read from Slips sqlite,
+            # the state is not transformed to 'Established' or
+            # 'Not Established', it is still 'S0' and others
             # So transform here
-            #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
-            dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
-
-            #dataset.state = new_state_column
+            dataset["state"] = dataset.apply(
+                lambda row: get_final_state_from_flags(
+                    row["state"], row["pkts"]
+                ),
+                axis=1,
+            )
+            # dataset.state = new_state_column
 
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(

From 47d05a060ed6f78fb47892d9756998e775e05b94 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 385/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 9af514a709..94eb27afdf 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -122,6 +122,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From e197df04e3e44f4318289706ede7a3483ec7feb2 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 386/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 135 ---------------------
 1 file changed, 135 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 94eb27afdf..9af514a709 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -122,141 +122,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From d95f4c938e6fdf0ca5bf7ccd607cfb71e2a34c34 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 387/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 150 +++++++++++++++++++--
 1 file changed, 140 insertions(+), 10 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 9af514a709..c8226368c7 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -122,6 +122,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """
@@ -157,17 +292,12 @@ def process_features(self, dataset):
                 except (ValueError, KeyError):
                     pass
 
-            # When flows are read from Slips sqlite,
-            # the state is not transformed to 'Established' or
-            # 'Not Established', it is still 'S0' and others
+            # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
             # So transform here
-            dataset["state"] = dataset.apply(
-                lambda row: get_final_state_from_flags(
-                    row["state"], row["pkts"]
-                ),
-                axis=1,
-            )
-            # dataset.state = new_state_column
+            #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
+            dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
+
+            #dataset.state = new_state_column
 
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(

From c9d2395cd1bfd3f19b1ec80bbde1a6b322e866f5 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 388/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 150 ++-------------------
 1 file changed, 10 insertions(+), 140 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index c8226368c7..9af514a709 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -122,141 +122,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """
@@ -292,12 +157,17 @@ def process_features(self, dataset):
                 except (ValueError, KeyError):
                     pass
 
-            # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
+            # When flows are read from Slips sqlite,
+            # the state is not transformed to 'Established' or
+            # 'Not Established', it is still 'S0' and others
             # So transform here
-            #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
-            dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
-
-            #dataset.state = new_state_column
+            dataset["state"] = dataset.apply(
+                lambda row: get_final_state_from_flags(
+                    row["state"], row["pkts"]
+                ),
+                axis=1,
+            )
+            # dataset.state = new_state_column
 
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(

From f6de6fe7db854dcd9ee932e602b7d15af93f80cd Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 389/455] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 438 +++++++++++++--------
 1 file changed, 278 insertions(+), 160 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 9af514a709..124ec61f91 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -1,8 +1,3 @@
-# SPDX-FileCopyrightText: 2021 Sebastian Garcia <sebastian.garcia@agents.fel.cvut.cz>
-from typing import Optional
-
-# SPDX-License-Identifier: GPL-2.0-only
-import numpy
 from sklearn.linear_model import SGDClassifier
 from sklearn.preprocessing import StandardScaler
 import pickle
@@ -10,13 +5,10 @@
 import json
 import datetime
 import traceback
-import warnings
 import sys
 
-from slips_files.common.parsers.config_parser import ConfigParser
-from slips_files.common.slips_utils import utils
-from slips_files.common.abstracts.module import IModule
-from slips_files.core.structures.evidence import (
+from slips_files.common.imports import *
+from slips_files.core.evidence_structure.evidence import (
     Evidence,
     ProfileID,
     TimeWindow,
@@ -25,8 +17,7 @@
     EvidenceType,
     IoCType,
     Direction,
-    Victim,
-    Method,
+    IDEACategory,
 )
 
 # Only for debbuging
@@ -38,6 +29,8 @@ def warn(*args, **kwargs):
     pass
 
 
+import warnings
+
 warnings.warn = warn
 
 
@@ -63,8 +56,6 @@ def init(self):
         # self.scores = []
         # The scaler trained during training and to use during testing
         self.scaler = StandardScaler()
-        self.model_path = "./modules/flowmldetection/model.bin"
-        self.scaler_path = "./modules/flowmldetection/scaler.bin"
 
     def read_configuration(self):
         conf = ConfigParser()
@@ -122,6 +113,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+    
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """
@@ -130,7 +256,7 @@ def process_features(self, dataset):
         """
         try:
             # Discard some type of flows that dont have ports
-            to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""]
+            to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp"]
             for proto in to_discard:
                 dataset = dataset[dataset.proto != proto]
 
@@ -139,35 +265,28 @@ def process_features(self, dataset):
                 "appproto",
                 "daddr",
                 "saddr",
-                "starttime",
+                "ts",
+                "origstate",
                 "type_",
-                "smac",
-                "dmac",
-                "history",
-                "uid",
                 "dir_",
+                "history",
                 "dbytes",
-                "endtime",
-                "bytes",
-                "flow_source",
+                "dpkts",
+                "smac",
+                "dmac",
             ]
             for field in to_drop:
                 try:
                     dataset = dataset.drop(field, axis=1)
-                except (ValueError, KeyError):
+                except ValueError:
                     pass
 
-            # When flows are read from Slips sqlite,
-            # the state is not transformed to 'Established' or
-            # 'Not Established', it is still 'S0' and others
+            # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
             # So transform here
-            dataset["state"] = dataset.apply(
-                lambda row: get_final_state_from_flags(
-                    row["state"], row["pkts"]
-                ),
-                axis=1,
-            )
-            # dataset.state = new_state_column
+            #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
+            dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
+
+            #dataset.state = new_state_column
 
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(
@@ -201,23 +320,42 @@ def process_features(self, dataset):
             dataset.proto = dataset.proto.str.replace(
                 r"(^.*arp.*$)", "4", regex=True
             )
-            fields_to_convert_to_flow = [
-                dataset.proto,
-                dataset.dport,
-                dataset.sport,
-                dataset.dur,
-                dataset.pkts,
-                dataset.spkts,
-                dataset.allbytes,
-                dataset.sbytes,
-                dataset.state,
-            ]
-            for field in fields_to_convert_to_flow:
-                try:
-                    field = field.astype("float64")
-                except ValueError:
-                    pass
-
+            dataset.proto = dataset.proto.astype("float64")
+            try:
+                # Convert dport to float
+                dataset.dport = dataset.dport.astype("float")
+            except ValueError:
+                pass
+            try:
+                # Convert sport to float
+                dataset.sport = dataset.sport.astype("float")
+            except ValueError:
+                pass
+            try:
+                # Convert Dur to float
+                dataset.dur = dataset.dur.astype("float")
+            except ValueError:
+                pass
+            try:
+                # Convert TotPkts to float
+                dataset.pkts = dataset.pkts.astype("float")
+            except ValueError:
+                pass
+            try:
+                # Convert SrcPkts to float
+                dataset.spkts = dataset.spkts.astype("float")
+            except ValueError:
+                pass
+            try:
+                # Convert TotBytes to float
+                dataset.allbytes = dataset.allbytes.astype("float")
+            except ValueError:
+                pass
+            try:
+                # Convert SrcBytes to float
+                dataset.sbytes = dataset.sbytes.astype("float")
+            except ValueError:
+                pass
             return dataset
         except Exception:
             # Stop the timer
@@ -233,6 +371,7 @@ def process_flows(self):
             # We get all the flows so far
             # because this retraining happens in batches
             flows = self.db.get_all_flows()
+
             # Check how many different labels are in the DB
             # We need both normal and malware
             labels = self.db.get_labels()
@@ -252,7 +391,9 @@ def process_flows(self):
                         "daddr": "40.70.224.145",
                         "dport": "443",
                         "proto": "tcp",
+                        "origstate": "SRPA_SPA",
                         "state": "Established",
+                        "pkts": 84,
                         "allbytes": 42764,
                         "spkts": 37,
                         "sbytes": 25517,
@@ -272,7 +413,9 @@ def process_flows(self):
                         "daddr": "80.242.138.72",
                         "dport": "80",
                         "proto": "tcp",
+                        "origstate": "SRPA_SPA",
                         "state": "Established",
+                        "pkts": 67,
                         "allbytes": 67696,
                         "spkts": 1,
                         "sbytes": 100,
@@ -298,55 +441,42 @@ def process_flows(self):
             self.print("Error in process_flows()")
             self.print(traceback.format_exc(), 0, 1)
 
-    def process_flow(self, flow_to_process: dict):
+    def process_flow(self):
         """
         Process one flow. Only used during detection in testing
-        returns the pandas df with the processed flow
+        Store the pandas df in self.flow
         """
         try:
             # Convert the flow to a pandas dataframe
-            raw_flow = pd.DataFrame(flow_to_process, index=[0])
+            raw_flow = pd.DataFrame(self.flow_dict, index=[0])
+            # Process features
             dflow = self.process_features(raw_flow)
             # Update the flow to the processed version
-            return dflow
+            self.flow = dflow
         except Exception:
             # Stop the timer
             self.print("Error in process_flow()")
             self.print(traceback.format_exc(), 0, 1)
 
-    def detect(self, x_flow) -> Optional[numpy.ndarray]:
+    def detect(self):
         """
-        Detects the given flow with the current model stored
-        and returns the predection array
+        Detect this flow with the current model stored
         """
         try:
-            given_x_flow = x_flow
-            # clean the flow
-            fields_to_drop = [
-                "label",
-                "module_labels",
-                "uid",
-                "history",
-                "dir_",
-                "dbytes",
-                "dpkts",
-                "endtime",
-                "bytes",
-                "flow_source",
-            ]
-            for field in fields_to_drop:
-                try:
-                    x_flow = x_flow.drop(field, axis=1)
-                except (KeyError, ValueError):
-                    pass
+            # Store the real label if there is one
+            y_flow = self.flow["label"]
+            # remove the real label column
+            self.flow = self.flow.drop("label", axis=1)
+            # remove the label predictions column of the other modules
+            X_flow = self.flow.drop("module_labels", axis=1)
             # Scale the flow
-            x_flow: numpy.ndarray = self.scaler.transform(x_flow)
-            pred: numpy.ndarray = self.clf.predict(x_flow)
+            X_flow = self.scaler.transform(X_flow)
+            pred = self.clf.predict(X_flow)
             return pred
-        except Exception as e:
-            self.print(
-                f"Error in detect() while processing " f"\n{given_x_flow}\n{e}"
-            )
+        except Exception:
+            # Stop the timer
+            self.print("Error in detect() X_flow:")
+            self.print(X_flow)
             self.print(traceback.format_exc(), 0, 1)
 
     def store_model(self):
@@ -354,10 +484,10 @@ def store_model(self):
         Store the trained model on disk
         """
         self.print("Storing the trained model and scaler on disk.", 0, 2)
-        with open(self.model_path, "wb") as f:
+        with open("./modules/flowmldetection/model.bin", "wb") as f:
             data = pickle.dumps(self.clf)
             f.write(data)
-        with open(self.scaler_path, "wb") as g:
+        with open("./modules/flowmldetection/scaler.bin", "wb") as g:
             data = pickle.dumps(self.scaler)
             g.write(data)
 
@@ -367,23 +497,20 @@ def read_model(self):
         """
         try:
             self.print("Reading the trained model from disk.", 0, 2)
-            with open(self.model_path, "rb") as f:
+            with open("./modules/flowmldetection/model.bin", "rb") as f:
                 self.clf = pickle.load(f)
             self.print("Reading the trained scaler from disk.", 0, 2)
-            with open(self.scaler_path, "rb") as g:
+            with open("./modules/flowmldetection/scaler.bin", "rb") as g:
                 self.scaler = pickle.load(g)
         except FileNotFoundError:
             # If there is no model, create one empty
-            self.print(
-                "There was no model. " "Creating a new empty model.", 0, 2
-            )
+            self.print("There was no model. Creating a new empty model.", 0, 2)
             self.clf = SGDClassifier(
                 warm_start=True, loss="hinge", penalty="l1"
             )
         except EOFError:
             self.print(
-                "Error reading model from disk. "
-                "Creating a new empty model.",
+                "Error reading model from disk. Creating a new empty model.",
                 0,
                 2,
             )
@@ -391,40 +518,39 @@ def read_model(self):
                 warm_start=True, loss="hinge", penalty="l1"
             )
 
-    def set_evidence_malicious_flow(self, flow: dict, twid: str):
+    def set_evidence_malicious_flow(
+        self,
+        saddr: str,
+        sport: str,
+        daddr: str,
+        dport: str,
+        twid: str,
+        uid: str,
+    ):
         confidence: float = 0.1
+        ip_identification = self.db.get_ip_identification(daddr)
         description = (
-            f"Malicious flow by ML. Src IP"
-            f" {flow['saddr']}:{flow['sport']} to "
-            f"{flow['daddr']}:{flow['dport']}"
+            f"Malicious flow by ML. Src IP {saddr}:{sport} to "
+            f"{daddr}:{dport} {ip_identification}"
         )
 
         timestamp = utils.convert_format(
             datetime.datetime.now(), utils.alerts_format
         )
-        twid_number = int(twid.replace("timewindow", ""))
+
         evidence: Evidence = Evidence(
             evidence_type=EvidenceType.MALICIOUS_FLOW,
             attacker=Attacker(
-                direction=Direction.SRC,
-                attacker_type=IoCType.IP,
-                value=flow["saddr"],
-            ),
-            victim=Victim(
-                direction=Direction.DST,
-                victim_type=IoCType.IP,
-                value=flow["daddr"],
+                direction=Direction.SRC, attacker_type=IoCType.IP, value=saddr
             ),
             threat_level=ThreatLevel.LOW,
             confidence=confidence,
             description=description,
-            profile=ProfileID(ip=flow["saddr"]),
-            timewindow=TimeWindow(twid_number),
-            uid=[flow["uid"]],
+            profile=ProfileID(ip=saddr),
+            timewindow=TimeWindow(number=int(twid.replace("timewindow", ""))),
+            uid=[uid],
             timestamp=timestamp,
-            method=Method.AI,
-            src_port=flow["sport"],
-            dst_port=flow["dport"],
+            category=IDEACategory.ANOMALY_TRAFFIC,
         )
 
         self.db.set_evidence(evidence)
@@ -441,22 +567,20 @@ def pre_main(self):
 
     def main(self):
         if msg := self.get_msg("new_flow"):
-            msg = json.loads(msg["data"])
-            twid = msg["twid"]
-            self.flow = msg["flow"]
-            # these fields are expected in testing. update the original
-            # flow dict to have them
-            self.flow.update(
-                {
-                    "allbytes": (self.flow["sbytes"] + self.flow["dbytes"]),
-                    # the flow["state"] is the origstate, we dont need that here
-                    # we need the interpreted state
-                    "state": msg["interpreted_state"],
-                    "pkts": self.flow["spkts"] + self.flow["dpkts"],
-                    "label": msg["label"],
-                    "module_labels": msg["module_labels"],
-                }
-            )
+            data = msg["data"]
+            # Convert from json to dict
+            data = json.loads(data)
+            profileid = data["profileid"]
+            twid = data["twid"]
+            # Get flow that is now in json format
+            flow = data["flow"]
+            # Convert flow to a dict
+            flow = json.loads(flow)
+            # Convert the common fields to something that can
+            # be interpreted
+            # Get the uid which is the key
+            uid = next(iter(flow))
+            self.flow_dict = json.loads(flow[uid])
 
             if self.mode == "train":
                 # We are training
@@ -469,57 +593,51 @@ def main(self):
                     sum_labeled_flows >= self.minimum_lables_to_retrain
                     and sum_labeled_flows % self.minimum_lables_to_retrain == 1
                 ):
-                    # We get here every 'self.minimum_lables_to_retrain'
-                    # amount of labels
-                    # So for example we retrain every 100 labels and only when
-                    # we have at least 100 labels
+                    # We get here every 'self.minimum_lables_to_retrain' amount of labels
+                    # So for example we retrain every 100 labels and only when we have at least 100 labels
                     self.print(
-                        f"Training the model with the last group of "
-                        f"flows and labels. Total flows: {sum_labeled_flows}."
+                        f"Training the model with the last group of flows and labels. Total flows: {sum_labeled_flows}."
                     )
-                    # Process all flows in the DB and make them ready
-                    # for pandas
+                    # Process all flows in the DB and make them ready for pandas
                     self.process_flows()
                     # Train an algorithm
                     self.train()
             elif self.mode == "test":
                 # We are testing, which means using the model to detect
-                processed_flow = self.process_flow(self.flow)
+                self.process_flow()
 
-                # After processing the flow, it may happen that we
-                # delete icmp/arp/etc so the dataframe can be empty
-                if processed_flow is not None and not processed_flow.empty:
+                # After processing the flow, it may happen that we delete icmp/arp/etc
+                # so the dataframe can be empty
+                if self.flow is not None and not self.flow.empty:
                     # Predict
-                    pred: numpy.ndarray = self.detect(processed_flow)
-                    if not pred:
-                        # an error occurred
-                        return
+                    pred = self.detect()
+                    label = self.flow_dict["label"]
 
-                    label = self.flow["label"]
+                    # Report
                     if label and label != "unknown" and label != pred[0]:
-                        # If the user specified a label in test mode,
-                        # and the label is diff from the prediction,
-                        # print in debug mode
+                        # If the user specified a label in test mode, and the label
+                        # is diff from the prediction, print in debug mode
                         self.print(
-                            f"Report Prediction {pred[0]} for label"
-                            f' {label} flow {self.flow["saddr"]}:'
-                            f'{self.flow["sport"]} ->'
-                            f' {self.flow["daddr"]}:'
-                            f'{self.flow["dport"]}/'
-                            f'{self.flow["proto"]}',
+                            f'Report Prediction {pred[0]} for label {label} flow {self.flow_dict["saddr"]}:'
+                            f'{self.flow_dict["sport"]} -> {self.flow_dict["daddr"]}:'
+                            f'{self.flow_dict["dport"]}/{self.flow_dict["proto"]}',
                             0,
                             3,
                         )
                     if pred[0] == "Malware":
                         # Generate an alert
-                        self.set_evidence_malicious_flow(self.flow, twid)
+                        self.set_evidence_malicious_flow(
+                            self.flow_dict["saddr"],
+                            self.flow_dict["sport"],
+                            self.flow_dict["daddr"],
+                            self.flow_dict["dport"],
+                            twid,
+                            uid,
+                        )
                         self.print(
-                            f"Prediction {pred[0]} for label {label}"
-                            f' flow {self.flow["saddr"]}:'
-                            f'{self.flow["sport"]} -> '
-                            f'{self.flow["daddr"]}:'
-                            f'{self.flow["dport"]}/'
-                            f'{self.flow["proto"]}',
+                            f'Prediction {pred[0]} for label {label} flow {self.flow_dict["saddr"]}:'
+                            f'{self.flow_dict["sport"]} -> {self.flow_dict["daddr"]}:'
+                            f'{self.flow_dict["dport"]}/{self.flow_dict["proto"]}',
                             0,
                             2,
                         )

From 1b46d82aa527373f28ad89932d12fbf7775a8561 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 390/455] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 169 +++------------------
 1 file changed, 19 insertions(+), 150 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 124ec61f91..c57a7a3581 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -5,9 +5,13 @@
 import json
 import datetime
 import traceback
-import sys
+import warnings
+
 
-from slips_files.common.imports import *
+from slips_files.common.state_handler import get_final_state_from_flags
+from slips_files.common.parsers.config_parser import ConfigParser
+from slips_files.common.slips_utils import utils
+from slips_files.common.abstracts.module import IModule
 from slips_files.core.evidence_structure.evidence import (
     Evidence,
     ProfileID,
@@ -29,8 +33,6 @@ def warn(*args, **kwargs):
     pass
 
 
-import warnings
-
 warnings.warn = warn
 
 
@@ -113,141 +115,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-    
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the pakets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed,for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have varius type of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Tipical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """
@@ -281,12 +148,17 @@ def process_features(self, dataset):
                 except ValueError:
                     pass
 
-            # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
+            # When flows are read from Slips sqlite,
+            # the state is not transformed to 'Established' or
+            # 'Not Established', it is still 'S0' and others
             # So transform here
-            #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
-            dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
-
-            #dataset.state = new_state_column
+            dataset["state"] = dataset.apply(
+                lambda row: get_final_state_from_flags(
+                    row["state"], row["pkts"]
+                ),
+                axis=1,
+            )
+            # dataset.state = new_state_column
 
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(
@@ -370,7 +242,7 @@ def process_flows(self):
         try:
             # We get all the flows so far
             # because this retraining happens in batches
-            flows = self.db.get_all_flows()
+            flows: list = self.db.get_all_flows()
 
             # Check how many different labels are in the DB
             # We need both normal and malware
@@ -464,7 +336,7 @@ def detect(self):
         """
         try:
             # Store the real label if there is one
-            y_flow = self.flow["label"]
+            # y_flow = self.flow["label"]
             # remove the real label column
             self.flow = self.flow.drop("label", axis=1)
             # remove the label predictions column of the other modules
@@ -568,13 +440,10 @@ def pre_main(self):
     def main(self):
         if msg := self.get_msg("new_flow"):
             data = msg["data"]
-            # Convert from json to dict
             data = json.loads(data)
-            profileid = data["profileid"]
+            # profileid = data["profileid"]
             twid = data["twid"]
-            # Get flow that is now in json format
             flow = data["flow"]
-            # Convert flow to a dict
             flow = json.loads(flow)
             # Convert the common fields to something that can
             # be interpreted

From 299d2ab8fd04e70a3a7b4f9bc287a3a642faf542 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Mon, 29 Jul 2024 16:36:55 +0200
Subject: [PATCH 391/455] mlflow. Ignore UID column

---
 modules/flowmldetection/flowmldetection.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index c57a7a3581..e2aa1e0ee3 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -141,6 +141,7 @@ def process_features(self, dataset):
                 "dpkts",
                 "smac",
                 "dmac",
+                "uid",
             ]
             for field in to_drop:
                 try:

From 06bbbcfd5bdbefc4da9940c62949a5178fe58209 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Sat, 15 Mar 2025 19:23:29 +0100
Subject: [PATCH 392/455] Re-add get_final_state_from_flags() that alya added

---
 slips_files/core/database/database_manager.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py
index 568e78ff45..1d339685f8 100644
--- a/slips_files/core/database/database_manager.py
+++ b/slips_files/core/database/database_manager.py
@@ -613,6 +613,9 @@ def add_out_dns(self, *args, **kwargs):
     def add_port(self, *args, **kwargs):
         return self.rdb.add_port(*args, **kwargs)
 
+    def get_final_state_from_flags(self, *args, **kwargs):
+        return self.rdb.get_final_state_from_flags(*args, **kwargs)
+
     def add_ips(self, *args, **kwargs):
         return self.rdb.add_ips(*args, **kwargs)
 

From 98e29a6c43277e0577924a1d8c130f300c3cdca2 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Sat, 15 Mar 2025 19:27:23 +0100
Subject: [PATCH 393/455] Delete state_handler.py, which was deleted from develop

---
 slips_files/common/state_handler.py | 170 ----------------------------
 1 file changed, 170 deletions(-)
 delete mode 100644 slips_files/common/state_handler.py

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
deleted file mode 100644
index d0a05115bd..0000000000
--- a/slips_files/common/state_handler.py
+++ /dev/null
@@ -1,170 +0,0 @@
-from typing import Optional
-
-
-def interpret_suricata_states(state) -> Optional[str]:
-    """
-    There are different states in which a flow can be.
-    Suricata distinguishes three flow-states for TCP and two for
-     UDP. For TCP,
-    these are: New, Established and Closed,for UDP only new and
-    established.
-    For each of these states Suricata can employ different timeouts.
-    """
-    if "new" in state or "established" in state:
-        return "Established"
-    elif "closed" in state:
-        return "Not Established"
-
-
-def interpret_zeek_states(state) -> Optional[str]:
-    # We have varius type of states depending on the type of flow.
-    # For Zeek
-    if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-        return "Not Established"
-    elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-        return "Established"
-
-
-def interpret_argus_states(state) -> Optional[str]:
-    pre = state.split("_")[0]
-    try:
-        suf = state.split("_")[1]
-    except IndexError:
-        return
-
-    if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-        """
-        Examples:
-        SA_SA
-        SR_SA
-        FSRA_SA
-        SPA_SPA
-        SRA_SPA
-        FSA_FSA
-        FSA_FSPA
-        SAEC_SPA
-        SRPA_SPA
-        FSPA_SPA
-        FSRPA_SPA
-        FSPA_FSPA
-        FSRA_FSPA
-        SRAEC_SPA
-        FSPA_FSRPA
-        FSAEC_FSPA
-        FSRPA_FSPA
-        SRPAEC_SPA
-        FSPAEC_FSPA
-        SRPAEC_FSRPA
-        """
-        return "Established"
-    elif "PA" in pre and "PA" in suf:
-        # Tipical flow that was reported in the middle
-        """
-        Examples:
-        PA_PA
-        FPA_FPA
-        """
-        return "Established"
-    elif "ECO" in pre:
-        return "ICMP Echo"
-    elif "ECR" in pre:
-        return "ICMP Reply"
-    elif "URH" in pre:
-        return "ICMP Host Unreachable"
-    elif "URP" in pre:
-        return "ICMP Port Unreachable"
-    else:
-        """
-        Examples:
-        S_RA
-        S_R
-        A_R
-        S_SA
-        SR_SA
-        FA_FA
-        SR_RA
-        SEC_RA
-        """
-        return "Not Established"
-
-
-def interpret_tcp_states(state, pkts) -> Optional[str]:
-    pre = state.split("_")[0]
-    if "EST" in pre:
-        # TCP
-        return "Established"
-    elif "RST" in pre:
-        # TCP. When -z B is not used in argus, states are single words.
-        # Most connections are reseted when finished and therefore are
-        # established
-        # It can happen that is reseted being not established, but we
-        # can't tell without -z b.
-        # So we use as heuristic the amount of packets. If <=3, then is
-        # not established because the OS retries 3 times.
-        return "Not Established" if int(pkts) <= 3 else "Established"
-    elif "FIN" in pre:
-        # TCP. When -z B is not used in argus, states are single words.
-        # Most connections are finished with FIN when finished and
-        # therefore are established
-        # It can happen that is finished being not established, but we
-        # can't tell without -z b.
-        # So we use as heuristic the amount of packets. If <=3, then is
-        # not established because the OS retries 3 times.
-        return "Not Established" if int(pkts) <= 3 else "Established"
-    else:
-        """
-        Examples:
-        S_
-        FA_
-        PA_
-        FSA_
-        SEC_
-        SRPA_
-        """
-        return "Not Established"
-
-
-def interpret_udp_states(state) -> Optional[str]:
-    pre = state.split("_")[0]
-    if "CON" in pre:
-        # UDP
-        return "Established"
-    elif "INT" in pre:
-        # UDP trying to connect, NOT preciselly not established but also
-        # NOT 'Established'. So we considered not established because there
-        # is no confirmation of what happened.
-        return "Not Established"
-
-
-def interpret_icmp_states(state) -> Optional[str]:
-    pre = state.split("_")[0]
-    if "ECO" in pre:
-        # ICMP
-        return "Established"
-    elif "UNK" in pre:
-        # ICMP6 unknown upper layer
-        return "Established"
-
-
-def get_final_state_from_flags(state, pkts) -> str:
-    """
-    Converts the original flags from the flow, to a state that slips
-    understands
-    Works with Argus, suricata, and Bro flags
-    We receive the packets to distinguish some Reset connections
-    """
-
-    for interpreter in (
-        interpret_suricata_states,
-        interpret_zeek_states,
-        interpret_argus_states,
-        interpret_icmp_states,
-        interpret_udp_states,
-    ):
-        if interpreted_state := interpreter(state):
-            return interpreted_state
-
-    if interpreted_state := interpret_tcp_states(state, pkts):
-        return interpreted_state
-
-    return "Not Established"

From 045947ffdfb935b57f705baba86df81216eef573 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Sat, 15 Mar 2025 19:32:01 +0100
Subject: [PATCH 394/455] Flowmldetection. Fix missing db reference

---
 modules/flowmldetection/flowmldetection.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index e2aa1e0ee3..9269b67012 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -154,7 +154,7 @@ def process_features(self, dataset):
             # 'Not Established', it is still 'S0' and others
             # So transform here
             dataset["state"] = dataset.apply(
-                lambda row: get_final_state_from_flags(
+                lambda row: self.db.get_final_state_from_flags(
                     row["state"], row["pkts"]
                 ),
                 axis=1,

From e793c517a247a98ea25d278c35f38c9e16c8772d Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Tue, 18 Mar 2025 12:08:08 +0100
Subject: [PATCH 395/455] Fix the training of flows with ML in new version

---
 modules/flowmldetection/flowmldetection.py | 378 +++++++++++----------
 1 file changed, 197 insertions(+), 181 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 9269b67012..1cfbaf925d 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -1,18 +1,20 @@
+# SPDX-FileCopyrightText: 2021 Sebastian Garcia <sebastian.garcia@agents.fel.cvut.cz>
+from typing import Optional
+
+# SPDX-License-Identifier: GPL-2.0-only
+import numpy
 from sklearn.linear_model import SGDClassifier
 from sklearn.preprocessing import StandardScaler
 import pickle
 import pandas as pd
 import json
-import datetime
 import traceback
 import warnings
 
-
-from slips_files.common.state_handler import get_final_state_from_flags
 from slips_files.common.parsers.config_parser import ConfigParser
 from slips_files.common.slips_utils import utils
 from slips_files.common.abstracts.module import IModule
-from slips_files.core.evidence_structure.evidence import (
+from slips_files.core.structures.evidence import (
     Evidence,
     ProfileID,
     TimeWindow,
@@ -21,7 +23,8 @@
     EvidenceType,
     IoCType,
     Direction,
-    IDEACategory,
+    Victim,
+    Method,
 )
 
 # Only for debbuging
@@ -52,36 +55,41 @@ def init(self):
         # Set the output queue of our database instance
         # Read the configuration
         self.read_configuration()
-        # Minum amount of new lables needed to trigger the train
-        self.minimum_lables_to_retrain = 50
+        # Minum amount of new labels needed to start the train
+        self.minimum_labels_to_start_train = 50
+        # Minum amount of new labels needed to retrain
+        self.minimum_labels_to_retrain = 50
+        # The number of flows when last trained
+        self.last_number_of_flows_when_trained = 0
         # To plot the scores of training
         # self.scores = []
         # The scaler trained during training and to use during testing
         self.scaler = StandardScaler()
+        self.model_path = "./modules/flowmldetection/model.bin"
+        self.scaler_path = "./modules/flowmldetection/scaler.bin"
 
     def read_configuration(self):
         conf = ConfigParser()
         self.mode = conf.get_ml_mode()
+        self.label = conf.label()
 
     def train(self):
         """
         Train a model based on the flows we receive and the labels
         """
         try:
-            # Process the labels to have only Normal and Malware
-            self.flows.label = self.flows.label.str.replace(
-                r"(^.*ormal.*$)", "Normal", regex=True
-            )
-            self.flows.label = self.flows.label.str.replace(
-                r"(^.*alware.*$)", "Malware", regex=True
-            )
-            self.flows.label = self.flows.label.str.replace(
-                r"(^.*alicious.*$)", "Malware", regex=True
-            )
+            # Get the flows from the DB
+            # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid)
+            # Convert to pandas df
+            # self.flows = pd.DataFrame(self.flows)
+            # Process the features
+            # X_flow = self.process_features(self.flows)
 
-            # Separate
-            y_flow = self.flows["label"]
+            # Create X_flow with the current flows minus the label
             X_flow = self.flows.drop("label", axis=1)
+            # Create y_flow with the label
+            y_flow = numpy.full(X_flow.shape[0], self.label)
+            # Drop the module_labels
             X_flow = X_flow.drop("module_labels", axis=1)
 
             # Normalize this batch of data so far. This can get progressivle slow
@@ -90,7 +98,7 @@ def train(self):
             # Train
             try:
                 self.clf.partial_fit(
-                    X_flow, y_flow, classes=["Malware", "Normal"]
+                    X_flow, y_flow, classes=["Malicious", "Benign"]
                 )
             except Exception:
                 self.print("Error while calling clf.train()")
@@ -113,7 +121,7 @@ def train(self):
             self.store_model()
 
         except Exception:
-            self.print("Error in train()", 0, 1)
+            self.print("Error in train().", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
@@ -123,7 +131,7 @@ def process_features(self, dataset):
         """
         try:
             # Discard some type of flows that dont have ports
-            to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp"]
+            to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""]
             for proto in to_discard:
                 dataset = dataset[dataset.proto != proto]
 
@@ -132,21 +140,20 @@ def process_features(self, dataset):
                 "appproto",
                 "daddr",
                 "saddr",
-                "ts",
-                "origstate",
+                "starttime",
                 "type_",
-                "dir_",
-                "history",
-                "dbytes",
-                "dpkts",
                 "smac",
                 "dmac",
+                "history",
                 "uid",
+                "dir_",
+                "endtime",
+                "flow_source",
             ]
             for field in to_drop:
                 try:
                     dataset = dataset.drop(field, axis=1)
-                except ValueError:
+                except (ValueError, KeyError):
                     pass
 
             # When flows are read from Slips sqlite,
@@ -155,11 +162,10 @@ def process_features(self, dataset):
             # So transform here
             dataset["state"] = dataset.apply(
                 lambda row: self.db.get_final_state_from_flags(
-                    row["state"], row["pkts"]
+                    row["state"], (row["spkts"] + row["dpkts"])
                 ),
                 axis=1,
             )
-            # dataset.state = new_state_column
 
             # Convert state to categorical
             dataset.state = dataset.state.str.replace(
@@ -193,58 +199,42 @@ def process_features(self, dataset):
             dataset.proto = dataset.proto.str.replace(
                 r"(^.*arp.*$)", "4", regex=True
             )
-            dataset.proto = dataset.proto.astype("float64")
-            try:
-                # Convert dport to float
-                dataset.dport = dataset.dport.astype("float")
-            except ValueError:
-                pass
-            try:
-                # Convert sport to float
-                dataset.sport = dataset.sport.astype("float")
-            except ValueError:
-                pass
-            try:
-                # Convert Dur to float
-                dataset.dur = dataset.dur.astype("float")
-            except ValueError:
-                pass
-            try:
-                # Convert TotPkts to float
-                dataset.pkts = dataset.pkts.astype("float")
-            except ValueError:
-                pass
-            try:
-                # Convert SrcPkts to float
-                dataset.spkts = dataset.spkts.astype("float")
-            except ValueError:
-                pass
-            try:
-                # Convert TotBytes to float
-                dataset.allbytes = dataset.allbytes.astype("float")
-            except ValueError:
-                pass
-            try:
-                # Convert SrcBytes to float
-                dataset.sbytes = dataset.sbytes.astype("float")
-            except ValueError:
-                pass
+
+            dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"]
+            dataset["pkts"] = dataset["spkts"] + dataset["dpkts"]
+
+            fields_to_convert_to_float = [
+                dataset.proto,
+                dataset.dport,
+                dataset.sport,
+                dataset.dur,
+                dataset.pkts,
+                dataset.spkts,
+                dataset.allbytes,
+                dataset.sbytes,
+                dataset.state,
+            ]
+            for field in fields_to_convert_to_float:
+                try:
+                    field = field.astype("float64")
+                except (ValueError, AttributeError):
+                    pass
+
             return dataset
         except Exception:
             # Stop the timer
             self.print("Error in process_features()")
             self.print(traceback.format_exc(), 0, 1)
 
-    def process_flows(self):
+    def process_training_flows(self):
         """
-        Process all the flwos in the DB
+        Process all the flows in the DB
         Store the pandas df in self.flows
         """
         try:
             # We get all the flows so far
             # because this retraining happens in batches
-            flows: list = self.db.get_all_flows()
-
+            flows = self.db.get_all_flows()
             # Check how many different labels are in the DB
             # We need both normal and malware
             labels = self.db.get_labels()
@@ -254,48 +244,48 @@ def process_flows(self):
                 # that are fake but representative of a normal and malware flow
                 # they are only for the training process
                 # At least 1 flow of each label is required
-                # self.print(f'Amount of labeled flows: {labels}', 0, 1)
+
+                # These flows should be in the same format as the ones in the DB.
+                # Which means the satate is still SF, S0, etc.
                 flows.append(
                     {
-                        "ts": 1594417039.029793,
+                        "starttime": 1594417039.029793,
                         "dur": "1.9424750804901123",
                         "saddr": "10.7.10.101",
                         "sport": "49733",
                         "daddr": "40.70.224.145",
                         "dport": "443",
                         "proto": "tcp",
-                        "origstate": "SRPA_SPA",
-                        "state": "Established",
-                        "pkts": 84,
-                        "allbytes": 42764,
-                        "spkts": 37,
+                        "state": "SF",
+                        "spkts": 17,
+                        "dpkts": 27,
                         "sbytes": 25517,
+                        "dbytes": 17247,
                         "appproto": "ssl",
-                        "label": "Malware",
+                        "label": "Malicious",
                         "module_labels": {
-                            "flowalerts-long-connection": "Malware"
+                            "flowalerts-long-connection": "Malicious"
                         },
                     }
                 )
                 flows.append(
                     {
-                        "ts": 1382355032.706468,
+                        "starttime": 1382355032.706468,
                         "dur": "10.896695",
                         "saddr": "147.32.83.52",
                         "sport": "47956",
                         "daddr": "80.242.138.72",
                         "dport": "80",
                         "proto": "tcp",
-                        "origstate": "SRPA_SPA",
-                        "state": "Established",
-                        "pkts": 67,
-                        "allbytes": 67696,
+                        "state": "SF",
                         "spkts": 1,
+                        "dpkts": 0,
                         "sbytes": 100,
+                        "dbytes": 67596,
                         "appproto": "http",
-                        "label": "Normal",
+                        "label": "Benign",
                         "module_labels": {
-                            "flowalerts-long-connection": "Normal"
+                            "flowalerts-long-connection": "Benign"
                         },
                     }
                 )
@@ -314,42 +304,51 @@ def process_flows(self):
             self.print("Error in process_flows()")
             self.print(traceback.format_exc(), 0, 1)
 
-    def process_flow(self):
+    def process_flow(self, flow_to_process: dict):
         """
         Process one flow. Only used during detection in testing
-        Store the pandas df in self.flow
+        returns the pandas df with the processed flow
         """
         try:
             # Convert the flow to a pandas dataframe
-            raw_flow = pd.DataFrame(self.flow_dict, index=[0])
-            # Process features
+            raw_flow = pd.DataFrame(flow_to_process, index=[0])
             dflow = self.process_features(raw_flow)
             # Update the flow to the processed version
-            self.flow = dflow
+            return dflow
         except Exception:
             # Stop the timer
             self.print("Error in process_flow()")
             self.print(traceback.format_exc(), 0, 1)
 
-    def detect(self):
+    def detect(self, x_flow) -> Optional[numpy.ndarray]:
         """
-        Detect this flow with the current model stored
+        Detects the given flow with the current model stored
+        and returns the predection array
         """
         try:
-            # Store the real label if there is one
-            # y_flow = self.flow["label"]
-            # remove the real label column
-            self.flow = self.flow.drop("label", axis=1)
-            # remove the label predictions column of the other modules
-            X_flow = self.flow.drop("module_labels", axis=1)
+            # clean the flow
+            fields_to_drop = [
+                "label",
+                "module_labels",
+                "uid",
+                "history",
+                "dir_",
+                "endtime",
+                "flow_source",
+            ]
+            for field in fields_to_drop:
+                try:
+                    x_flow = x_flow.drop(field, axis=1)
+                except (KeyError, ValueError):
+                    pass
             # Scale the flow
-            X_flow = self.scaler.transform(X_flow)
-            pred = self.clf.predict(X_flow)
+            x_flow: numpy.ndarray = self.scaler.transform(x_flow)
+            pred: numpy.ndarray = self.clf.predict(x_flow)
             return pred
-        except Exception:
-            # Stop the timer
-            self.print("Error in detect() X_flow:")
-            self.print(X_flow)
+        except Exception as e:
+            self.print(
+                f"Error in detect() while processing " f"\n{x_flow}\n{e}"
+            )
             self.print(traceback.format_exc(), 0, 1)
 
     def store_model(self):
@@ -357,10 +356,10 @@ def store_model(self):
         Store the trained model on disk
         """
         self.print("Storing the trained model and scaler on disk.", 0, 2)
-        with open("./modules/flowmldetection/model.bin", "wb") as f:
+        with open(self.model_path, "wb") as f:
             data = pickle.dumps(self.clf)
             f.write(data)
-        with open("./modules/flowmldetection/scaler.bin", "wb") as g:
+        with open(self.scaler_path, "wb") as g:
             data = pickle.dumps(self.scaler)
             g.write(data)
 
@@ -370,20 +369,23 @@ def read_model(self):
         """
         try:
             self.print("Reading the trained model from disk.", 0, 2)
-            with open("./modules/flowmldetection/model.bin", "rb") as f:
+            with open(self.model_path, "rb") as f:
                 self.clf = pickle.load(f)
             self.print("Reading the trained scaler from disk.", 0, 2)
-            with open("./modules/flowmldetection/scaler.bin", "rb") as g:
+            with open(self.scaler_path, "rb") as g:
                 self.scaler = pickle.load(g)
         except FileNotFoundError:
             # If there is no model, create one empty
-            self.print("There was no model. Creating a new empty model.", 0, 2)
+            self.print(
+                "There was no model. " "Creating a new empty model.", 0, 2
+            )
             self.clf = SGDClassifier(
                 warm_start=True, loss="hinge", penalty="l1"
             )
         except EOFError:
             self.print(
-                "Error reading model from disk. Creating a new empty model.",
+                "Error reading model from disk. "
+                "Creating a new empty model.",
                 0,
                 2,
             )
@@ -391,39 +393,36 @@ def read_model(self):
                 warm_start=True, loss="hinge", penalty="l1"
             )
 
-    def set_evidence_malicious_flow(
-        self,
-        saddr: str,
-        sport: str,
-        daddr: str,
-        dport: str,
-        twid: str,
-        uid: str,
-    ):
+    def set_evidence_malicious_flow(self, flow: dict, twid: str):
         confidence: float = 0.1
-        ip_identification = self.db.get_ip_identification(daddr)
         description = (
-            f"Malicious flow by ML. Src IP {saddr}:{sport} to "
-            f"{daddr}:{dport} {ip_identification}"
-        )
-
-        timestamp = utils.convert_format(
-            datetime.datetime.now(), utils.alerts_format
+            f"Flow with malicious characteristics by ML. Src IP"
+            f" {flow['saddr']}:{flow['sport']} to "
+            f"{flow['daddr']}:{flow['dport']}"
         )
-
+        twid_number = int(twid.replace("timewindow", ""))
         evidence: Evidence = Evidence(
             evidence_type=EvidenceType.MALICIOUS_FLOW,
             attacker=Attacker(
-                direction=Direction.SRC, attacker_type=IoCType.IP, value=saddr
+                direction=Direction.SRC,
+                ioc_type=IoCType.IP,
+                value=flow["saddr"],
+            ),
+            victim=Victim(
+                direction=Direction.DST,
+                ioc_type=IoCType.IP,
+                value=flow["daddr"],
             ),
             threat_level=ThreatLevel.LOW,
             confidence=confidence,
             description=description,
-            profile=ProfileID(ip=saddr),
-            timewindow=TimeWindow(number=int(twid.replace("timewindow", ""))),
-            uid=[uid],
-            timestamp=timestamp,
-            category=IDEACategory.ANOMALY_TRAFFIC,
+            profile=ProfileID(ip=flow["saddr"]),
+            timewindow=TimeWindow(twid_number),
+            uid=[flow["uid"]],
+            timestamp=flow["starttime"],
+            method=Method.AI,
+            src_port=flow["sport"],
+            dst_port=flow["dport"],
         )
 
         self.db.set_evidence(evidence)
@@ -440,17 +439,20 @@ def pre_main(self):
 
     def main(self):
         if msg := self.get_msg("new_flow"):
-            data = msg["data"]
-            data = json.loads(data)
-            # profileid = data["profileid"]
-            twid = data["twid"]
-            flow = data["flow"]
-            flow = json.loads(flow)
-            # Convert the common fields to something that can
-            # be interpreted
-            # Get the uid which is the key
-            uid = next(iter(flow))
-            self.flow_dict = json.loads(flow[uid])
+            # When a new flow arrives
+            msg = json.loads(msg["data"])
+            self.twid = msg["twid"]
+            self.profileid = msg["profileid"]
+            self.flow = msg["flow"]
+            # These following extra fields are expected in testing. update the original
+            # flow dict to have them
+            self.flow.update(
+                {
+                    "state": msg["interpreted_state"],
+                    "label": msg["label"],
+                    "module_labels": msg["module_labels"],
+                }
+            )
 
             if self.mode == "train":
                 # We are training
@@ -459,55 +461,69 @@ def main(self):
                 # Use labeled flows
                 labels = self.db.get_labels()
                 sum_labeled_flows = sum(i[1] for i in labels)
+
+                # The min labels to retrain is the min number of flows
+                # we should have seen so far in this capture to start training
+                # This is so we dont _start_ training with only 1 flow
+
+                # Once we are over the start minimum, the second condition is
+                # to force to retrain every a minimum_labels_to_retrain number
+                # of flows. So we dont retrain every 1 flow.
                 if (
-                    sum_labeled_flows >= self.minimum_lables_to_retrain
-                    and sum_labeled_flows % self.minimum_lables_to_retrain == 1
+                    sum_labeled_flows >= self.minimum_labels_to_start_train
                 ):
-                    # We get here every 'self.minimum_lables_to_retrain' amount of labels
-                    # So for example we retrain every 100 labels and only when we have at least 100 labels
-                    self.print(
-                        f"Training the model with the last group of flows and labels. Total flows: {sum_labeled_flows}."
-                    )
-                    # Process all flows in the DB and make them ready for pandas
-                    self.process_flows()
-                    # Train an algorithm
-                    self.train()
+                    if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain):
+                        # So for example we retrain every 50 labels and only when
+                        # we have at least 50 labels
+                        self.print(
+                            f"Training the model with the last group of "
+                            f"flows and labels. Total flows: {sum_labeled_flows}."
+                        )
+                        # Process all flows in the DB and make them ready
+                        # for pandas
+                        self.process_training_flows()
+                        # Train an algorithm
+                        self.train()
+                        self.last_number_of_flows_when_trained = sum_labeled_flows
+
             elif self.mode == "test":
                 # We are testing, which means using the model to detect
-                self.process_flow()
+                processed_flow = self.process_flow(self.flow)
 
-                # After processing the flow, it may happen that we delete icmp/arp/etc
-                # so the dataframe can be empty
-                if self.flow is not None and not self.flow.empty:
+                # After processing the flow, it may happen that we
+                # delete icmp/arp/etc so the dataframe can be empty
+                if processed_flow is not None and not processed_flow.empty:
                     # Predict
-                    pred = self.detect()
-                    label = self.flow_dict["label"]
+                    pred: numpy.ndarray = self.detect(processed_flow)
+                    if not pred:
+                        # an error occurred
+                        return
 
-                    # Report
+                    label = self.flow["label"]
                     if label and label != "unknown" and label != pred[0]:
-                        # If the user specified a label in test mode, and the label
-                        # is diff from the prediction, print in debug mode
+                        # If the user specified a label in test mode,
+                        # and the label is diff from the prediction,
+                        # print in debug mode
                         self.print(
-                            f'Report Prediction {pred[0]} for label {label} flow {self.flow_dict["saddr"]}:'
-                            f'{self.flow_dict["sport"]} -> {self.flow_dict["daddr"]}:'
-                            f'{self.flow_dict["dport"]}/{self.flow_dict["proto"]}',
+                            f"Predicted {pred[0]} for ground-truth label"
+                            f' {label}. Flow {self.flow["saddr"]}:'
+                            f'{self.flow["sport"]} ->'
+                            f' {self.flow["daddr"]}:'
+                            f'{self.flow["dport"]}/'
+                            f'{self.flow["proto"]}',
                             0,
                             3,
                         )
-                    if pred[0] == "Malware":
+                    if pred[0] == "Malicious":
                         # Generate an alert
-                        self.set_evidence_malicious_flow(
-                            self.flow_dict["saddr"],
-                            self.flow_dict["sport"],
-                            self.flow_dict["daddr"],
-                            self.flow_dict["dport"],
-                            twid,
-                            uid,
-                        )
+                        self.set_evidence_malicious_flow(self.flow, self.twid)
                         self.print(
-                            f'Prediction {pred[0]} for label {label} flow {self.flow_dict["saddr"]}:'
-                            f'{self.flow_dict["sport"]} -> {self.flow_dict["daddr"]}:'
-                            f'{self.flow_dict["dport"]}/{self.flow_dict["proto"]}',
+                            f"Prediction {pred[0]} for label {label}"
+                            f' flow {self.flow["saddr"]}:'
+                            f'{self.flow["sport"]} -> '
+                            f'{self.flow["daddr"]}:'
+                            f'{self.flow["dport"]}/'
+                            f'{self.flow["proto"]}',
                             0,
                             2,
                         )

From 57e144cc7fe5f3dda58e0db65af60bd23cac5aa2 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 19 Mar 2025 14:22:38 +0100
Subject: [PATCH 396/455] flowml. If the dataset has one flow and that is
 deleted, then return empty fast.

---
 modules/flowmldetection/flowmldetection.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 1cfbaf925d..0bfaef283e 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -135,6 +135,11 @@ def process_features(self, dataset):
             for proto in to_discard:
                 dataset = dataset[dataset.proto != proto]
 
+            # If te proto is in the list to delete and there is only one flow, then the dataset will be empty
+            if dataset.empty:
+                # DataFrame is empty now, so return empty
+                return dataset
+
             # For now, discard these
             to_drop = [
                 "appproto",

From 5c562206d67d1e98ff72f75af90a2c27685724c5 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 19 Mar 2025 14:23:05 +0100
Subject: [PATCH 397/455] flowml. If the dataset is empty, return None

---
 modules/flowmldetection/flowmldetection.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 0bfaef283e..df1572fa52 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -318,6 +318,8 @@ def process_flow(self, flow_to_process: dict):
             # Convert the flow to a pandas dataframe
             raw_flow = pd.DataFrame(flow_to_process, index=[0])
             dflow = self.process_features(raw_flow)
+            if dflow.empty:
+                return None
             # Update the flow to the processed version
             return dflow
         except Exception:

From a8c11a868b4bc7d5919344c8211c6bfac164c343 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 19 Mar 2025 14:27:16 +0100
Subject: [PATCH 398/455] First new version of the model and scaler. Not good
 yet, but working.

---
 modules/flowmldetection/model.bin  | Bin 1124 -> 1090 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index a6648cf72179520975b0e9ad1164f7d574e87140..7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f 100644
GIT binary patch
delta 130
zcmV-|0Db@D2*L;ifCQB{u>>gtTsw(FScJ$7xocP<K++%n9U}QQz-#d@sqf+^z>4|5
zn$$WZz^$}67u&zYKfDP4vYKo~K<PaAW)J{HKp5zr*LvX7zXT;Xi&zvCK#B|cdeNaM
kK-{i#qAdnGK$EKjD+pMWFI7TSSU{8B162Zxn3FXGFXLA@Pyhe`

delta 131
zcmV-}0DS+#2;>L^fCQCUu>>gtlhXnvSy-jcOc_je6TteGguV+y7r>O2=ii6N`@aE8
z4q<hm<v;E**(1biC&1;CKfROVA3%9ur;SK<>_05T(v)g91VHfmFeIMvRKFpJJ~89v
lBES;fQwQSX>_3x<11kt29Z*;7CnUg=yaQDN?(vfo1TSe6JFWl#

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
index 17115724b9536f6093f9d72f3b58a5c22c562a9a..bfba4d107224e5e6e5a1e8c8f4d463b48131d111 100644
GIT binary patch
delta 290
zcmV+-0p0%k2KolDvjGBX0h6@>Pgl=^Tj5WVAwXGOhP~3C<3D)c8d<%4*gy{6w?@|U
z%s@r+gj&Fu5J2_kY{!Qn^*`?T5Tx7$dq73=gj&E@c0d))6KVlt!ao`IN`KC-h(JOu
zU`*oR6_Z;6CRfo>BRz3ectB?@J_D&+&OhD+nN>A{w?PryCGc<cwm~#B-^|8$dO)gh
z=RSTUXFv*d?raebbV0J?)%52KY(b1c9N@Tpzdv9l8J2bKf<dkRFcK&rl9Sj1C|3#7
zOux9hGeC5AQV0+$-#-%10%-4K(?Bp(=SPMr(?B|Mf3GT(G(g2@E?by3D?owf0UW{U
ou|UJ?4^Yqbtw7cLsJ&lu)<1f`V88dAxIjbzY_OOIL6cVlBeWBRwg3PC

delta 290
zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG
zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa
zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r-wJEWTtnm`Ac
zf^|*Gc0elJHxDoXo<X&kLG{%4&p`{bHVfI|sz2Uv$hvzr(m{gu&Jr}1sgu|OC|8h9
z&D_B~i9nx5TWFn3t3OFIt}1{vt3Z{LgvGX?u|R!8|2|A?L_o(e@1J}aGeCBvuU^0C
o#Xz85;5X{~-$0+hirZ#X%|CGFfPfMb;Xu)C9icy+Op{jwBjCh|6aWAK


From f34775220fe8c4190132be03c3dcd80d0575ece7 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Thu, 20 Mar 2025 13:16:06 +0100
Subject: [PATCH 399/455] model and scaler with 1 malicious and 1 benign

---
 modules/flowmldetection/model.bin  | Bin 1090 -> 1124 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index 7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f..0fac693b39f8e2f0e826471e72a52010709a2a4a 100644
GIT binary patch
delta 132
zcmX@a@q~k=fn{psMix!x$(NZ_BO`RCb>5jSY<wU!LGn&}gUNx$3tLYnGl@A&e4S}J
zBSY*!uI-OaTr&&~Tw#uuJJhD?&}O=|yn5R|`<q--?Rc;A9e69=F=gK7d-lipbZtG?
j>K$N!g2|oC+8oRs(l<p`Iv<$4hdGq_%F%|&A}o3UHQY43

delta 131
zcmV-}0DS-C2*L;ifCQB{u>>gtTsw(FScJ$7xocP<K++%n9U}QQz-#d@sqf+^z>4|5
zn$$WZz^$}67u&zYKfDP4vYKo~K<PaAW)J{HKp5zr*LvX7zXT;Xi&zvCK#B|cdeNaM
lK-{i#qAdnGK$EKjD+pMWFI7TSSU{8B162cym;jSD1TGrII8guq

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
index bfba4d107224e5e6e5a1e8c8f4d463b48131d111..758909b289238ff282b2e056a9b3e83768b8472a 100644
GIT binary patch
delta 290
zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG
zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa
zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r!tJEWTtnm`Ac
zf^|*Gc0elJHxDoXo<X#jLG{%4&p`{bHVfI|sz2Ot$hvzr(m{gu&Jr}1sgu|OC|8h9
z&D_B~i9nx5TWFn3t3OFIt}1{vt3Z{LgvGX?u|Rx7|2|A?L_o(e@1J}aGeCBvuU^0C
o#Xz54;5X{~-$0+hirZ#X%|CDEfPfMb;Xu)C9icy+Op{jwBi%8H3jhEB

delta 290
zcmV+-0p0%k2KolDvjGBX0h6@>Pgl=^Tj5WVAwXGOhP~3C<3D)c8d<%4*gy{6w?@|U
z%s@r+gj&Fu5J2_kY{!Qn^*`?T5Tx7$dq73=gj&E@c0d))6KVlt!ao`IN`KC-h(JOu
zU`*oR6_Z;6CRfo>BRz3ectB?@J_D&+&OhD+nN>A{w?PryCGc<cwm~#B-^|8$dO)gh
z=RSTUXFv*d?raebbV0J?)%52KY(b1c9N@Tpzdv9l8J2bKf<dkRFcK&rl9Sj1C|3#7
zOux9hGeC5AQV0+$-#-%10%-4K(?Bp(=SPMr(?B|Mf3GT(G(g2@E?by3D?owf0UW{U
ou|UJ?4^Yqbtw7cLsJ&lu)<1f`V88dAxIjbzY_OOIL6cVlBeWBRwg3PC


From 744a549b5ce1f37fdcd3f65ecc497cd2c89f3f93 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Thu, 20 Mar 2025 13:16:27 +0100
Subject: [PATCH 400/455] cleaner jupyter

---
 modules/flowmldetection/flowmldetection.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index df1572fa52..a9b8a13585 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -343,6 +343,23 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
                 "endtime",
                 "flow_source",
             ]
+            # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes.
+            # Error
+            ''' [Flow ML Detection] Error in detect() while processing                                                                                                                                                                                                                         
+            dur proto  sport dport  state  pkts  spkts  dpkts  bytes  sbytes  dbytes  allbytes                                                                                                                                                                                    
+            0  63.822830     0  56119   981    0.0    15     15      0   8764    1887       0      1887                                                                                                                                                                                    
+            The feature names should match those that were passed during fit.                                                                                                                                                                                                              
+            Feature names unseen at fit time:                                                                                                                                                                                                                                              
+            - bytes     
+            '''
+
+            # IF we delete here the filed bytes the error is
+            # [Flow ML Detection] Error in detect() while processing 
+            # dur proto sport dport  state  pkts  spkts  dpkts  sbytes  dbytes allbytes                                                                                                                                                             
+            # 0  63.822830     0  56120   980    0.0    15     15      0    1887       0      1887                                                                                                                                                             
+            # The feature names should match those that were passed during fit.                                                                                                                                                                                
+            # Feature names must be in the same order as they were in fit.                                                                                                                                                                                     
+                                                                                                                                                                                                                                                 
             for field in fields_to_drop:
                 try:
                     x_flow = x_flow.drop(field, axis=1)

From 9682f8c59aaa9a372f73447a7579c1ee2bfc478c Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Thu, 20 Mar 2025 22:26:27 +0100
Subject: [PATCH 401/455] New models after 3rd train

---
 modules/flowmldetection/model.bin  | Bin 1124 -> 1124 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index 0fac693b39f8e2f0e826471e72a52010709a2a4a..5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8 100644
GIT binary patch
delta 99
zcmaFD@q}YTFtfkevYaFSxkd-_3;rCozh!)2lYmpEbEdGvA?<Hh%FhcQcr$C8Y3pPC
z18<ZS-x*q|IJlZU`}#BRi@kOC+S)TqI1eyD!9BgtJUbI~Cof{2$owSaz+^KPJplbX
BD~|vG

delta 99
zcmV-p0G$8i2;>N`Qv**dmWkY#6EnbNU>V$pfHc5?(6z~90v14#>uEKZXcoY2J@~Q;
zm@vT916v-#h9y9THMMtpw*Eia3Y9($*ABq#7l@Rev)(_(4lO-NtuB+I1CayN#ekDG
F1TN2UERg^J

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
index 758909b289238ff282b2e056a9b3e83768b8472a..821344a0c69d116622b02e2a0daa1554cb5d308e 100644
GIT binary patch
delta 43
zcmV+`0M!5b2KolDfdU!c4zU>a=p#W80?-ZtN@qVmt(3Z<B4<G|lbQl%0g;md19^W6
B5HkP(

delta 43
zcmV+`0M!5b2KolDfdU!a4zU>a=p#W80?-ZtN@qVmt(3Z<B4<G}lbQl%0g#gc19^VR
B5HSD%


From 1227487a592014bceb780bece99267747ccbafcf Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 26 Mar 2025 00:08:50 +0100
Subject: [PATCH 402/455] Models after 4th train

---
 modules/flowmldetection/model.bin  | Bin 1124 -> 1124 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index 5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8..3ab5a240bb45f88d026d1d9d1959cfa384e2473b 100644
GIT binary patch
delta 120
zcmV-;0EhqN2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv*
zM<l=vT(CiV7%o61L>A;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P
as{<<t5IC<!D(NIZlimYX0{noJHUuueR5UOE

delta 120
zcmaFD@q}YTFmr_2vYaFSxkd-_3;rCozh!)2lYmpEbEdGvA?<Hh%FhcQcr$C8Y3pPC
z18<ZS-x*q|IJlZU`}#BRi@kOC+S)TqI1eyD!9BgtJUbI~;rz*~nYB5XJEU)ltaLsw
P`5to^Q^<kIW-NLDlQb|V

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
index 821344a0c69d116622b02e2a0daa1554cb5d308e..29df65342047c5a499ee3f8e602d1f47cb7e9fca 100644
GIT binary patch
delta 290
zcmV+-0p0%k2KolDvjGB90h6@>PghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB
z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m
z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A
za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o
zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9
ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}Dxj<tAT~->ML6cVlBO2|D9RL6T

delta 290
zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG
zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa
zjm~^LK$BYnCRg4Lu^9H~BS8=X&<+7gXFor!l)9lJXF)RTPqWf%b3r!tJEWTtnm`Ac
zf^|*Gc0elJHxDoXo<X#jLG{%4&p`{bHVfI|sz2Ot$hvzr(m{gu&Jr}1sgu|OC|8kA
z&D_B~i9nx5TWFn3t3OFIt}1{vt3Z{LgvGX?u|Rx7|2|A?L_o(e@1J}aGeCBvuU^0C
o#Xz54;5X{~-$0+hirZ#X%|CDEfPfMb;Xu)C9icy+Op{jwBi>ty4FCWD


From 237b6ef13aca3eddca3de9b5cf8f255260238bb6 Mon Sep 17 00:00:00 2001
From: Sebas Garcia <eldraco@gmail.com>
Date: Wed, 26 Mar 2025 08:28:59 +0100
Subject: [PATCH 403/455] Models of ml flow with the first good performance in
 small tests

---
 modules/flowmldetection/model.bin  | Bin 1124 -> 1124 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index 3ab5a240bb45f88d026d1d9d1959cfa384e2473b..a6648cf72179520975b0e9ad1164f7d574e87140 100644
GIT binary patch
delta 121
zcmV-<0EYkM2;>N`Qv+C~&P*9hb`!w*mV~|wLl?l5mFM4w$NRqlOAcXmpyfaAG1();
zYA3+ulRv$a;~zkIU#E>ocI-ba#L|>%Hv~ZN`Y<G(@KnDch(0mn<s!fm-ctwS<Lp0^
bs{<<tAstXx>?b6^limYW1McwvlQsk{8#y@u

delta 121
zcmV-<0EYkM2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv*
zM<l=vT(CiV7%o61L>A;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P
bs{<<t5IC<!D(NIZlimYW1LXXGlQsk{_eeDq

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
index 29df65342047c5a499ee3f8e602d1f47cb7e9fca..17115724b9536f6093f9d72f3b58a5c22c562a9a 100644
GIT binary patch
delta 290
zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG
zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa
zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r-wJEWTtnm`Ac
zf^|*Gc0elJHxDoXo<X&kLG{%4&p`{bHVfI|sz2Uv$hvzr(m{gu&Jr}1sgu|OC|8h9
z&D_B~i9nx5TWFn3t3OFIt}1{vt3Z{LgvGX?u|R!8|2|A?L_o(e@1J}aGeCBvuU^0C
o#Xz85;5X{~-$0+hirZ#X%|CGFfPfMb;Xu)C9icy+Op{jwBjCh|6aWAK

delta 290
zcmV+-0p0%k2KolDvjGB90h6@>PghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB
z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m
z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A
za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o
zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9
ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}Dxj<tAT~->ML6cVlBO2|D9RL6T


From 43aae2e88f823e4a3d5e751b02b521d5487d231e Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:49:23 +0000
Subject: [PATCH 404/455] Add plot for flowml train scores

---
 modules/flowmldetection/plot_train_score.py | 56 +++++++++++++++++++++
 1 file changed, 56 insertions(+)
 create mode 100644 modules/flowmldetection/plot_train_score.py

diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py
new file mode 100644
index 0000000000..0b5b5b72ba
--- /dev/null
+++ b/modules/flowmldetection/plot_train_score.py
@@ -0,0 +1,56 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+import re
+import sys
+
+def plot_log_data(file_path):
+    # Read the log data from the file
+    with open(file_path, 'r') as file:
+        log_data = file.read()
+
+    # Define regex pattern to extract relevant data from each line
+    pattern = r"Background: (\d+). Benign: (\d+). Malicious: (\d+). Total labels: (\d+\.\d+). Score: (\d+\.\d+)"
+
+    # Parse the log file
+    data = re.findall(pattern, log_data)
+
+    # Convert data to a DataFrame
+    df = pd.DataFrame(data, columns=["Background", "Benign", "Malicious", "Total labels", "Score"])
+    df = df.astype({
+        "Background": int,
+        "Benign": int,
+        "Malicious": int,
+        "Total labels": float,
+        "Score": float
+    })
+
+    # Plotting the values
+    fig, ax1 = plt.subplots(figsize=(10, 6))
+
+    # Plotting Score on the left y-axis
+    ax1.plot(df.index, df["Score"], label="Score", color='tab:blue')
+    ax1.set_xlabel('Index')
+    ax1.set_ylabel('Score', color='tab:blue')
+    ax1.tick_params(axis='y', labelcolor='tab:blue')
+
+    # Create the second y-axis for the Total labels
+    ax2 = ax1.twinx()
+    ax2.plot(df.index, df["Total labels"], label="Total labels", color='tab:red')
+    ax2.set_ylabel('Total labels', color='tab:red')
+    ax2.tick_params(axis='y', labelcolor='tab:red')
+
+    # Adding title and legend
+    plt.title('Log Data Visualization')
+    fig.tight_layout()
+
+    # Save plot to a PNG file
+    plt.savefig('log_data_plot_with_two_scales.png')
+
+    # Display the plot
+    plt.show()
+
+# Make sure the file path is passed as an argument
+if len(sys.argv) < 2:
+    print("Please provide the path to the log file as a parameter.")
+else:
+    plot_log_data(sys.argv[1])

From 6f045c72b8ac57f7b866f8cd14b0fe98fc668a9c Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:50:04 +0000
Subject: [PATCH 405/455] Add a log file to store the training data output

---
 modules/flowmldetection/flowmldetection.py | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index a9b8a13585..8a319cb4e2 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -68,12 +68,29 @@ def init(self):
         self.model_path = "./modules/flowmldetection/model.bin"
         self.scaler_path = "./modules/flowmldetection/scaler.bin"
 
+        # Initialize the training log file
+        self.training_log_path = "./modules/flowmldetection/training.log"
+        with open(self.training_log_path, "w") as log_file:
+            log_file.write("Training Log Initialized\n")
+
     def read_configuration(self):
         conf = ConfigParser()
         self.mode = conf.get_ml_mode()
+        # This is the global label in the configuration,
+        # in case the flows do not have a label themselves
         self.label = conf.label()
 
-    def train(self):
+    def write_to_training_log(self, message: str):
+        """
+        Write a message to the training log file.
+        """
+        try:
+            with open(self.training_log_path, "a") as log_file:
+                log_file.write(message + "\n")
+        except Exception as e:
+            self.print(f"Error writing to training log: {e}", 0, 1)
+
+    def train(self, sum_labeled_flows):
         """
         Train a model based on the flows we receive and the labels
         """

From 8a42f14ad61b5230c8426dbfef1f8bc0bd839a0b Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:50:32 +0000
Subject: [PATCH 406/455] Store data in the log file of training

---
 modules/flowmldetection/flowmldetection.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 8a319cb4e2..28e8e7eca8 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -137,9 +137,13 @@ def train(self, sum_labeled_flows):
             # Store the models on disk
             self.store_model()
 
+            # Log training information
+            self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}")
+            #self.write_to_training_log(f"Model parameters: {self.clf.coef_}")
         except Exception:
             self.print("Error in train().", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+            self.write_to_training_log("Error occurred during training.")
 
     def process_features(self, dataset):
         """

From f4dd77bff3cdb4428269ab005fb0c4b451efc9f8 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:50:53 +0000
Subject: [PATCH 407/455] better comments

---
 modules/flowmldetection/flowmldetection.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 28e8e7eca8..676907a6df 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -59,10 +59,9 @@ def init(self):
         self.minimum_labels_to_start_train = 50
         # Minum amount of new labels needed to retrain
         self.minimum_labels_to_retrain = 50
-        # The number of flows when last trained
+        # The number of flows when last trained. Used internally only to know
+        # when to retrain
         self.last_number_of_flows_when_trained = 0
-        # To plot the scores of training
-        # self.scores = []
         # The scaler trained during training and to use during testing
         self.scaler = StandardScaler()
         self.model_path = "./modules/flowmldetection/model.bin"

From 7e72af1c156068ff3e4b91217d53830c9a4f6262 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:51:30 +0000
Subject: [PATCH 408/455] Fix issue not dropping detailed labels

---
 modules/flowmldetection/flowmldetection.py | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 676907a6df..483c6a1d69 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -94,23 +94,19 @@ def train(self, sum_labeled_flows):
         Train a model based on the flows we receive and the labels
         """
         try:
-            # Get the flows from the DB
-            # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid)
-            # Convert to pandas df
-            # self.flows = pd.DataFrame(self.flows)
-            # Process the features
-            # X_flow = self.process_features(self.flows)
-
             # Create X_flow with the current flows minus the label
-            X_flow = self.flows.drop("label", axis=1)
-            # Create y_flow with the label
-            y_flow = numpy.full(X_flow.shape[0], self.label)
+            X_flow = self.flows.drop("ground_truth_label", axis=1)
+            # Drop the detailed labels
+            X_flow = X_flow.drop("detailed_ground_truth_label", axis=1)
             # Drop the module_labels
             X_flow = X_flow.drop("module_labels", axis=1)
+            # Create y_flow with the label
+            y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label)
 
             # Normalize this batch of data so far. This can get progressivle slow
             X_flow = self.scaler.fit_transform(X_flow)
 
+
             # Train
             try:
                 self.clf.partial_fit(

From beaf213d6167832d8c3f1e98eb6bc98d2e40d29d Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:51:53 +0000
Subject: [PATCH 409/455] Fix issue that not all labels were given to the
 partial fit

---
 modules/flowmldetection/flowmldetection.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 483c6a1d69..b06c9a54e3 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -109,8 +109,9 @@ def train(self, sum_labeled_flows):
 
             # Train
             try:
+                # Online incremental learning
                 self.clf.partial_fit(
-                    X_flow, y_flow, classes=["Malicious", "Benign"]
+                    X_flow, y_flow, classes=["Background", "Malicious", "Benign"]
                 )
             except Exception:
                 self.print("Error while calling clf.train()")

From 5b290a7fc764e26766d3519bbafe54b43cdae603 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:52:08 +0000
Subject: [PATCH 410/455] count partial labels in this epoch

---
 modules/flowmldetection/flowmldetection.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index b06c9a54e3..184a6b3455 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -106,6 +106,12 @@ def train(self, sum_labeled_flows):
             # Normalize this batch of data so far. This can get progressivle slow
             X_flow = self.scaler.fit_transform(X_flow)
 
+            # Count the number of labels of each type in this epoc
+            epoch_label_counts = {
+                "Background": (y_flow == "Background").sum(),
+                "Malicious": (y_flow == "Malicious").sum(),
+                "Benign": (y_flow == "Benign").sum(),
+            }
 
             # Train
             try:

From 1cb44821b4885c0a648bf5183dfdde83c4d71cc8 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:55:09 +0000
Subject: [PATCH 411/455] Don't print training on screen

---
 modules/flowmldetection/flowmldetection.py | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 184a6b3455..4dd8191f87 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -126,15 +126,8 @@ def train(self, sum_labeled_flows):
             # See score so far in training
             score = self.clf.score(X_flow, y_flow)
 
-            # To debug the training score
-            # self.scores.append(score)
-
-            self.print(f"	Training Score: {score}", 0, 1)
-            # self.print(f'    Model Parameters: {self.clf.coef_}')
-
-            # Debug code to store a plot in a png of the scores
-            # plt.plot(self.scores)
-            # plt.savefig('train-scores.png')
+            #self.print(f"	Training Score: {score}", 1, 0)
+            #self.print(f'    Model Parameters: {self.clf.coef_}', 1, 0)
 
             # Store the models on disk
             self.store_model()

From a38524eada2e31b202392335cf470a1b08bbd25f Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:55:28 +0000
Subject: [PATCH 412/455] Add function to write to train log

---
 modules/flowmldetection/flowmldetection.py | 34 ++++++++++++----------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 4dd8191f87..679e7c0cc9 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -247,28 +247,28 @@ def process_features(self, dataset):
             self.print("Error in process_features()")
             self.print(traceback.format_exc(), 0, 1)
 
-    def process_training_flows(self):
+    def process_training_flows(self, last_number_of_flows_when_trained):
         """
-        Process all the flows in the DB
+        Process only the new flows in the DB since the last training.
         Store the pandas df in self.flows
         """
         try:
+            # Ensure the index is an integer
+            if last_number_of_flows_when_trained is None:
+                last_number_of_flows_when_trained = 0
+            else:
+                last_number_of_flows_when_trained = int(last_number_of_flows_when_trained)
+
             # We get all the flows so far
-            # because this retraining happens in batches
             flows = self.db.get_all_flows()
-            # Check how many different labels are in the DB
-            # We need both normal and malware
+            # Only process new flows since last training
+            new_flows = flows[last_number_of_flows_when_trained:]
+
+            # Check how many **different** labels are in the DB
             labels = self.db.get_labels()
             if len(labels) == 1:
-                # Only 1 label has flows
-                # There are not enough different labels, so insert two flows
-                # that are fake but representative of a normal and malware flow
-                # they are only for the training process
-                # At least 1 flow of each label is required
-
-                # These flows should be in the same format as the ones in the DB.
-                # Which means the satate is still SF, S0, etc.
-                flows.append(
+                # Insert fake flows for both classes if needed
+                new_flows.append(
                     {
                         "starttime": 1594417039.029793,
                         "dur": "1.9424750804901123",
@@ -358,6 +358,8 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
                 "dir_",
                 "endtime",
                 "flow_source",
+                "ground_truth_label",  # todo now we can use them
+                "detailed_ground_truth_label",
             ]
             # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes.
             # Error
@@ -502,11 +504,11 @@ def main(self):
                 labels = self.db.get_labels()
                 sum_labeled_flows = sum(i[1] for i in labels)
 
-                # The min labels to retrain is the min number of flows
+                # The min labels to retrain is the min number of flows 
                 # we should have seen so far in this capture to start training
                 # This is so we dont _start_ training with only 1 flow
 
-                # Once we are over the start minimum, the second condition is
+                # Once we are over the start minimum, the second condition is 
                 # to force to retrain every a minimum_labels_to_retrain number
                 # of flows. So we dont retrain every 1 flow.
                 if (

From 9a888b7055b804316775159042255e84a191869c Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:57:27 +0000
Subject: [PATCH 413/455] Fix label in dummy flow

---
 modules/flowmldetection/flowmldetection.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 679e7c0cc9..95c9b82a74 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -283,13 +283,13 @@ def process_training_flows(self, last_number_of_flows_when_trained):
                         "sbytes": 25517,
                         "dbytes": 17247,
                         "appproto": "ssl",
-                        "label": "Malicious",
+                        "ground_truth_label": "Malicious",
                         "module_labels": {
                             "flowalerts-long-connection": "Malicious"
                         },
                     }
                 )
-                flows.append(
+                new_flows.append(
                     {
                         "starttime": 1382355032.706468,
                         "dur": "10.896695",

From 8f8a5443834244a4522f80ef17cdb073d3976bc4 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:57:39 +0000
Subject: [PATCH 414/455] Fix dummy flow

---
 modules/flowmldetection/flowmldetection.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 95c9b82a74..5ea48fbc40 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -304,7 +304,7 @@ def process_training_flows(self, last_number_of_flows_when_trained):
                         "sbytes": 100,
                         "dbytes": 67596,
                         "appproto": "http",
-                        "label": "Benign",
+                        "ground_truth_label": "Benign",
                         "module_labels": {
                             "flowalerts-long-connection": "Benign"
                         },

From d27350f5678356eda2dfdea7722c4a2567a3a93f Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 16:58:28 +0000
Subject: [PATCH 415/455] Rename variable

---
 modules/flowmldetection/flowmldetection.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 5ea48fbc40..ff68b8a270 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -310,10 +310,9 @@ def process_training_flows(self, last_number_of_flows_when_trained):
                         },
                     }
                 )
-                # If there are enough flows, we dont insert them anymore
 
             # Convert to pandas df
-            df_flows = pd.DataFrame(flows)
+            df_flows = pd.DataFrame(new_flows)
 
             # Process features
             df_flows = self.process_features(df_flows)
@@ -321,7 +320,6 @@ def process_training_flows(self, last_number_of_flows_when_trained):
             # Update the flow to the processed version
             self.flows = df_flows
         except Exception:
-            # Stop the timer
             self.print("Error in process_flows()")
             self.print(traceback.format_exc(), 0, 1)
 

From 4242689cf0a9b71ba877668080c5f7907d944d45 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:00:32 +0000
Subject: [PATCH 416/455] Fix dummy flow label

---
 modules/flowmldetection/flowmldetection.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index ff68b8a270..6b41b40298 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -356,7 +356,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
                 "dir_",
                 "endtime",
                 "flow_source",
-                "ground_truth_label",  # todo now we can use them
+                "ground_truth_label",
                 "detailed_ground_truth_label",
             ]
             # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes.

From 6d561e03770607761204e82b027fc8f167c0887e Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:00:47 +0000
Subject: [PATCH 417/455] Pass values to train function

---
 modules/flowmldetection/flowmldetection.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 6b41b40298..4d66aab855 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -521,9 +521,9 @@ def main(self):
                         )
                         # Process all flows in the DB and make them ready
                         # for pandas
-                        self.process_training_flows()
+                        self.process_training_flows(self.last_number_of_flows_when_trained)
                         # Train an algorithm
-                        self.train()
+                        self.train(sum_labeled_flows)
                         self.last_number_of_flows_when_trained = sum_labeled_flows
 
             elif self.mode == "test":

From 50d892127da4c1bbaf150997363c3cc9b1d41f9a Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:01:47 +0000
Subject: [PATCH 418/455] import os

---
 modules/flowmldetection/flowmldetection.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 4d66aab855..766178e127 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -10,6 +10,7 @@
 import json
 import traceback
 import warnings
+import os
 
 from slips_files.common.parsers.config_parser import ConfigParser
 from slips_files.common.slips_utils import utils

From a7cf82be948b4ff673f189d62d89276b1b385471 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:03:53 +0000
Subject: [PATCH 419/455] Delete old comments

---
 modules/flowmldetection/flowmldetection.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 766178e127..6c3bfc1275 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -28,10 +28,6 @@
     Method,
 )
 
-# Only for debbuging
-# from matplotlib import pyplot as plt
-
-
 # This horrible hack is only to stop sklearn from printing those warnings
 def warn(*args, **kwargs):
     pass

From 06add4106a0c833a368dad445a094a0a76f11f3d Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:13:22 +0000
Subject: [PATCH 420/455] Fix plots

---
 modules/flowmldetection/plot_train_score.py | 48 ++++++++++++++++-----
 1 file changed, 38 insertions(+), 10 deletions(-)

diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py
index 0b5b5b72ba..359df04eff 100644
--- a/modules/flowmldetection/plot_train_score.py
+++ b/modules/flowmldetection/plot_train_score.py
@@ -2,6 +2,8 @@
 import matplotlib.pyplot as plt
 import re
 import sys
+import argparse
+import os
 
 def plot_log_data(file_path):
     # Read the log data from the file
@@ -24,33 +26,59 @@ def plot_log_data(file_path):
         "Score": float
     })
 
+    # Get the directory of the log file to store the plot in the same folder
+    dir_name = os.path.dirname(file_path)
+    plot_file = os.path.join(dir_name, 'log_data_plot_with_two_scales.png')
+
     # Plotting the values
     fig, ax1 = plt.subplots(figsize=(10, 6))
 
-    # Plotting Score on the left y-axis
+    # Plotting Score on the left y-axis (with proper scaling from 0 to 1)
     ax1.plot(df.index, df["Score"], label="Score", color='tab:blue')
     ax1.set_xlabel('Index')
     ax1.set_ylabel('Score', color='tab:blue')
+    ax1.set_ylim(0, 1)  # Set y-axis for Score from 0 to 1
     ax1.tick_params(axis='y', labelcolor='tab:blue')
 
-    # Create the second y-axis for the Total labels
+    # Create the second y-axis for the Background, Benign, Malicious, Total labels
     ax2 = ax1.twinx()
+    ax2.plot(df.index, df["Background"], label="Background", color='tab:green', linestyle='--')
+    ax2.plot(df.index, df["Benign"], label="Benign", color='tab:orange', linestyle='--')
+    ax2.plot(df.index, df["Malicious"], label="Malicious", color='tab:pink', linestyle='--')
     ax2.plot(df.index, df["Total labels"], label="Total labels", color='tab:red')
-    ax2.set_ylabel('Total labels', color='tab:red')
+    ax2.set_ylabel('Background, Benign, Malicious, Total labels', color='tab:red')
+    
+    # Set appropriate scale for right y-axis based on the data
+    ax2.set_ylim(0, df[["Background", "Benign", "Malicious", "Total labels"]].max().max())
     ax2.tick_params(axis='y', labelcolor='tab:red')
 
     # Adding title and legend
     plt.title('Log Data Visualization')
     fig.tight_layout()
 
-    # Save plot to a PNG file
-    plt.savefig('log_data_plot_with_two_scales.png')
+    # Adding the legend with increased space for readability
+    ax1.legend(loc='upper left', bbox_to_anchor=(1, 1), fontsize='small')
+    ax2.legend(loc='upper left', bbox_to_anchor=(1, 0.7), fontsize='small')
+
+    # Increase right margin for better readability of legend
+    plt.subplots_adjust(right=0.75)
+
+    # Save plot to the same folder as the log file
+    plt.savefig(plot_file)
 
     # Display the plot
     plt.show()
 
-# Make sure the file path is passed as an argument
-if len(sys.argv) < 2:
-    print("Please provide the path to the log file as a parameter.")
-else:
-    plot_log_data(sys.argv[1])
+def main():
+    # Parse command-line arguments
+    parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.")
+    parser.add_argument('log_file', metavar='log_file', type=str, help="Path to the log file")
+    
+    # Handle -h / --help
+    args = parser.parse_args()
+
+    # Call the function to process the log file
+    plot_log_data(args.log_file)
+
+if __name__ == "__main__":
+    main()

From f5160524451637eb0ad20db0b277395d0683f368 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:14:58 +0000
Subject: [PATCH 421/455] plot_train_score: annotate Total labels as text, not a line

---
 modules/flowmldetection/plot_train_score.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py
index 359df04eff..c7f374a7fe 100644
--- a/modules/flowmldetection/plot_train_score.py
+++ b/modules/flowmldetection/plot_train_score.py
@@ -40,18 +40,21 @@ def plot_log_data(file_path):
     ax1.set_ylim(0, 1)  # Set y-axis for Score from 0 to 1
     ax1.tick_params(axis='y', labelcolor='tab:blue')
 
-    # Create the second y-axis for the Background, Benign, Malicious, Total labels
+    # Create the second y-axis for the Background, Benign, Malicious
     ax2 = ax1.twinx()
     ax2.plot(df.index, df["Background"], label="Background", color='tab:green', linestyle='--')
     ax2.plot(df.index, df["Benign"], label="Benign", color='tab:orange', linestyle='--')
     ax2.plot(df.index, df["Malicious"], label="Malicious", color='tab:pink', linestyle='--')
-    ax2.plot(df.index, df["Total labels"], label="Total labels", color='tab:red')
-    ax2.set_ylabel('Background, Benign, Malicious, Total labels', color='tab:red')
+    ax2.set_ylabel('Background, Benign, Malicious', color='tab:red')
     
     # Set appropriate scale for right y-axis based on the data
-    ax2.set_ylim(0, df[["Background", "Benign", "Malicious", "Total labels"]].max().max())
+    ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max())
     ax2.tick_params(axis='y', labelcolor='tab:red')
 
+    # Annotating Total labels as text on the plot
+    for i, value in enumerate(df["Total labels"]):
+        ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom')
+
     # Adding title and legend
     plt.title('Log Data Visualization')
     fig.tight_layout()

From d1b2bd882e7718d8923436b5485fe0e5398b4383 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:16:23 +0000
Subject: [PATCH 422/455] plot_train_score: rename legend labels and plot title

---
 modules/flowmldetection/plot_train_score.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py
index c7f374a7fe..4099c47c1e 100644
--- a/modules/flowmldetection/plot_train_score.py
+++ b/modules/flowmldetection/plot_train_score.py
@@ -42,10 +42,10 @@ def plot_log_data(file_path):
 
     # Create the second y-axis for the Background, Benign, Malicious
     ax2 = ax1.twinx()
-    ax2.plot(df.index, df["Background"], label="Background", color='tab:green', linestyle='--')
-    ax2.plot(df.index, df["Benign"], label="Benign", color='tab:orange', linestyle='--')
-    ax2.plot(df.index, df["Malicious"], label="Malicious", color='tab:pink', linestyle='--')
-    ax2.set_ylabel('Background, Benign, Malicious', color='tab:red')
+    ax2.plot(df.index, df["Background"], label="Background Labels", color='tab:green', linestyle='--')
+    ax2.plot(df.index, df["Benign"], label="Benign Labels", color='tab:orange', linestyle='--')
+    ax2.plot(df.index, df["Malicious"], label="Malicious Labels", color='tab:pink', linestyle='--')
+    ax2.set_ylabel('Background, Benign, Malicious Labels', color='tab:red')
     
     # Set appropriate scale for right y-axis based on the data
     ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max())
@@ -56,7 +56,7 @@ def plot_log_data(file_path):
         ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom')
 
     # Adding title and legend
-    plt.title('Log Data Visualization')
+    plt.title('Training performance')
     fig.tight_layout()
 
     # Adding the legend with increased space for readability

From ba0e9f1a8cc05c044b76810c1e9fa164492732a5 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 17:24:43 +0000
Subject: [PATCH 423/455] plot_train_score: move legends further to the right

---
 modules/flowmldetection/plot_train_score.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py
index 4099c47c1e..8437e968ac 100644
--- a/modules/flowmldetection/plot_train_score.py
+++ b/modules/flowmldetection/plot_train_score.py
@@ -59,12 +59,12 @@ def plot_log_data(file_path):
     plt.title('Training performance')
     fig.tight_layout()
 
-    # Adding the legend with increased space for readability
-    ax1.legend(loc='upper left', bbox_to_anchor=(1, 1), fontsize='small')
-    ax2.legend(loc='upper left', bbox_to_anchor=(1, 0.7), fontsize='small')
+    # Move both legends further to the right
+    ax1.legend(loc='upper right', bbox_to_anchor=(1.26, 1), fontsize='small', ncol=1)
+    ax2.legend(loc='upper right', bbox_to_anchor=(1.4, 0.95), fontsize='small', ncol=1)
 
     # Increase right margin for better readability of legend
-    plt.subplots_adjust(right=0.75)
+    plt.subplots_adjust(right=0.7)
 
     # Save plot to the same folder as the log file
     plt.savefig(plot_file)

From e089bec8ae86ab1fb938a03b08430b6eace488e2 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:02:34 +0000
Subject: [PATCH 424/455] Plot testing performance from a log

---
 .../plot_testing_performance.py               | 116 ++++--------------
 1 file changed, 24 insertions(+), 92 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index 6865415cdf..a38c7f0598 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -1,7 +1,6 @@
 import matplotlib.pyplot as plt
 import sys
 import numpy as np
-import argparse
 
 def process_file(file_path):
     # Initialize the counters for the values
@@ -50,108 +49,41 @@ def process_file(file_path):
     
     return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values
 
-def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, experiment_number):
-    # Separate the values into two groups based on their proximity to 0 or 1
-    close_to_0 = {
-        'FPR': [], 'FNR': []
-    }
-    close_to_1 = {
-        'TNR': [], 'TPR': [], 'F1': [], 'accuracy': [], 'precision': [], 'MCC': [], 'recall': []
-    }
-    
-    # Categorize the metrics into two groups
-    for i in range(len(FPR_values)):
-        close_to_0['FPR'].append(FPR_values[i])
-        close_to_0['FNR'].append(FNR_values[i])
-        
-        close_to_1['TNR'].append(TNR_values[i])
-        close_to_1['TPR'].append(TPR_values[i])
-        close_to_1['F1'].append(F1_values[i])
-        close_to_1['accuracy'].append(accuracy_values[i])
-        close_to_1['precision'].append(precision_values[i])
-        close_to_1['MCC'].append(MCC_values[i])
-        close_to_1['recall'].append(recall_values[i])
-
-    # Plot metrics for values close to 0 (linear scale)
-    plot_single_group(close_to_0, f'performance_metrics_testing_close_to_0_experiment_{experiment_number}.png', experiment_number, is_close_to_0=True)
-    
-    # Plot metrics for values close to 1 (log scale)
-    plot_single_group(close_to_1, f'performance_metrics_testing_close_to_1_experiment_{experiment_number}.png', experiment_number, is_close_to_0=False)
-
-    # Print the final values
-    print("\nFinal Metric Values for Experiment", experiment_number)
-    print(f"Final FPR: {FPR_values[-1]:.4f}")
-    print(f"Final FNR: {FNR_values[-1]:.4f}")
-    print(f"Final TNR: {TNR_values[-1]:.4f}")
-    print(f"Final TPR: {TPR_values[-1]:.4f}")
-    print(f"Final F1 Score: {F1_values[-1]:.4f}")
-    print(f"Final Accuracy: {accuracy_values[-1]:.4f}")
-    print(f"Final Precision: {precision_values[-1]:.4f}")
-    print(f"Final MCC: {MCC_values[-1]:.4f}")
-    print(f"Final Recall: {recall_values[-1]:.4f}")
-
-def plot_single_group(metrics_dict, output_filename, experiment_number, is_close_to_0=False):
+def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values):
+    # Create the plot
     plt.figure(figsize=(12, 8))
     
-    # Only plot the metrics that exist in the dictionary
-    if 'FPR' in metrics_dict:
-        plt.plot(metrics_dict['FPR'], label='False Positive Rate (FPR)', marker='o')
-    if 'FNR' in metrics_dict:
-        plt.plot(metrics_dict['FNR'], label='False Negative Rate (FNR)', marker='o')
-    if 'TNR' in metrics_dict:
-        plt.plot(metrics_dict['TNR'], label='True Negative Rate (TNR)', marker='o')
-    if 'TPR' in metrics_dict:
-        plt.plot(metrics_dict['TPR'], label='True Positive Rate (TPR)', marker='o')
-    if 'F1' in metrics_dict:
-        plt.plot(metrics_dict['F1'], label='F1 Score', marker='o')
-    if 'accuracy' in metrics_dict:
-        plt.plot(metrics_dict['accuracy'], label='Accuracy', marker='o')
-    if 'precision' in metrics_dict:
-        plt.plot(metrics_dict['precision'], label='Precision', marker='o')
-    if 'MCC' in metrics_dict:
-        plt.plot(metrics_dict['MCC'], label='Matthews Correlation Coefficient (MCC)', marker='o')
-    if 'recall' in metrics_dict:
-        plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o')
-
-    # If the plot is close to 1, apply log scale
-    if not is_close_to_0:
-        plt.yscale('log')
-
-    # If the plot is close to 0, set dynamic Y-ticks based on the min/max values of the series
-    if is_close_to_0:
-        min_val = min(min(metrics_dict['FPR']), min(metrics_dict['FNR']))
-        max_val = max(max(metrics_dict['FPR']), max(metrics_dict['FNR']))
-        
-        # Avoid log(0), so set the minimum limit a little higher than zero
-        if min_val == 0:
-            min_val = 1e-4  # Avoid zero values on the logarithmic scale
-
-        plt.ylim(min_val, max_val)  # Set Y-axis limits based on the data range
-        plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=60))  # Set ticks logarithmically
-
-    # Add the experiment number to the plot title
+    # Plot each metric
+    plt.plot(FPR_values, label='False Positive Rate (FPR)', marker='o')
+    plt.plot(FNR_values, label='False Negative Rate (FNR)', marker='o')
+    plt.plot(TNR_values, label='True Negative Rate (TNR)', marker='o')
+    plt.plot(TPR_values, label='True Positive Rate (TPR)', marker='o')
+    plt.plot(F1_values, label='F1 Score', marker='o')
+    plt.plot(accuracy_values, label='Accuracy', marker='o')
+    plt.plot(precision_values, label='Precision', marker='o')
+    plt.plot(MCC_values, label='Matthews Correlation Coefficient (MCC)', marker='o')
+    plt.plot(recall_values, label='Recall (TPR)', marker='o')
+    
+    # Add labels and title
     plt.xlabel('Index')
     plt.ylabel('Metric Value')
-    plt.title(f'Experiment {experiment_number} - Evaluation Metrics Over Time')
+    plt.title('Evaluation Metrics Over Time')
+    
+    # Add a legend
     plt.legend()
     
-    # Save the plot
-    plt.savefig(output_filename)
+    # Save the plot as a PNG file
+    plt.savefig('metrics_plot.png')
     plt.close()
 
 def main():
-    # Set up argument parsing
-    parser = argparse.ArgumentParser(description='Plot testing performance metrics.')
-    parser.add_argument('-f', '--file', type=str, required=True, help='Path to the testing performance log file')
-    parser.add_argument('-e', '--experiment', type=str, required=True, help='Experiment number')
-
-    args = parser.parse_args()
-    
-    file_path = args.file
-    experiment_number = args.experiment
+    if len(sys.argv) != 2:
+        print("Usage: python script.py <file_path>")
+        sys.exit(1)
     
+    file_path = sys.argv[1]
     FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path)
-    plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, experiment_number)
+    plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values)
 
 if __name__ == "__main__":
     main()

From 499f08bdbda9d16604b33df6e0b60c54cdec709d Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:04:32 +0000
Subject: [PATCH 425/455] plot_testing_performance: use a log-scale y-axis

---
 modules/flowmldetection/plot_testing_performance.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index a38c7f0598..fac0acd64a 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -64,16 +64,19 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu
     plt.plot(MCC_values, label='Matthews Correlation Coefficient (MCC)', marker='o')
     plt.plot(recall_values, label='Recall (TPR)', marker='o')
     
+    # Set logarithmic scale on the y-axis
+    plt.yscale('log')
+    
     # Add labels and title
     plt.xlabel('Index')
-    plt.ylabel('Metric Value')
-    plt.title('Evaluation Metrics Over Time')
+    plt.ylabel('Metric Value (Log Scale)')
+    plt.title('Evaluation Metrics Over Time (Log Scale)')
     
     # Add a legend
     plt.legend()
     
     # Save the plot as a PNG file
-    plt.savefig('metrics_plot.png')
+    plt.savefig('metrics_plot_log_scale.png')
     plt.close()
 
 def main():

From 9007dfbdaccdaaa852e6c1e30e93746fb6052478 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:12:40 +0000
Subject: [PATCH 426/455] plot_testing_performance: split metrics into two plots

---
 .../plot_testing_performance.py               | 76 ++++++++++++++-----
 1 file changed, 55 insertions(+), 21 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index fac0acd64a..5581c72cd4 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -50,33 +50,66 @@ def process_file(file_path):
     return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values
 
 def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values):
-    # Create the plot
-    plt.figure(figsize=(12, 8))
+    # Separate the values into two groups based on their proximity to 0 or 1
+    close_to_0 = {
+        'FPR': [], 'FNR': []
+    }
+    close_to_1 = {
+        'TNR': [], 'TPR': [], 'F1': [], 'accuracy': [], 'precision': [], 'MCC': [], 'recall': []
+    }
     
-    # Plot each metric
-    plt.plot(FPR_values, label='False Positive Rate (FPR)', marker='o')
-    plt.plot(FNR_values, label='False Negative Rate (FNR)', marker='o')
-    plt.plot(TNR_values, label='True Negative Rate (TNR)', marker='o')
-    plt.plot(TPR_values, label='True Positive Rate (TPR)', marker='o')
-    plt.plot(F1_values, label='F1 Score', marker='o')
-    plt.plot(accuracy_values, label='Accuracy', marker='o')
-    plt.plot(precision_values, label='Precision', marker='o')
-    plt.plot(MCC_values, label='Matthews Correlation Coefficient (MCC)', marker='o')
-    plt.plot(recall_values, label='Recall (TPR)', marker='o')
+    # Categorize the metrics into two groups
+    for i in range(len(FPR_values)):
+        close_to_0['FPR'].append(FPR_values[i])
+        close_to_0['FNR'].append(FNR_values[i])
+        
+        close_to_1['TNR'].append(TNR_values[i])
+        close_to_1['TPR'].append(TPR_values[i])
+        close_to_1['F1'].append(F1_values[i])
+        close_to_1['accuracy'].append(accuracy_values[i])
+        close_to_1['precision'].append(precision_values[i])
+        close_to_1['MCC'].append(MCC_values[i])
+        close_to_1['recall'].append(recall_values[i])
+
+    # Plot metrics for values close to 0
+    plot_single_group(close_to_0, 'metrics_plot_close_to_0.png')
     
-    # Set logarithmic scale on the y-axis
-    plt.yscale('log')
+    # Plot metrics for values close to 1
+    plot_single_group(close_to_1, 'metrics_plot_close_to_1.png')
+
+def plot_single_group(metrics_dict, output_filename):
+    plt.figure(figsize=(12, 8))
     
-    # Add labels and title
+    # Only plot the metrics that exist in the dictionary
+    if 'FPR' in metrics_dict:
+        plt.plot(metrics_dict['FPR'], label='False Positive Rate (FPR)', marker='o')
+    if 'FNR' in metrics_dict:
+        plt.plot(metrics_dict['FNR'], label='False Negative Rate (FNR)', marker='o')
+    if 'TNR' in metrics_dict:
+        plt.plot(metrics_dict['TNR'], label='True Negative Rate (TNR)', marker='o')
+    if 'TPR' in metrics_dict:
+        plt.plot(metrics_dict['TPR'], label='True Positive Rate (TPR)', marker='o')
+    if 'F1' in metrics_dict:
+        plt.plot(metrics_dict['F1'], label='F1 Score', marker='o')
+    if 'accuracy' in metrics_dict:
+        plt.plot(metrics_dict['accuracy'], label='Accuracy', marker='o')
+    if 'precision' in metrics_dict:
+        plt.plot(metrics_dict['precision'], label='Precision', marker='o')
+    if 'MCC' in metrics_dict:
+        plt.plot(metrics_dict['MCC'], label='Matthews Correlation Coefficient (MCC)', marker='o')
+    if 'recall' in metrics_dict:
+        plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o')
+
+    # Apply log scale by default
+    plt.yscale('log')
+
     plt.xlabel('Index')
-    plt.ylabel('Metric Value (Log Scale)')
-    plt.title('Evaluation Metrics Over Time (Log Scale)')
-    
-    # Add a legend
+    plt.ylabel('Metric Value')
+    plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})')
     plt.legend()
     
-    # Save the plot as a PNG file
-    plt.savefig('metrics_plot_log_scale.png')
+    # Save the plot
+    plt.savefig(output_filename)
     plt.close()
 
 def main():
@@ -85,6 +118,7 @@ def main():
         sys.exit(1)
     
     file_path = sys.argv[1]
+    
     FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path)
     plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values)
 

From fb2e163811d92a22203ad14e5462c74c8514c6cf Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:16:50 +0000
Subject: [PATCH 427/455] plot_testing_performance: print final metrics, fix near-0 ticks

---
 .../plot_testing_performance.py               | 22 +++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index 5581c72cd4..8f9e12cd86 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -72,12 +72,24 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu
         close_to_1['recall'].append(recall_values[i])
 
     # Plot metrics for values close to 0
-    plot_single_group(close_to_0, 'metrics_plot_close_to_0.png')
+    plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True)
     
     # Plot metrics for values close to 1
     plot_single_group(close_to_1, 'metrics_plot_close_to_1.png')
 
-def plot_single_group(metrics_dict, output_filename):
+    # Print the final values
+    print("\nFinal Metric Values:")
+    print(f"Final FPR: {FPR_values[-1]:.4f}")
+    print(f"Final FNR: {FNR_values[-1]:.4f}")
+    print(f"Final TNR: {TNR_values[-1]:.4f}")
+    print(f"Final TPR: {TPR_values[-1]:.4f}")
+    print(f"Final F1 Score: {F1_values[-1]:.4f}")
+    print(f"Final Accuracy: {accuracy_values[-1]:.4f}")
+    print(f"Final Precision: {precision_values[-1]:.4f}")
+    print(f"Final MCC: {MCC_values[-1]:.4f}")
+    print(f"Final Recall: {recall_values[-1]:.4f}")
+
+def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):
     plt.figure(figsize=(12, 8))
     
     # Only plot the metrics that exist in the dictionary
@@ -103,6 +115,12 @@ def plot_single_group(metrics_dict, output_filename):
     # Apply log scale by default
     plt.yscale('log')
 
+    # If the plot is close to 0, set custom ticks
+    if is_close_to_0:
+        # Manually set more Y-ticks for better visibility
+        plt.ylim(0.0001, 1)  # Set Y-axis limits between 0.0001 and 1
+        plt.yticks([0.0001, 0.001, 0.01, 0.1, 1], ['0.0001', '0.001', '0.01', '0.1', '1'])  # Adjust Y-ticks
+
     plt.xlabel('Index')
     plt.ylabel('Metric Value')
     plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})')

From acac48b8feccf08958d19f68d0375bb4bb7e6df1 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:20:22 +0000
Subject: [PATCH 428/455] Fix plots

---
 modules/flowmldetection/flowmldetection.py | 709 +++++----------------
 1 file changed, 143 insertions(+), 566 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 6c3bfc1275..37f0761109 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -1,566 +1,143 @@
-# SPDX-FileCopyrightText: 2021 Sebastian Garcia <sebastian.garcia@agents.fel.cvut.cz>
-from typing import Optional
-
-# SPDX-License-Identifier: GPL-2.0-only
-import numpy
-from sklearn.linear_model import SGDClassifier
-from sklearn.preprocessing import StandardScaler
-import pickle
-import pandas as pd
-import json
-import traceback
-import warnings
-import os
-
-from slips_files.common.parsers.config_parser import ConfigParser
-from slips_files.common.slips_utils import utils
-from slips_files.common.abstracts.module import IModule
-from slips_files.core.structures.evidence import (
-    Evidence,
-    ProfileID,
-    TimeWindow,
-    Attacker,
-    ThreatLevel,
-    EvidenceType,
-    IoCType,
-    Direction,
-    Victim,
-    Method,
-)
-
-# This horrible hack is only to stop sklearn from printing those warnings
-def warn(*args, **kwargs):
-    pass
-
-
-warnings.warn = warn
-
-
-class FlowMLDetection(IModule):
-    # Name: short name of the module. Do not use spaces
-    name = "Flow ML Detection"
-    description = (
-        "Train or test a Machine Learning model to detect malicious flows"
-    )
-    authors = ["Sebastian Garcia"]
-
-    def init(self):
-        # Subscribe to the channel
-        self.c1 = self.db.subscribe("new_flow")
-        self.channels = {"new_flow": self.c1}
-        self.fieldseparator = self.db.get_field_separator()
-        # Set the output queue of our database instance
-        # Read the configuration
-        self.read_configuration()
-        # Minum amount of new labels needed to start the train
-        self.minimum_labels_to_start_train = 50
-        # Minum amount of new labels needed to retrain
-        self.minimum_labels_to_retrain = 50
-        # The number of flows when last trained. Used internally only to know
-        # when to retrain
-        self.last_number_of_flows_when_trained = 0
-        # The scaler trained during training and to use during testing
-        self.scaler = StandardScaler()
-        self.model_path = "./modules/flowmldetection/model.bin"
-        self.scaler_path = "./modules/flowmldetection/scaler.bin"
-
-        # Initialize the training log file
-        self.training_log_path = "./modules/flowmldetection/training.log"
-        with open(self.training_log_path, "w") as log_file:
-            log_file.write("Training Log Initialized\n")
-
-    def read_configuration(self):
-        conf = ConfigParser()
-        self.mode = conf.get_ml_mode()
-        # This is the global label in the configuration,
-        # in case the flows do not have a label themselves
-        self.label = conf.label()
-
-    def write_to_training_log(self, message: str):
-        """
-        Write a message to the training log file.
-        """
-        try:
-            with open(self.training_log_path, "a") as log_file:
-                log_file.write(message + "\n")
-        except Exception as e:
-            self.print(f"Error writing to training log: {e}", 0, 1)
-
-    def train(self, sum_labeled_flows):
-        """
-        Train a model based on the flows we receive and the labels
-        """
-        try:
-            # Create X_flow with the current flows minus the label
-            X_flow = self.flows.drop("ground_truth_label", axis=1)
-            # Drop the detailed labels
-            X_flow = X_flow.drop("detailed_ground_truth_label", axis=1)
-            # Drop the module_labels
-            X_flow = X_flow.drop("module_labels", axis=1)
-            # Create y_flow with the label
-            y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label)
-
-            # Normalize this batch of data so far. This can get progressivle slow
-            X_flow = self.scaler.fit_transform(X_flow)
-
-            # Count the number of labels of each type in this epoc
-            epoch_label_counts = {
-                "Background": (y_flow == "Background").sum(),
-                "Malicious": (y_flow == "Malicious").sum(),
-                "Benign": (y_flow == "Benign").sum(),
-            }
-
-            # Train
-            try:
-                # Online incremental learning
-                self.clf.partial_fit(
-                    X_flow, y_flow, classes=["Background", "Malicious", "Benign"]
-                )
-            except Exception:
-                self.print("Error while calling clf.train()")
-                self.print(traceback.format_exc(), 0, 1)
-
-            # See score so far in training
-            score = self.clf.score(X_flow, y_flow)
-
-            #self.print(f"	Training Score: {score}", 1, 0)
-            #self.print(f'    Model Parameters: {self.clf.coef_}', 1, 0)
-
-            # Store the models on disk
-            self.store_model()
-
-            # Log training information
-            self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}")
-            #self.write_to_training_log(f"Model parameters: {self.clf.coef_}")
-        except Exception:
-            self.print("Error in train().", 0, 1)
-            self.print(traceback.format_exc(), 0, 1)
-            self.write_to_training_log("Error occurred during training.")
-
-    def process_features(self, dataset):
-        """
-        Discards some features of the dataset and can create new.
-        Clean the dataset
-        """
-        try:
-            # Discard some type of flows that dont have ports
-            to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""]
-            for proto in to_discard:
-                dataset = dataset[dataset.proto != proto]
-
-            # If te proto is in the list to delete and there is only one flow, then the dataset will be empty
-            if dataset.empty:
-                # DataFrame is empty now, so return empty
-                return dataset
-
-            # For now, discard these
-            to_drop = [
-                "appproto",
-                "daddr",
-                "saddr",
-                "starttime",
-                "type_",
-                "smac",
-                "dmac",
-                "history",
-                "uid",
-                "dir_",
-                "endtime",
-                "flow_source",
-            ]
-            for field in to_drop:
-                try:
-                    dataset = dataset.drop(field, axis=1)
-                except (ValueError, KeyError):
-                    pass
-
-            # When flows are read from Slips sqlite,
-            # the state is not transformed to 'Established' or
-            # 'Not Established', it is still 'S0' and others
-            # So transform here
-            dataset["state"] = dataset.apply(
-                lambda row: self.db.get_final_state_from_flags(
-                    row["state"], (row["spkts"] + row["dpkts"])
-                ),
-                axis=1,
-            )
-
-            # Convert state to categorical
-            dataset.state = dataset.state.str.replace(
-                r"(^.*Not Established.*$)", "0", regex=True
-            )
-            dataset.state = dataset.state.str.replace(
-                r"(^.*Established.*$)", "1", regex=True
-            )
-
-            # Convert categories to floats
-            dataset.state = dataset.state.astype("float64")
-
-            # Convert proto to categorical. For now we only have few states, so we can hardcode...
-            # We dont use the data to create categories because in testing mode
-            # we dont see all the protocols
-            # Also we dont store the Categorizer because the user can retrain
-            # with its own data.
-            dataset.proto = dataset.proto.str.lower()
-            dataset.proto = dataset.proto.str.replace(
-                r"(^.*tcp.*$)", "0", regex=True
-            )
-            dataset.proto = dataset.proto.str.replace(
-                r"(^.*udp.*$)", "1", regex=True
-            )
-            dataset.proto = dataset.proto.str.replace(
-                r"(^.*icmp.*$)", "2", regex=True
-            )
-            dataset.proto = dataset.proto.str.replace(
-                r"(^.*icmp-ipv6.*$)", "3", regex=True
-            )
-            dataset.proto = dataset.proto.str.replace(
-                r"(^.*arp.*$)", "4", regex=True
-            )
-
-            dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"]
-            dataset["pkts"] = dataset["spkts"] + dataset["dpkts"]
-
-            fields_to_convert_to_float = [
-                dataset.proto,
-                dataset.dport,
-                dataset.sport,
-                dataset.dur,
-                dataset.pkts,
-                dataset.spkts,
-                dataset.allbytes,
-                dataset.sbytes,
-                dataset.state,
-            ]
-            for field in fields_to_convert_to_float:
-                try:
-                    field = field.astype("float64")
-                except (ValueError, AttributeError):
-                    pass
-
-            return dataset
-        except Exception:
-            # Stop the timer
-            self.print("Error in process_features()")
-            self.print(traceback.format_exc(), 0, 1)
-
-    def process_training_flows(self, last_number_of_flows_when_trained):
-        """
-        Process only the new flows in the DB since the last training.
-        Store the pandas df in self.flows
-        """
-        try:
-            # Ensure the index is an integer
-            if last_number_of_flows_when_trained is None:
-                last_number_of_flows_when_trained = 0
-            else:
-                last_number_of_flows_when_trained = int(last_number_of_flows_when_trained)
-
-            # We get all the flows so far
-            flows = self.db.get_all_flows()
-            # Only process new flows since last training
-            new_flows = flows[last_number_of_flows_when_trained:]
-
-            # Check how many **different** labels are in the DB
-            labels = self.db.get_labels()
-            if len(labels) == 1:
-                # Insert fake flows for both classes if needed
-                new_flows.append(
-                    {
-                        "starttime": 1594417039.029793,
-                        "dur": "1.9424750804901123",
-                        "saddr": "10.7.10.101",
-                        "sport": "49733",
-                        "daddr": "40.70.224.145",
-                        "dport": "443",
-                        "proto": "tcp",
-                        "state": "SF",
-                        "spkts": 17,
-                        "dpkts": 27,
-                        "sbytes": 25517,
-                        "dbytes": 17247,
-                        "appproto": "ssl",
-                        "ground_truth_label": "Malicious",
-                        "module_labels": {
-                            "flowalerts-long-connection": "Malicious"
-                        },
-                    }
-                )
-                new_flows.append(
-                    {
-                        "starttime": 1382355032.706468,
-                        "dur": "10.896695",
-                        "saddr": "147.32.83.52",
-                        "sport": "47956",
-                        "daddr": "80.242.138.72",
-                        "dport": "80",
-                        "proto": "tcp",
-                        "state": "SF",
-                        "spkts": 1,
-                        "dpkts": 0,
-                        "sbytes": 100,
-                        "dbytes": 67596,
-                        "appproto": "http",
-                        "ground_truth_label": "Benign",
-                        "module_labels": {
-                            "flowalerts-long-connection": "Benign"
-                        },
-                    }
-                )
-
-            # Convert to pandas df
-            df_flows = pd.DataFrame(new_flows)
-
-            # Process features
-            df_flows = self.process_features(df_flows)
-
-            # Update the flow to the processed version
-            self.flows = df_flows
-        except Exception:
-            self.print("Error in process_flows()")
-            self.print(traceback.format_exc(), 0, 1)
-
-    def process_flow(self, flow_to_process: dict):
-        """
-        Process one flow. Only used during detection in testing
-        returns the pandas df with the processed flow
-        """
-        try:
-            # Convert the flow to a pandas dataframe
-            raw_flow = pd.DataFrame(flow_to_process, index=[0])
-            dflow = self.process_features(raw_flow)
-            if dflow.empty:
-                return None
-            # Update the flow to the processed version
-            return dflow
-        except Exception:
-            # Stop the timer
-            self.print("Error in process_flow()")
-            self.print(traceback.format_exc(), 0, 1)
-
-    def detect(self, x_flow) -> Optional[numpy.ndarray]:
-        """
-        Detects the given flow with the current model stored
-        and returns the predection array
-        """
-        try:
-            # clean the flow
-            fields_to_drop = [
-                "label",
-                "module_labels",
-                "uid",
-                "history",
-                "dir_",
-                "endtime",
-                "flow_source",
-                "ground_truth_label",
-                "detailed_ground_truth_label",
-            ]
-            # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes.
-            # Error
-            ''' [Flow ML Detection] Error in detect() while processing                                                                                                                                                                                                                         
-            dur proto  sport dport  state  pkts  spkts  dpkts  bytes  sbytes  dbytes  allbytes                                                                                                                                                                                    
-            0  63.822830     0  56119   981    0.0    15     15      0   8764    1887       0      1887                                                                                                                                                                                    
-            The feature names should match those that were passed during fit.                                                                                                                                                                                                              
-            Feature names unseen at fit time:                                                                                                                                                                                                                                              
-            - bytes     
-            '''
-
-            # IF we delete here the filed bytes the error is
-            # [Flow ML Detection] Error in detect() while processing 
-            # dur proto sport dport  state  pkts  spkts  dpkts  sbytes  dbytes allbytes                                                                                                                                                             
-            # 0  63.822830     0  56120   980    0.0    15     15      0    1887       0      1887                                                                                                                                                             
-            # The feature names should match those that were passed during fit.                                                                                                                                                                                
-            # Feature names must be in the same order as they were in fit.                                                                                                                                                                                     
-                                                                                                                                                                                                                                                 
-            for field in fields_to_drop:
-                try:
-                    x_flow = x_flow.drop(field, axis=1)
-                except (KeyError, ValueError):
-                    pass
-            # Scale the flow
-            x_flow: numpy.ndarray = self.scaler.transform(x_flow)
-            pred: numpy.ndarray = self.clf.predict(x_flow)
-            return pred
-        except Exception as e:
-            self.print(
-                f"Error in detect() while processing " f"\n{x_flow}\n{e}"
-            )
-            self.print(traceback.format_exc(), 0, 1)
-
-    def store_model(self):
-        """
-        Store the trained model on disk
-        """
-        self.print("Storing the trained model and scaler on disk.", 0, 2)
-        with open(self.model_path, "wb") as f:
-            data = pickle.dumps(self.clf)
-            f.write(data)
-        with open(self.scaler_path, "wb") as g:
-            data = pickle.dumps(self.scaler)
-            g.write(data)
-
-    def read_model(self):
-        """
-        Read the trained model from disk
-        """
-        try:
-            self.print("Reading the trained model from disk.", 0, 2)
-            with open(self.model_path, "rb") as f:
-                self.clf = pickle.load(f)
-            self.print("Reading the trained scaler from disk.", 0, 2)
-            with open(self.scaler_path, "rb") as g:
-                self.scaler = pickle.load(g)
-        except FileNotFoundError:
-            # If there is no model, create one empty
-            self.print(
-                "There was no model. " "Creating a new empty model.", 0, 2
-            )
-            self.clf = SGDClassifier(
-                warm_start=True, loss="hinge", penalty="l1"
-            )
-        except EOFError:
-            self.print(
-                "Error reading model from disk. "
-                "Creating a new empty model.",
-                0,
-                2,
-            )
-            self.clf = SGDClassifier(
-                warm_start=True, loss="hinge", penalty="l1"
-            )
-
-    def set_evidence_malicious_flow(self, flow: dict, twid: str):
-        confidence: float = 0.1
-        description = (
-            f"Flow with malicious characteristics by ML. Src IP"
-            f" {flow['saddr']}:{flow['sport']} to "
-            f"{flow['daddr']}:{flow['dport']}"
-        )
-        twid_number = int(twid.replace("timewindow", ""))
-        evidence: Evidence = Evidence(
-            evidence_type=EvidenceType.MALICIOUS_FLOW,
-            attacker=Attacker(
-                direction=Direction.SRC,
-                ioc_type=IoCType.IP,
-                value=flow["saddr"],
-            ),
-            victim=Victim(
-                direction=Direction.DST,
-                ioc_type=IoCType.IP,
-                value=flow["daddr"],
-            ),
-            threat_level=ThreatLevel.LOW,
-            confidence=confidence,
-            description=description,
-            profile=ProfileID(ip=flow["saddr"]),
-            timewindow=TimeWindow(twid_number),
-            uid=[flow["uid"]],
-            timestamp=flow["starttime"],
-            method=Method.AI,
-            src_port=flow["sport"],
-            dst_port=flow["dport"],
-        )
-
-        self.db.set_evidence(evidence)
-
-    def shutdown_gracefully(self):
-        # Confirm that the module is done processing
-        if self.mode == "train":
-            self.store_model()
-
-    def pre_main(self):
-        utils.drop_root_privs()
-        # Load the model
-        self.read_model()
-
-    def main(self):
-        if msg := self.get_msg("new_flow"):
-            # When a new flow arrives
-            msg = json.loads(msg["data"])
-            self.twid = msg["twid"]
-            self.profileid = msg["profileid"]
-            self.flow = msg["flow"]
-            # These following extra fields are expected in testing. update the original
-            # flow dict to have them
-            self.flow.update(
-                {
-                    "state": msg["interpreted_state"],
-                    "label": msg["label"],
-                    "module_labels": msg["module_labels"],
-                }
-            )
-
-            if self.mode == "train":
-                # We are training
-
-                # Is the amount in the DB of labels enough to retrain?
-                # Use labeled flows
-                labels = self.db.get_labels()
-                sum_labeled_flows = sum(i[1] for i in labels)
-
-                # The min labels to retrain is the min number of flows 
-                # we should have seen so far in this capture to start training
-                # This is so we dont _start_ training with only 1 flow
-
-                # Once we are over the start minimum, the second condition is 
-                # to force to retrain every a minimum_labels_to_retrain number
-                # of flows. So we dont retrain every 1 flow.
-                if (
-                    sum_labeled_flows >= self.minimum_labels_to_start_train
-                ):
-                    if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain):
-                        # So for example we retrain every 50 labels and only when
-                        # we have at least 50 labels
-                        self.print(
-                            f"Training the model with the last group of "
-                            f"flows and labels. Total flows: {sum_labeled_flows}."
-                        )
-                        # Process all flows in the DB and make them ready
-                        # for pandas
-                        self.process_training_flows(self.last_number_of_flows_when_trained)
-                        # Train an algorithm
-                        self.train(sum_labeled_flows)
-                        self.last_number_of_flows_when_trained = sum_labeled_flows
-
-            elif self.mode == "test":
-                # We are testing, which means using the model to detect
-                processed_flow = self.process_flow(self.flow)
-
-                # After processing the flow, it may happen that we
-                # delete icmp/arp/etc so the dataframe can be empty
-                if processed_flow is not None and not processed_flow.empty:
-                    # Predict
-                    pred: numpy.ndarray = self.detect(processed_flow)
-                    if not pred:
-                        # an error occurred
-                        return
-
-                    label = self.flow["label"]
-                    if label and label != "unknown" and label != pred[0]:
-                        # If the user specified a label in test mode,
-                        # and the label is diff from the prediction,
-                        # print in debug mode
-                        self.print(
-                            f"Predicted {pred[0]} for ground-truth label"
-                            f' {label}. Flow {self.flow["saddr"]}:'
-                            f'{self.flow["sport"]} ->'
-                            f' {self.flow["daddr"]}:'
-                            f'{self.flow["dport"]}/'
-                            f'{self.flow["proto"]}',
-                            0,
-                            3,
-                        )
-                    if pred[0] == "Malicious":
-                        # Generate an alert
-                        self.set_evidence_malicious_flow(self.flow, self.twid)
-                        self.print(
-                            f"Prediction {pred[0]} for label {label}"
-                            f' flow {self.flow["saddr"]}:'
-                            f'{self.flow["sport"]} -> '
-                            f'{self.flow["daddr"]}:'
-                            f'{self.flow["dport"]}/'
-                            f'{self.flow["proto"]}',
-                            0,
-                            2,
-                        )
+import matplotlib.pyplot as plt
+import sys
+import numpy as np
+
+def process_file(file_path):
+    # Initialize the counters for the values
+    FPR_values = []
+    FNR_values = []
+    TNR_values = []
+    TPR_values = []
+    F1_values = []
+    accuracy_values = []
+    precision_values = []
+    MCC_values = []
+    recall_values = []
+    
+    # Read the file and extract the data
+    with open(file_path, 'r') as file:
+        for line in file:
+            if "TP:" in line:
+                # Extract the values from the line
+                parts = line.split(',')
+                TP = int(parts[0].split(':')[1].strip())
+                TN = int(parts[1].split(':')[1].strip())
+                FP = int(parts[2].split(':')[1].strip())
+                FN = int(parts[3].split(':')[1].strip())
+
+                # Calculate metrics
+                FPR = FP / (FP + TN) if (FP + TN) != 0 else 0
+                FNR = FN / (FN + TP) if (FN + TP) != 0 else 0
+                TNR = TN / (TN + FP) if (TN + FP) != 0 else 0
+                TPR = TP / (TP + FN) if (TP + FN) != 0 else 0
+                Precision = TP / (TP + FP) if (TP + FP) != 0 else 0
+                Recall = TPR  # Recall is the same as TPR
+                F1 = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) != 0 else 0
+                Accuracy = (TP + TN) / (TP + TN + FP + FN)
+                MCC = ((TP * TN) - (FP * FN)) / np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) if ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) != 0 else 0
+                
+                # Append the values to the respective lists
+                FPR_values.append(FPR)
+                FNR_values.append(FNR)
+                TNR_values.append(TNR)
+                TPR_values.append(TPR)
+                F1_values.append(F1)
+                accuracy_values.append(Accuracy)
+                precision_values.append(Precision)
+                MCC_values.append(MCC)
+                recall_values.append(Recall)
+    
+    return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values
+
+def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values):
+    # Separate the values into two groups based on their proximity to 0 or 1
+    close_to_0 = {
+        'FPR': [], 'FNR': []
+    }
+    close_to_1 = {
+        'TNR': [], 'TPR': [], 'F1': [], 'accuracy': [], 'precision': [], 'MCC': [], 'recall': []
+    }
+    
+    # Categorize the metrics into two groups
+    for i in range(len(FPR_values)):
+        close_to_0['FPR'].append(FPR_values[i])
+        close_to_0['FNR'].append(FNR_values[i])
+        
+        close_to_1['TNR'].append(TNR_values[i])
+        close_to_1['TPR'].append(TPR_values[i])
+        close_to_1['F1'].append(F1_values[i])
+        close_to_1['accuracy'].append(accuracy_values[i])
+        close_to_1['precision'].append(precision_values[i])
+        close_to_1['MCC'].append(MCC_values[i])
+        close_to_1['recall'].append(recall_values[i])
+
+    # Plot metrics for values close to 0
+    plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True)
+    
+    # Plot metrics for values close to 1
+    plot_single_group(close_to_1, 'metrics_plot_close_to_1.png')
+
+    # Print the final values
+    print("\nFinal Metric Values:")
+    print(f"Final FPR: {FPR_values[-1]:.4f}")
+    print(f"Final FNR: {FNR_values[-1]:.4f}")
+    print(f"Final TNR: {TNR_values[-1]:.4f}")
+    print(f"Final TPR: {TPR_values[-1]:.4f}")
+    print(f"Final F1 Score: {F1_values[-1]:.4f}")
+    print(f"Final Accuracy: {accuracy_values[-1]:.4f}")
+    print(f"Final Precision: {precision_values[-1]:.4f}")
+    print(f"Final MCC: {MCC_values[-1]:.4f}")
+    print(f"Final Recall: {recall_values[-1]:.4f}")
+
+def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):
+    plt.figure(figsize=(12, 8))
+    
+    # Only plot the metrics that exist in the dictionary
+    if 'FPR' in metrics_dict:
+        plt.plot(metrics_dict['FPR'], label='False Positive Rate (FPR)', marker='o')
+    if 'FNR' in metrics_dict:
+        plt.plot(metrics_dict['FNR'], label='False Negative Rate (FNR)', marker='o')
+    if 'TNR' in metrics_dict:
+        plt.plot(metrics_dict['TNR'], label='True Negative Rate (TNR)', marker='o')
+    if 'TPR' in metrics_dict:
+        plt.plot(metrics_dict['TPR'], label='True Positive Rate (TPR)', marker='o')
+    if 'F1' in metrics_dict:
+        plt.plot(metrics_dict['F1'], label='F1 Score', marker='o')
+    if 'accuracy' in metrics_dict:
+        plt.plot(metrics_dict['accuracy'], label='Accuracy', marker='o')
+    if 'precision' in metrics_dict:
+        plt.plot(metrics_dict['precision'], label='Precision', marker='o')
+    if 'MCC' in metrics_dict:
+        plt.plot(metrics_dict['MCC'], label='Matthews Correlation Coefficient (MCC)', marker='o')
+    if 'recall' in metrics_dict:
+        plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o')
+
+    # Apply log scale by default
+    plt.yscale('log')
+
+    # If the plot is close to 0, set custom ticks
+    if is_close_to_0:
+        # Add more ticks between 0 and 1 (using a logarithmic scale)
+        plt.yticks([0.01, 0.1, 1, 10, 100], ['0.01', '0.1', '1', '10', '100'])
+
+    plt.xlabel('Index')
+    plt.ylabel('Metric Value')
+    plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})')
+    plt.legend()
+    
+    # Save the plot
+    plt.savefig(output_filename)
+    plt.close()
+
+def main():
+    if len(sys.argv) != 2:
+        print("Usage: python script.py <file_path>")
+        sys.exit(1)
+    
+    file_path = sys.argv[1]
+    
+    FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path)
+    plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values)
+
+if __name__ == "__main__":
+    main()

From 41961660beaf2d95a10273bdebceae4388fafd95 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:20:52 +0000
Subject: [PATCH 429/455] Fix plots

---
 .../plot_testing_performance.py               | 25 ++++++++++++-------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index 8f9e12cd86..69b8c96a8c 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -71,11 +71,11 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu
         close_to_1['MCC'].append(MCC_values[i])
         close_to_1['recall'].append(recall_values[i])
 
-    # Plot metrics for values close to 0
+    # Plot metrics for values close to 0 (linear scale)
     plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True)
     
-    # Plot metrics for values close to 1
-    plot_single_group(close_to_1, 'metrics_plot_close_to_1.png')
+    # Plot metrics for values close to 1 (log scale)
+    plot_single_group(close_to_1, 'metrics_plot_close_to_1.png', is_close_to_0=False)
 
     # Print the final values
     print("\nFinal Metric Values:")
@@ -112,14 +112,21 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):
     if 'recall' in metrics_dict:
         plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o')
 
-    # Apply log scale by default
-    plt.yscale('log')
+    # If the plot is close to 1, apply log scale
+    if not is_close_to_0:
+        plt.yscale('log')
 
-    # If the plot is close to 0, set custom ticks
+    # If the plot is close to 0, set dynamic Y-ticks based on the min/max values of the series
     if is_close_to_0:
-        # Manually set more Y-ticks for better visibility
-        plt.ylim(0.0001, 1)  # Set Y-axis limits between 0.0001 and 1
-        plt.yticks([0.0001, 0.001, 0.01, 0.1, 1], ['0.0001', '0.001', '0.01', '0.1', '1'])  # Adjust Y-ticks
+        min_val = min(min(metrics_dict['FPR']), min(metrics_dict['FNR']))
+        max_val = max(max(metrics_dict['FPR']), max(metrics_dict['FNR']))
+        
+        # Avoid log(0), so set the minimum limit a little higher than zero
+        if min_val == 0:
+            min_val = 1e-4  # Avoid zero values on the logarithmic scale
+
+        plt.ylim(min_val, max_val)  # Set Y-axis limits based on the data range
+        plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6))  # Set ticks logarithmically
 
     plt.xlabel('Index')
     plt.ylabel('Metric Value')

From dcd73e24811c9ebd2e4aadfea719b851736d72ab Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:23:02 +0000
Subject: [PATCH 430/455] Fix plots

---
 modules/flowmldetection/plot_testing_performance.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index 69b8c96a8c..de4ada38b3 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -123,10 +123,10 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):
         
         # Avoid log(0), so set the minimum limit a little higher than zero
         if min_val == 0:
-            min_val = 1e-4  # Avoid zero values on the logarithmic scale
+            min_val = 1e-8  # Avoid zero values on the logarithmic scale
 
         plt.ylim(min_val, max_val)  # Set Y-axis limits based on the data range
-        plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6))  # Set ticks logarithmically
+        plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=60))  # Set ticks logarithmically
 
     plt.xlabel('Index')
     plt.ylabel('Metric Value')

From 499fe19c08b34469a0f7826d614ceababc9d0849 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:25:58 +0000
Subject: [PATCH 431/455] Change plot names

---
 modules/flowmldetection/plot_testing_performance.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index de4ada38b3..1b4152c6eb 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -72,10 +72,10 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu
         close_to_1['recall'].append(recall_values[i])
 
     # Plot metrics for values close to 0 (linear scale)
-    plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True)
+    plot_single_group(close_to_0, 'performance_metrics_testing_close_to_0.png', is_close_to_0=True)
     
     # Plot metrics for values close to 1 (log scale)
-    plot_single_group(close_to_1, 'metrics_plot_close_to_1.png', is_close_to_0=False)
+    plot_single_group(close_to_1, 'performnace_metrics_teting_close_to_1.png', is_close_to_0=False)
 
     # Print the final values
     print("\nFinal Metric Values:")

From 8735210db117c14006ef382bf21051b90cd6c01c Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:26:09 +0000
Subject: [PATCH 432/455] Rename file

---
 .../flowmldetection/plot_train_performance.py | 130 +++++++-----------
 modules/flowmldetection/plot_train_score.py   |  87 ------------
 2 files changed, 53 insertions(+), 164 deletions(-)
 delete mode 100644 modules/flowmldetection/plot_train_score.py

diff --git a/modules/flowmldetection/plot_train_performance.py b/modules/flowmldetection/plot_train_performance.py
index 304f0f4ead..80e13e9515 100644
--- a/modules/flowmldetection/plot_train_performance.py
+++ b/modules/flowmldetection/plot_train_performance.py
@@ -4,108 +4,84 @@
 import sys
 import argparse
 import os
-import matplotlib.ticker as ticker
 
-def plot_log_data(file_path, experiment_number):
+def plot_log_data(file_path):
     # Read the log data from the file
     with open(file_path, 'r') as file:
         log_data = file.read()
 
-    # Regex pattern for the new log format
-    pattern = (
-        r"Total labels: ([\d\.]+), Background: (\d+). Benign: (\d+). Malicious: (\d+). Metrics: "
-        r"FPR=([\d\.]+), TNR=([\d\.]+), TPR=([\d\.]+), FNR=([\d\.]+), "
-        r"F1=([\d\.]+), Precision=([\d\.]+), Accuracy=([\d\.]+), MCC=([\d\.]+), Recall=([\d\.]+)\."
-    )
+    # Define regex pattern to extract relevant data from each line
+    pattern = r"Background: (\d+). Benign: (\d+). Malicious: (\d+). Total labels: (\d+\.\d+). Score: (\d+\.\d+)"
 
     # Parse the log file
     data = re.findall(pattern, log_data)
 
     # Convert data to a DataFrame
-    columns = [
-        "Total labels", "Background", "Benign", "Malicious",
-        "FPR", "TNR", "TPR", "FNR", "F1", "Precision", "Accuracy", "MCC", "Recall"
-    ]
-    df = pd.DataFrame(data, columns=columns)
+    df = pd.DataFrame(data, columns=["Background", "Benign", "Malicious", "Total labels", "Score"])
     df = df.astype({
-        "Total labels": float,
         "Background": int,
         "Benign": int,
         "Malicious": int,
-        "FPR": float,
-        "TNR": float,
-        "TPR": float,
-        "FNR": float,
-        "F1": float,
-        "Precision": float,
-        "Accuracy": float,
-        "MCC": float,
-        "Recall": float,
+        "Total labels": float,
+        "Score": float
     })
 
+    # Get the directory of the log file to store the plot in the same folder
     dir_name = os.path.dirname(file_path)
+    plot_file = os.path.join(dir_name, 'performance_metrics_training.png')
+
+    # Plotting the values
+    fig, ax1 = plt.subplots(figsize=(10, 6))
 
-    # --- Plot 1: Number of labels (linear scale, no total labels) ---
-    fig1, ax1 = plt.subplots(figsize=(10, 6))
-    ax1.plot(df.index, df["Background"], label="Background", color='black')
-    ax1.plot(df.index, df["Benign"], label="Benign", color='cyan')
-    ax1.plot(df.index, df["Malicious"], label="Malicious", color='magenta')
+    # Plotting Score on the left y-axis (with proper scaling from 0 to 1)
+    ax1.plot(df.index, df["Score"], label="Score", color='tab:blue')
     ax1.set_xlabel('Index')
-    ax1.set_ylabel('Label Counts')
-    ax1.set_title(f'Label Counts - Experiment {experiment_number}')
-    ax1.legend()
-    ax1.yaxis.set_major_locator(ticker.MaxNLocator(70))
-    ax1.xaxis.set_major_locator(ticker.MaxNLocator(50))
-    plt.tight_layout()
-    plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_labels.png'))
-
-    # --- Plot 2: FNR and FPR (log scale) ---
-    fig2, ax2 = plt.subplots(figsize=(10, 6))
-    ax2.plot(df.index, df["FNR"], label="FNR", color='red')
-    ax2.plot(df.index, df["FPR"], label="FPR", color='blue')
-    ax2.set_xlabel('Index')
-    ax2.set_ylabel('Rate')
-    ax2.set_yscale('log')
-    ax2.set_title(f'FNR and FPR - Experiment {experiment_number}')
-    ax2.legend()
-    ax2.yaxis.set_major_locator(ticker.MaxNLocator(100))
-    ax2.xaxis.set_major_locator(ticker.MaxNLocator(50))
-    plt.tight_layout()
-    plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_fnr_fpr.png'))
-
-    # --- Plot 3: Other metrics (log scale) ---
-    fig3, ax3 = plt.subplots(figsize=(12, 7))
-    metrics_rest = ["TNR", "TPR", "F1", "Precision", "Accuracy", "MCC", "Recall"]
-    colors_rest = [
-        'tab:blue', 'tab:green', 'tab:purple', 'tab:brown',
-        'tab:gray', 'tab:pink', 'tab:olive'
-    ]
-    for metric, color in zip(metrics_rest, colors_rest):
-        ax3.plot(df.index, df[metric], label=metric, color=color)
-    ax3.set_xlabel('Index')
-    ax3.set_ylabel('Metric Value')
-    ax3.set_yscale('log')
-    ax3.set_title(f'Performance Metrics (except FNR/FPR) - Experiment {experiment_number}')
-    ax3.legend()
-    ax3.yaxis.set_major_locator(ticker.MaxNLocator(50))
-    ax3.xaxis.set_major_locator(ticker.MaxNLocator(50))
-    plt.tight_layout()
-    plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_other_metrics.png'))
+    ax1.set_ylabel('Score', color='tab:blue')
+    ax1.set_ylim(0, 1)  # Set y-axis for Score from 0 to 1
+    ax1.tick_params(axis='y', labelcolor='tab:blue')
 
-    plt.show()
+    # Create the second y-axis for the Background, Benign, Malicious
+    ax2 = ax1.twinx()
+    ax2.plot(df.index, df["Background"], label="Background Labels", color='tab:green', linestyle='--')
+    ax2.plot(df.index, df["Benign"], label="Benign Labels", color='tab:orange', linestyle='--')
+    ax2.plot(df.index, df["Malicious"], label="Malicious Labels", color='tab:pink', linestyle='--')
+    ax2.set_ylabel('Background, Benign, Malicious Labels', color='tab:red')
+    
+    # Set appropriate scale for right y-axis based on the data
+    ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max())
+    ax2.tick_params(axis='y', labelcolor='tab:red')
+
+    # Annotating Total labels as text on the plot
+    for i, value in enumerate(df["Total labels"]):
+        ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom')
+
+    # Adding title and legend
+    plt.title('Training performance')
+    fig.tight_layout()
 
-    # --- Print final values in terminal ---
-    print("\nFinal values at last training step:")
-    for col in ["Total labels", "Background", "Benign", "Malicious",
-                "FPR", "TNR", "TPR", "FNR", "F1", "Precision", "Accuracy", "MCC", "Recall"]:
-        print(f"{col}: {df[col].iloc[-1]}")
+    # Move both legends further to the right
+    ax1.legend(loc='upper right', bbox_to_anchor=(1.26, 1), fontsize='small', ncol=1)
+    ax2.legend(loc='upper right', bbox_to_anchor=(1.4, 0.95), fontsize='small', ncol=1)
+
+    # Increase right margin for better readability of legend
+    plt.subplots_adjust(right=0.7)
+
+    # Save plot to the same folder as the log file
+    plt.savefig(plot_file)
+
+    # Display the plot
+    plt.show()
 
 def main():
+    # Parse command-line arguments
     parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.")
-    parser.add_argument('-f', '--file', metavar='log_file', type=str, required=True, help="Path to the log file")
-    parser.add_argument('-e', '--experiment', metavar='experiment_number', type=str, required=True, help="Experiment number to add to the filename")
+    parser.add_argument('log_file', metavar='log_file', type=str, help="Path to the log file")
+    
+    # Handle -h / --help
     args = parser.parse_args()
-    plot_log_data(args.file, args.experiment)
+
+    # Call the function to process the log file
+    plot_log_data(args.log_file)
 
 if __name__ == "__main__":
     main()
diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py
deleted file mode 100644
index 8437e968ac..0000000000
--- a/modules/flowmldetection/plot_train_score.py
+++ /dev/null
@@ -1,87 +0,0 @@
-import pandas as pd
-import matplotlib.pyplot as plt
-import re
-import sys
-import argparse
-import os
-
-def plot_log_data(file_path):
-    # Read the log data from the file
-    with open(file_path, 'r') as file:
-        log_data = file.read()
-
-    # Define regex pattern to extract relevant data from each line
-    pattern = r"Background: (\d+). Benign: (\d+). Malicious: (\d+). Total labels: (\d+\.\d+). Score: (\d+\.\d+)"
-
-    # Parse the log file
-    data = re.findall(pattern, log_data)
-
-    # Convert data to a DataFrame
-    df = pd.DataFrame(data, columns=["Background", "Benign", "Malicious", "Total labels", "Score"])
-    df = df.astype({
-        "Background": int,
-        "Benign": int,
-        "Malicious": int,
-        "Total labels": float,
-        "Score": float
-    })
-
-    # Get the directory of the log file to store the plot in the same folder
-    dir_name = os.path.dirname(file_path)
-    plot_file = os.path.join(dir_name, 'log_data_plot_with_two_scales.png')
-
-    # Plotting the values
-    fig, ax1 = plt.subplots(figsize=(10, 6))
-
-    # Plotting Score on the left y-axis (with proper scaling from 0 to 1)
-    ax1.plot(df.index, df["Score"], label="Score", color='tab:blue')
-    ax1.set_xlabel('Index')
-    ax1.set_ylabel('Score', color='tab:blue')
-    ax1.set_ylim(0, 1)  # Set y-axis for Score from 0 to 1
-    ax1.tick_params(axis='y', labelcolor='tab:blue')
-
-    # Create the second y-axis for the Background, Benign, Malicious
-    ax2 = ax1.twinx()
-    ax2.plot(df.index, df["Background"], label="Background Labels", color='tab:green', linestyle='--')
-    ax2.plot(df.index, df["Benign"], label="Benign Labels", color='tab:orange', linestyle='--')
-    ax2.plot(df.index, df["Malicious"], label="Malicious Labels", color='tab:pink', linestyle='--')
-    ax2.set_ylabel('Background, Benign, Malicious Labels', color='tab:red')
-    
-    # Set appropriate scale for right y-axis based on the data
-    ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max())
-    ax2.tick_params(axis='y', labelcolor='tab:red')
-
-    # Annotating Total labels as text on the plot
-    for i, value in enumerate(df["Total labels"]):
-        ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom')
-
-    # Adding title and legend
-    plt.title('Training performance')
-    fig.tight_layout()
-
-    # Move both legends further to the right
-    ax1.legend(loc='upper right', bbox_to_anchor=(1.26, 1), fontsize='small', ncol=1)
-    ax2.legend(loc='upper right', bbox_to_anchor=(1.4, 0.95), fontsize='small', ncol=1)
-
-    # Increase right margin for better readability of legend
-    plt.subplots_adjust(right=0.7)
-
-    # Save plot to the same folder as the log file
-    plt.savefig(plot_file)
-
-    # Display the plot
-    plt.show()
-
-def main():
-    # Parse command-line arguments
-    parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.")
-    parser.add_argument('log_file', metavar='log_file', type=str, help="Path to the log file")
-    
-    # Handle -h / --help
-    args = parser.parse_args()
-
-    # Call the function to process the log file
-    plot_log_data(args.log_file)
-
-if __name__ == "__main__":
-    main()

From a454bd7b3fca49d80a02d05783b2637b57101d9c Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:31:32 +0000
Subject: [PATCH 433/455] Recover good flowmldetection deleted by mistake

---
 modules/flowmldetection/flowmldetection.py | 709 ++++++++++++++++-----
 1 file changed, 566 insertions(+), 143 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 37f0761109..5e4e9aa462 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -1,143 +1,566 @@
-import matplotlib.pyplot as plt
-import sys
-import numpy as np
-
-def process_file(file_path):
-    # Initialize the counters for the values
-    FPR_values = []
-    FNR_values = []
-    TNR_values = []
-    TPR_values = []
-    F1_values = []
-    accuracy_values = []
-    precision_values = []
-    MCC_values = []
-    recall_values = []
-    
-    # Read the file and extract the data
-    with open(file_path, 'r') as file:
-        for line in file:
-            if "TP:" in line:
-                # Extract the values from the line
-                parts = line.split(',')
-                TP = int(parts[0].split(':')[1].strip())
-                TN = int(parts[1].split(':')[1].strip())
-                FP = int(parts[2].split(':')[1].strip())
-                FN = int(parts[3].split(':')[1].strip())
-
-                # Calculate metrics
-                FPR = FP / (FP + TN) if (FP + TN) != 0 else 0
-                FNR = FN / (FN + TP) if (FN + TP) != 0 else 0
-                TNR = TN / (TN + FP) if (TN + FP) != 0 else 0
-                TPR = TP / (TP + FN) if (TP + FN) != 0 else 0
-                Precision = TP / (TP + FP) if (TP + FP) != 0 else 0
-                Recall = TPR  # Recall is the same as TPR
-                F1 = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) != 0 else 0
-                Accuracy = (TP + TN) / (TP + TN + FP + FN)
-                MCC = ((TP * TN) - (FP * FN)) / np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) if ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) != 0 else 0
-                
-                # Append the values to the respective lists
-                FPR_values.append(FPR)
-                FNR_values.append(FNR)
-                TNR_values.append(TNR)
-                TPR_values.append(TPR)
-                F1_values.append(F1)
-                accuracy_values.append(Accuracy)
-                precision_values.append(Precision)
-                MCC_values.append(MCC)
-                recall_values.append(Recall)
-    
-    return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values
-
-def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values):
-    # Separate the values into two groups based on their proximity to 0 or 1
-    close_to_0 = {
-        'FPR': [], 'FNR': []
-    }
-    close_to_1 = {
-        'TNR': [], 'TPR': [], 'F1': [], 'accuracy': [], 'precision': [], 'MCC': [], 'recall': []
-    }
-    
-    # Categorize the metrics into two groups
-    for i in range(len(FPR_values)):
-        close_to_0['FPR'].append(FPR_values[i])
-        close_to_0['FNR'].append(FNR_values[i])
-        
-        close_to_1['TNR'].append(TNR_values[i])
-        close_to_1['TPR'].append(TPR_values[i])
-        close_to_1['F1'].append(F1_values[i])
-        close_to_1['accuracy'].append(accuracy_values[i])
-        close_to_1['precision'].append(precision_values[i])
-        close_to_1['MCC'].append(MCC_values[i])
-        close_to_1['recall'].append(recall_values[i])
-
-    # Plot metrics for values close to 0
-    plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True)
-    
-    # Plot metrics for values close to 1
-    plot_single_group(close_to_1, 'metrics_plot_close_to_1.png')
-
-    # Print the final values
-    print("\nFinal Metric Values:")
-    print(f"Final FPR: {FPR_values[-1]:.4f}")
-    print(f"Final FNR: {FNR_values[-1]:.4f}")
-    print(f"Final TNR: {TNR_values[-1]:.4f}")
-    print(f"Final TPR: {TPR_values[-1]:.4f}")
-    print(f"Final F1 Score: {F1_values[-1]:.4f}")
-    print(f"Final Accuracy: {accuracy_values[-1]:.4f}")
-    print(f"Final Precision: {precision_values[-1]:.4f}")
-    print(f"Final MCC: {MCC_values[-1]:.4f}")
-    print(f"Final Recall: {recall_values[-1]:.4f}")
-
-def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):
-    plt.figure(figsize=(12, 8))
-    
-    # Only plot the metrics that exist in the dictionary
-    if 'FPR' in metrics_dict:
-        plt.plot(metrics_dict['FPR'], label='False Positive Rate (FPR)', marker='o')
-    if 'FNR' in metrics_dict:
-        plt.plot(metrics_dict['FNR'], label='False Negative Rate (FNR)', marker='o')
-    if 'TNR' in metrics_dict:
-        plt.plot(metrics_dict['TNR'], label='True Negative Rate (TNR)', marker='o')
-    if 'TPR' in metrics_dict:
-        plt.plot(metrics_dict['TPR'], label='True Positive Rate (TPR)', marker='o')
-    if 'F1' in metrics_dict:
-        plt.plot(metrics_dict['F1'], label='F1 Score', marker='o')
-    if 'accuracy' in metrics_dict:
-        plt.plot(metrics_dict['accuracy'], label='Accuracy', marker='o')
-    if 'precision' in metrics_dict:
-        plt.plot(metrics_dict['precision'], label='Precision', marker='o')
-    if 'MCC' in metrics_dict:
-        plt.plot(metrics_dict['MCC'], label='Matthews Correlation Coefficient (MCC)', marker='o')
-    if 'recall' in metrics_dict:
-        plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o')
-
-    # Apply log scale by default
-    plt.yscale('log')
-
-    # If the plot is close to 0, set custom ticks
-    if is_close_to_0:
-        # Add more ticks between 0 and 1 (using a logarithmic scale)
-        plt.yticks([0.01, 0.1, 1, 10, 100], ['0.01', '0.1', '1', '10', '100'])
-
-    plt.xlabel('Index')
-    plt.ylabel('Metric Value')
-    plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})')
-    plt.legend()
-    
-    # Save the plot
-    plt.savefig(output_filename)
-    plt.close()
-
-def main():
-    if len(sys.argv) != 2:
-        print("Usage: python script.py <file_path>")
-        sys.exit(1)
-    
-    file_path = sys.argv[1]
-    
-    FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path)
-    plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values)
-
-if __name__ == "__main__":
-    main()
+# SPDX-FileCopyrightText: 2021 Sebastian Garcia <sebastian.garcia@agents.fel.cvut.cz>
+from typing import Optional
+
+# SPDX-License-Identifier: GPL-2.0-only
+import numpy
+from sklearn.linear_model import SGDClassifier
+from sklearn.preprocessing import StandardScaler
+import pickle
+import pandas as pd
+import json
+import traceback
+import warnings
+import os
+
+from slips_files.common.parsers.config_parser import ConfigParser
+from slips_files.common.slips_utils import utils
+from slips_files.common.abstracts.module import IModule
+from slips_files.core.structures.evidence import (
+    Evidence,
+    ProfileID,
+    TimeWindow,
+    Attacker,
+    ThreatLevel,
+    EvidenceType,
+    IoCType,
+    Direction,
+    Victim,
+    Method,
+)
+
+# This horrible hack is only to stop sklearn from printing those warnings
+def warn(*args, **kwargs):
+    pass
+
+
+warnings.warn = warn
+
+
+class FlowMLDetection(IModule):
+    # Name: short name of the module. Do not use spaces
+    name = "Flow ML Detection"
+    description = (
+        "Train or test a Machine Learning model to detect malicious flows"
+    )
+    authors = ["Sebastian Garcia"]
+
+    def init(self):
+        # Subscribe to the channel
+        self.c1 = self.db.subscribe("new_flow")
+        self.channels = {"new_flow": self.c1}
+        self.fieldseparator = self.db.get_field_separator()
+        # Set the output queue of our database instance
+        # Read the configuration
+        self.read_configuration()
+        # Minum amount of new labels needed to start the train
+        self.minimum_labels_to_start_train = 50
+        # Minum amount of new labels needed to retrain
+        self.minimum_labels_to_retrain = 50
+        # The number of flows when last trained. Used internally only to know
+        # when to retrain
+        self.last_number_of_flows_when_trained = 0
+        # The scaler trained during training and to use during testing
+        self.scaler = StandardScaler()
+        self.model_path = "./modules/flowmldetection/model.bin"
+        self.scaler_path = "./modules/flowmldetection/scaler.bin"
+
+        # Initialize the training log file
+        self.training_log_path = "./modules/flowmldetection/training.log"
+        with open(self.training_log_path, "w") as log_file:
+            log_file.write("Training Log Initialized\n")
+
+    def read_configuration(self):
+        conf = ConfigParser()
+        self.mode = conf.get_ml_mode()
+        # This is the global label in the configuration,
+        # in case the flows do not have a label themselves
+        self.label = conf.label()
+
+    def write_to_training_log(self, message: str):
+        """
+        Write a message to the training log file.
+        """
+        try:
+            with open(self.training_log_path, "a") as log_file:
+                log_file.write(message + "\n")
+        except Exception as e:
+            self.print(f"Error writing to training log: {e}", 0, 1)
+
+    def train(self, sum_labeled_flows):
+        """
+        Train a model based on the flows we receive and the labels
+        """
+        try:
+            # Create X_flow with the current flows minus the label
+            X_flow = self.flows.drop("ground_truth_label", axis=1)
+            # Drop the detailed labels
+            X_flow = X_flow.drop("detailed_ground_truth_label", axis=1)
+            # Drop the module_labels
+            X_flow = X_flow.drop("module_labels", axis=1)
+            # Create y_flow with the label
+            y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label)
+
+            # Normalize this batch of data so far. This can get progressivle slow
+            X_flow = self.scaler.fit_transform(X_flow)
+
+            # Count the number of labels of each type in this epoc
+            epoch_label_counts = {
+                "Background": (y_flow == "Background").sum(),
+                "Malicious": (y_flow == "Malicious").sum(),
+                "Benign": (y_flow == "Benign").sum(),
+            }
+
+            # Train
+            try:
+                # Online incremental learning
+                self.clf.partial_fit(
+                    X_flow, y_flow, classes=["Background", "Malicious", "Benign"]
+                )
+            except Exception:
+                self.print("Error while calling clf.train()")
+                self.print(traceback.format_exc(), 0, 1)
+
+            # See score so far in training
+            score = self.clf.score(X_flow, y_flow)
+
+            #self.print(f"	Training Score: {score}", 1, 0)
+            #self.print(f'    Model Parameters: {self.clf.coef_}', 1, 0)
+
+            # Store the models on disk
+            self.store_model()
+
+            # Log training information
+            self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}")
+            #self.write_to_training_log(f"Model parameters: {self.clf.coef_}")
+        except Exception:
+            self.print("Error in train().", 0, 1)
+            self.print(traceback.format_exc(), 0, 1)
+            self.write_to_training_log("Error occurred during training.")
+
+    def process_features(self, dataset):
+        """
+        Discards some features of the dataset and can create new.
+        Clean the dataset
+        """
+        try:
+            # Discard some type of flows that dont have ports
+            to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""]
+            for proto in to_discard:
+                dataset = dataset[dataset.proto != proto]
+
+            # If te proto is in the list to delete and there is only one flow, then the dataset will be empty
+            if dataset.empty:
+                # DataFrame is empty now, so return empty
+                return dataset
+
+            # For now, discard these
+            to_drop = [
+                "appproto",
+                "daddr",
+                "saddr",
+                "starttime",
+                "type_",
+                "smac",
+                "dmac",
+                "history",
+                "uid",
+                "dir_",
+                "endtime",
+                "flow_source",
+            ]
+            for field in to_drop:
+                try:
+                    dataset = dataset.drop(field, axis=1)
+                except (ValueError, KeyError):
+                    pass
+
+            # When flows are read from Slips sqlite,
+            # the state is not transformed to 'Established' or
+            # 'Not Established', it is still 'S0' and others
+            # So transform here
+            dataset["state"] = dataset.apply(
+                lambda row: self.db.get_final_state_from_flags(
+                    row["state"], (row["spkts"] + row["dpkts"])
+                ),
+                axis=1,
+            )
+
+            # Convert state to categorical
+            dataset.state = dataset.state.str.replace(
+                r"(^.*Not Established.*$)", "0", regex=True
+            )
+            dataset.state = dataset.state.str.replace(
+                r"(^.*Established.*$)", "1", regex=True
+            )
+
+            # Convert categories to floats
+            dataset.state = dataset.state.astype("float64")
+
+            # Convert proto to categorical. For now we only have few states, so we can hardcode...
+            # We dont use the data to create categories because in testing mode
+            # we dont see all the protocols
+            # Also we dont store the Categorizer because the user can retrain
+            # with its own data.
+            dataset.proto = dataset.proto.str.lower()
+            dataset.proto = dataset.proto.str.replace(
+                r"(^.*tcp.*$)", "0", regex=True
+            )
+            dataset.proto = dataset.proto.str.replace(
+                r"(^.*udp.*$)", "1", regex=True
+            )
+            dataset.proto = dataset.proto.str.replace(
+                r"(^.*icmp.*$)", "2", regex=True
+            )
+            dataset.proto = dataset.proto.str.replace(
+                r"(^.*icmp-ipv6.*$)", "3", regex=True
+            )
+            dataset.proto = dataset.proto.str.replace(
+                r"(^.*arp.*$)", "4", regex=True
+            )
+
+            dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"]
+            dataset["pkts"] = dataset["spkts"] + dataset["dpkts"]
+
+            fields_to_convert_to_float = [
+                dataset.proto,
+                dataset.dport,
+                dataset.sport,
+                dataset.dur,
+                dataset.pkts,
+                dataset.spkts,
+                dataset.allbytes,
+                dataset.sbytes,
+                dataset.state,
+            ]
+            for field in fields_to_convert_to_float:
+                try:
+                    field = field.astype("float64")
+                except (ValueError, AttributeError):
+                    pass
+
+            return dataset
+        except Exception:
+            # Stop the timer
+            self.print("Error in process_features()")
+            self.print(traceback.format_exc(), 0, 1)
+
+    def process_training_flows(self, last_number_of_flows_when_trained):
+        """
+        Process only the new flows in the DB since the last training.
+        Store the pandas df in self.flows
+        """
+        try:
+            # Ensure the index is an integer
+            if last_number_of_flows_when_trained is None:
+                last_number_of_flows_when_trained = 0
+            else:
+                last_number_of_flows_when_trained = int(last_number_of_flows_when_trained)
+
+            # We get all the flows so far
+            flows = self.db.get_all_flows()
+            # Only process new flows since last training
+            new_flows = flows[last_number_of_flows_when_trained:]
+
+            # Check how many **different** labels are in the DB
+            labels = self.db.get_labels()
+            if len(labels) == 1:
+                # Insert fake flows for both classes if needed
+                new_flows.append(
+                    {
+                        "starttime": 1594417039.029793,
+                        "dur": "1.9424750804901123",
+                        "saddr": "10.7.10.101",
+                        "sport": "49733",
+                        "daddr": "40.70.224.145",
+                        "dport": "443",
+                        "proto": "tcp",
+                        "state": "SF",
+                        "spkts": 17,
+                        "dpkts": 27,
+                        "sbytes": 25517,
+                        "dbytes": 17247,
+                        "appproto": "ssl",
+                        "ground_truth_label": "Malicious",
+                        "module_labels": {
+                            "flowalerts-long-connection": "Malicious"
+                        },
+                    }
+                )
+                new_flows.append(
+                    {
+                        "starttime": 1382355032.706468,
+                        "dur": "10.896695",
+                        "saddr": "147.32.83.52",
+                        "sport": "47956",
+                        "daddr": "80.242.138.72",
+                        "dport": "80",
+                        "proto": "tcp",
+                        "state": "SF",
+                        "spkts": 1,
+                        "dpkts": 0,
+                        "sbytes": 100,
+                        "dbytes": 67596,
+                        "appproto": "http",
+                        "ground_truth_label": "Benign",
+                        "module_labels": {
+                            "flowalerts-long-connection": "Benign"
+                        },
+                    }
+                )
+
+            # Convert to pandas df
+            df_flows = pd.DataFrame(new_flows)
+
+            # Process features
+            df_flows = self.process_features(df_flows)
+
+            # Update the flow to the processed version
+            self.flows = df_flows
+        except Exception:
+            self.print("Error in process_flows()")
+            self.print(traceback.format_exc(), 0, 1)
+
+    def process_flow(self, flow_to_process: dict):
+        """
+        Process one flow. Only used during detection in testing
+        returns the pandas df with the processed flow
+        """
+        try:
+            # Convert the flow to a pandas dataframe
+            raw_flow = pd.DataFrame(flow_to_process, index=[0])
+            dflow = self.process_features(raw_flow)
+            if dflow.empty:
+                return None
+            # Update the flow to the processed version
+            return dflow
+        except Exception:
+            # Stop the timer
+            self.print("Error in process_flow()")
+            self.print(traceback.format_exc(), 0, 1)
+
+    def detect(self, x_flow) -> Optional[numpy.ndarray]:
+        """
+        Detects the given flow with the current model stored
+        and returns the predection array
+        """
+        try:
+            # clean the flow
+            fields_to_drop = [
+                "label",
+                "module_labels",
+                "uid",
+                "history",
+                "dir_",
+                "endtime",
+                "flow_source",
+                "ground_truth_label",
+                "detailed_ground_truth_label",
+            ]
+            # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes.
+            # Error
+            ''' [Flow ML Detection] Error in detect() while processing                                                                                                                                                                                                                         
+            dur proto  sport dport  state  pkts  spkts  dpkts  bytes  sbytes  dbytes  allbytes                                                                                                                                                                                    
+            0  63.822830     0  56119   981    0.0    15     15      0   8764    1887       0      1887                                                                                                                                                                                    
+            The feature names should match those that were passed during fit.                                                                                                                                                                                                              
+            Feature names unseen at fit time:                                                                                                                                                                                                                                              
+            - bytes     
+            '''
+
+            # IF we delete here the filed bytes the error is
+            # [Flow ML Detection] Error in detect() while processing 
+            # dur proto sport dport  state  pkts  spkts  dpkts  sbytes  dbytes allbytes                                                                                                                                                             
+            # 0  63.822830     0  56120   980    0.0    15     15      0    1887       0      1887                                                                                                                                                             
+            # The feature names should match those that were passed during fit.                                                                                                                                                                                
+            # Feature names must be in the same order as they were in fit.                                                                                                                                                                                     
+                                                                                                                                                                                                                                                 
+            for field in fields_to_drop:
+                try:
+                    x_flow = x_flow.drop(field, axis=1)
+                except (KeyError, ValueError):
+                    pass
+            # Scale the flow
+            x_flow: numpy.ndarray = self.scaler.transform(x_flow)
+            pred: numpy.ndarray = self.clf.predict(x_flow)
+            return pred
+        except Exception as e:
+            self.print(
+                f"Error in detect() while processing " f"\n{x_flow}\n{e}"
+            )
+            self.print(traceback.format_exc(), 0, 1)
+
+    def store_model(self):
+        """
+        Store the trained model on disk
+        """
+        self.print("Storing the trained model and scaler on disk.", 0, 2)
+        with open(self.model_path, "wb") as f:
+            data = pickle.dumps(self.clf)
+            f.write(data)
+        with open(self.scaler_path, "wb") as g:
+            data = pickle.dumps(self.scaler)
+            g.write(data)
+
+    def read_model(self):
+        """
+        Read the trained model from disk
+        """
+        try:
+            self.print("Reading the trained model from disk.", 0, 2)
+            with open(self.model_path, "rb") as f:
+                self.clf = pickle.load(f)
+            self.print("Reading the trained scaler from disk.", 0, 2)
+            with open(self.scaler_path, "rb") as g:
+                self.scaler = pickle.load(g)
+        except FileNotFoundError:
+            # If there is no model, create one empty
+            self.print(
+                "There was no model. " "Creating a new empty model.", 0, 2
+            )
+            self.clf = SGDClassifier(
+                warm_start=True, loss="hinge", penalty="l1"
+            )
+        except EOFError:
+            self.print(
+                "Error reading model from disk. "
+                "Creating a new empty model.",
+                0,
+                2,
+            )
+            self.clf = SGDClassifier(
+                warm_start=True, loss="hinge", penalty="l1"
+            )
+
+    def set_evidence_malicious_flow(self, flow: dict, twid: str):
+        confidence: float = 0.1
+        description = (
+            f"Flow with malicious characteristics by ML. Src IP"
+            f" {flow['saddr']}:{flow['sport']} to "
+            f"{flow['daddr']}:{flow['dport']}"
+        )
+        twid_number = int(twid.replace("timewindow", ""))
+        evidence: Evidence = Evidence(
+            evidence_type=EvidenceType.MALICIOUS_FLOW,
+            attacker=Attacker(
+                direction=Direction.SRC,
+                ioc_type=IoCType.IP,
+                value=flow["saddr"],
+            ),
+            victim=Victim(
+                direction=Direction.DST,
+                ioc_type=IoCType.IP,
+                value=flow["daddr"],
+            ),
+            threat_level=ThreatLevel.LOW,
+            confidence=confidence,
+            description=description,
+            profile=ProfileID(ip=flow["saddr"]),
+            timewindow=TimeWindow(twid_number),
+            uid=[flow["uid"]],
+            timestamp=flow["starttime"],
+            method=Method.AI,
+            src_port=flow["sport"],
+            dst_port=flow["dport"],
+        )
+
+        self.db.set_evidence(evidence)
+
+    def shutdown_gracefully(self):
+        # Confirm that the module is done processing
+        if self.mode == "train":
+            self.store_model()
+
+    def pre_main(self):
+        utils.drop_root_privs()
+        # Load the model
+        self.read_model()
+
+    def main(self):
+        if msg := self.get_msg("new_flow"):
+            # When a new flow arrives
+            msg = json.loads(msg["data"])
+            self.twid = msg["twid"]
+            self.profileid = msg["profileid"]
+            self.flow = msg["flow"]
+            # These following extra fields are expected in testing. update the original
+            # flow dict to have them
+            self.flow.update(
+                {
+                    "state": msg["interpreted_state"],
+                    "label": msg["label"],
+                    "module_labels": msg["module_labels"],
+                }
+            )
+
+            if self.mode == "train":
+                # We are training
+
+                # Is the amount in the DB of labels enough to retrain?
+                # Use labeled flows
+                labels = self.db.get_labels()
+                sum_labeled_flows = sum(i[1] for i in labels)
+
+                # The min labels to retrain is the min number of flows 
+                # we should have seen so far in this capture to start training
+                # This is so we dont _start_ training with only 1 flow
+
+                # Once we are over the start minimum, the second condition is 
+                # to force to retrain every a minimum_labels_to_retrain number
+                # of flows. So we dont retrain every 1 flow.
+                if (
+                    sum_labeled_flows >= self.minimum_labels_to_start_train
+                ):
+                    if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain):
+                        # So for example we retrain every 50 labels and only when
+                        # we have at least 50 labels
+                        self.print(
+                            f"Training the model with the last group of "
+                            f"flows and labels. Total flows: {sum_labeled_flows}."
+                        )
+                        # Process all flows in the DB and make them ready
+                        # for pandas
+                        self.process_training_flows(self.last_number_of_flows_when_trained)
+                        # Train an algorithm
+                        self.train(sum_labeled_flows)
+                        self.last_number_of_flows_when_trained = sum_labeled_flows
+
+            elif self.mode == "test":
+                # We are testing, which means using the model to detect
+                processed_flow = self.process_flow(self.flow)
+
+                # After processing the flow, it may happen that we
+                # delete icmp/arp/etc so the dataframe can be empty
+                if processed_flow is not None and not processed_flow.empty:
+                    # Predict
+                    pred: numpy.ndarray = self.detect(processed_flow)
+                    if not pred:
+                        # an error occurred
+                        return
+
+                    label = self.flow["label"]
+                    if label and label != "unknown" and label != pred[0]:
+                        # If the user specified a label in test mode,
+                        # and the label is diff from the prediction,
+                        # print in debug mode
+                        self.print(
+                            f"Predicted {pred[0]} for ground-truth label"
+                            f' {label}. Flow {self.flow["saddr"]}:'
+                            f'{self.flow["sport"]} ->'
+                            f' {self.flow["daddr"]}:'
+                            f'{self.flow["dport"]}/'
+                            f'{self.flow["proto"]}',
+                            0,
+                            3,
+                        )
+                    if pred[0] == "Malicious":
+                        # Generate an alert
+                        self.set_evidence_malicious_flow(self.flow, self.twid)
+                        self.print(
+                            f"Prediction {pred[0]} for label {label}"
+                            f' flow {self.flow["saddr"]}:'
+                            f'{self.flow["sport"]} -> '
+                            f'{self.flow["daddr"]}:'
+                            f'{self.flow["dport"]}/'
+                            f'{self.flow["proto"]}',
+                            0,
+                            2,
+                        )
\ No newline at end of file

From 3da80024964515c0df1aee115d68a9c73cba1c7e Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:31:43 +0000
Subject: [PATCH 434/455] Fix plot test

---
 modules/flowmldetection/plot_testing_performance.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index 1b4152c6eb..977a68b2d5 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -75,7 +75,7 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu
     plot_single_group(close_to_0, 'performance_metrics_testing_close_to_0.png', is_close_to_0=True)
     
     # Plot metrics for values close to 1 (log scale)
-    plot_single_group(close_to_1, 'performnace_metrics_teting_close_to_1.png', is_close_to_0=False)
+    plot_single_group(close_to_1, 'performance_metrics_testing_close_to_1.png', is_close_to_0=False)
 
     # Print the final values
     print("\nFinal Metric Values:")
@@ -123,10 +123,10 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):
         
         # Avoid log(0), so set the minimum limit a little higher than zero
         if min_val == 0:
-            min_val = 1e-8  # Avoid zero values on the logarithmic scale
+            min_val = 1e-4  # Avoid zero values on the logarithmic scale
 
         plt.ylim(min_val, max_val)  # Set Y-axis limits based on the data range
-        plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=60))  # Set ticks logarithmically
+        plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6))  # Set ticks logarithmically
 
     plt.xlabel('Index')
     plt.ylabel('Metric Value')

From d4e2666af9c2454ebbffd2dbc7f338c99bfc63a5 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 18:50:33 +0000
Subject: [PATCH 435/455] Add testing code to evaluate performance. It is
 optional with a variable

---
 modules/flowmldetection/flowmldetection.py | 60 +++++++++++++++-------
 1 file changed, 42 insertions(+), 18 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 5e4e9aa462..b17a1baaf0 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -526,36 +526,21 @@ def main(self):
             elif self.mode == "test":
                 # We are testing, which means using the model to detect
                 processed_flow = self.process_flow(self.flow)
-
                 # After processing the flow, it may happen that we
                 # delete icmp/arp/etc so the dataframe can be empty
                 if processed_flow is not None and not processed_flow.empty:
+                    original_label = processed_flow["ground_truth_label"].iloc[0]
                     # Predict
                     pred: numpy.ndarray = self.detect(processed_flow)
                     if not pred:
                         # an error occurred
                         return
 
-                    label = self.flow["label"]
-                    if label and label != "unknown" and label != pred[0]:
-                        # If the user specified a label in test mode,
-                        # and the label is diff from the prediction,
-                        # print in debug mode
-                        self.print(
-                            f"Predicted {pred[0]} for ground-truth label"
-                            f' {label}. Flow {self.flow["saddr"]}:'
-                            f'{self.flow["sport"]} ->'
-                            f' {self.flow["daddr"]}:'
-                            f'{self.flow["dport"]}/'
-                            f'{self.flow["proto"]}',
-                            0,
-                            3,
-                        )
                     if pred[0] == "Malicious":
                         # Generate an alert
                         self.set_evidence_malicious_flow(self.flow, self.twid)
                         self.print(
-                            f"Prediction {pred[0]} for label {label}"
+                            f"Prediction {pred[0]} for label {original_label}"
                             f' flow {self.flow["saddr"]}:'
                             f'{self.flow["sport"]} -> '
                             f'{self.flow["daddr"]}:'
@@ -563,4 +548,43 @@ def main(self):
                             f'{self.flow["proto"]}',
                             0,
                             2,
-                        )
\ No newline at end of file
+                        )
+
+                    # So you can disable this code easily. Since it is used only for evaluating a testing
+                    log_testing_data = True
+                    if log_testing_data:
+                        # Initialize counters if not already done
+                        if not hasattr(self, 'tp'):
+                            self.tp = 0
+                        if not hasattr(self, 'tn'):
+                            self.tn = 0
+                        if not hasattr(self, 'fp'):
+                            self.fp = 0
+                        if not hasattr(self, 'fn'):
+                            self.fn = 0
+
+
+                        # Update counters based on predictions and labels
+                        if pred[0] == "Malicious" and original_label == "Malicious":
+                            self.tp += 1
+                        elif pred[0] == "Benign" and original_label == "Benign":
+                            self.tn += 1
+                        elif pred[0] == "Malicious" and original_label == "Benign":
+                            self.fp += 1
+                        elif pred[0] == "Benign" and original_label == "Malicious":
+                            self.fn += 1
+
+                        testing_log_path = "./modules/flowmldetection/testing_performance.log"
+                        try:
+                            with open(testing_log_path, "a") as log_file:
+                                log_file.write("Testing Performance Log Initialized\n")
+                                # Log the testing performance metrics
+                                log_file.write(f"TP: {self.tp}, TN: {self.tn}, FP: {self.fp}, FN: {self.fn}\n")
+
+                                # Log the original flow for false positives and false negatives
+                                if pred[0] == "Malicious" and original_label == "Benign":
+                                    log_file.write(f"False Positive Flow: {self.flow}\n")
+                                elif pred[0] == "Benign" and original_label == "Malicious":
+                                    log_file.write(f"False Negative Flow: {self.flow}\n")
+                        except Exception as e:
+                            self.print(f"Error initializing testing performance log: {e}", 0, 1)
\ No newline at end of file

From 5d2d84a80cf2a77f160bc5cb16a46ae9700ff9a0 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 19:04:00 +0000
Subject: [PATCH 436/455] Fix plots

---
 .../plot_testing_performance.py               | 30 +++++++++++--------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index 977a68b2d5..6865415cdf 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -1,6 +1,7 @@
 import matplotlib.pyplot as plt
 import sys
 import numpy as np
+import argparse
 
 def process_file(file_path):
     # Initialize the counters for the values
@@ -49,7 +50,7 @@ def process_file(file_path):
     
     return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values
 
-def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values):
+def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, experiment_number):
     # Separate the values into two groups based on their proximity to 0 or 1
     close_to_0 = {
         'FPR': [], 'FNR': []
@@ -72,13 +73,13 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu
         close_to_1['recall'].append(recall_values[i])
 
     # Plot metrics for values close to 0 (linear scale)
-    plot_single_group(close_to_0, 'performance_metrics_testing_close_to_0.png', is_close_to_0=True)
+    plot_single_group(close_to_0, f'performance_metrics_testing_close_to_0_experiment_{experiment_number}.png', experiment_number, is_close_to_0=True)
     
     # Plot metrics for values close to 1 (log scale)
-    plot_single_group(close_to_1, 'performance_metrics_testing_close_to_1.png', is_close_to_0=False)
+    plot_single_group(close_to_1, f'performance_metrics_testing_close_to_1_experiment_{experiment_number}.png', experiment_number, is_close_to_0=False)
 
     # Print the final values
-    print("\nFinal Metric Values:")
+    print("\nFinal Metric Values for Experiment", experiment_number)
     print(f"Final FPR: {FPR_values[-1]:.4f}")
     print(f"Final FNR: {FNR_values[-1]:.4f}")
     print(f"Final TNR: {TNR_values[-1]:.4f}")
@@ -89,7 +90,7 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu
     print(f"Final MCC: {MCC_values[-1]:.4f}")
     print(f"Final Recall: {recall_values[-1]:.4f}")
 
-def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):
+def plot_single_group(metrics_dict, output_filename, experiment_number, is_close_to_0=False):
     plt.figure(figsize=(12, 8))
     
     # Only plot the metrics that exist in the dictionary
@@ -126,11 +127,12 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):
             min_val = 1e-4  # Avoid zero values on the logarithmic scale
 
         plt.ylim(min_val, max_val)  # Set Y-axis limits based on the data range
-        plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6))  # Set ticks logarithmically
+        plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=60))  # Set ticks logarithmically
 
+    # Add the experiment number to the plot title
     plt.xlabel('Index')
     plt.ylabel('Metric Value')
-    plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})')
+    plt.title(f'Experiment {experiment_number} - Evaluation Metrics Over Time')
     plt.legend()
     
     # Save the plot
@@ -138,14 +140,18 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):
     plt.close()
 
 def main():
-    if len(sys.argv) != 2:
-        print("Usage: python script.py <file_path>")
-        sys.exit(1)
+    # Set up argument parsing
+    parser = argparse.ArgumentParser(description='Plot testing performance metrics.')
+    parser.add_argument('-f', '--file', type=str, required=True, help='Path to the testing performance log file')
+    parser.add_argument('-e', '--experiment', type=str, required=True, help='Experiment number')
+
+    args = parser.parse_args()
     
-    file_path = sys.argv[1]
+    file_path = args.file
+    experiment_number = args.experiment
     
     FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path)
-    plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values)
+    plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, experiment_number)
 
 if __name__ == "__main__":
     main()

From e400c0354f3c7ce82739100a48e394c026b02514 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 19:14:51 +0000
Subject: [PATCH 437/455] Fix train plot

---
 .../flowmldetection/plot_train_performance.py | 22 ++++++++++---------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/modules/flowmldetection/plot_train_performance.py b/modules/flowmldetection/plot_train_performance.py
index 80e13e9515..244df13d28 100644
--- a/modules/flowmldetection/plot_train_performance.py
+++ b/modules/flowmldetection/plot_train_performance.py
@@ -5,7 +5,7 @@
 import argparse
 import os
 
-def plot_log_data(file_path):
+def plot_log_data(file_path, experiment_number):
     # Read the log data from the file
     with open(file_path, 'r') as file:
         log_data = file.read()
@@ -28,7 +28,8 @@ def plot_log_data(file_path):
 
     # Get the directory of the log file to store the plot in the same folder
     dir_name = os.path.dirname(file_path)
-    plot_file = os.path.join(dir_name, 'performance_metrics_training.png')
+    # Append experiment number to the filename
+    plot_file = os.path.join(dir_name, f'performance_metrics_training_{experiment_number}.png')
 
     # Plotting the values
     fig, ax1 = plt.subplots(figsize=(10, 6))
@@ -55,18 +56,18 @@ def plot_log_data(file_path):
     for i, value in enumerate(df["Total labels"]):
         ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom')
 
-    # Adding title and legend
-    plt.title('Training performance')
+    # Adding title and legend with experiment number in title
+    plt.title(f'Training performance - Experiment {experiment_number}')
     fig.tight_layout()
 
     # Move both legends further to the right
-    ax1.legend(loc='upper right', bbox_to_anchor=(1.26, 1), fontsize='small', ncol=1)
-    ax2.legend(loc='upper right', bbox_to_anchor=(1.4, 0.95), fontsize='small', ncol=1)
+    ax1.legend(loc='upper right', bbox_to_anchor=(1.3, 1), fontsize='small', ncol=1)
+    ax2.legend(loc='upper right', bbox_to_anchor=(1.3, 0.85), fontsize='small', ncol=1)
 
     # Increase right margin for better readability of legend
-    plt.subplots_adjust(right=0.7)
+    plt.subplots_adjust(right=0.75)
 
-    # Save plot to the same folder as the log file
+    # Save plot to the same folder as the log file with experiment number in filename
     plt.savefig(plot_file)
 
     # Display the plot
@@ -75,13 +76,14 @@ def plot_log_data(file_path):
 def main():
     # Parse command-line arguments
     parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.")
-    parser.add_argument('log_file', metavar='log_file', type=str, help="Path to the log file")
+    parser.add_argument('-f', '--file', metavar='log_file', type=str, required=True, help="Path to the log file")
+    parser.add_argument('-e', '--experiment', metavar='experiment_number', type=str, required=True, help="Experiment number to add to the filename")
     
     # Handle -h / --help
     args = parser.parse_args()
 
     # Call the function to process the log file
-    plot_log_data(args.log_file)
+    plot_log_data(args.file, args.experiment)
 
 if __name__ == "__main__":
     main()

From 8983a7f529e987e11dc915513179f0b1620e3f64 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 21:14:48 +0000
Subject: [PATCH 438/455] Fix plots

---
 .../flowmldetection/plot_train_performance.py | 122 ++++++++++--------
 1 file changed, 71 insertions(+), 51 deletions(-)

diff --git a/modules/flowmldetection/plot_train_performance.py b/modules/flowmldetection/plot_train_performance.py
index 244df13d28..5212dfeeaf 100644
--- a/modules/flowmldetection/plot_train_performance.py
+++ b/modules/flowmldetection/plot_train_performance.py
@@ -4,85 +4,105 @@
 import sys
 import argparse
 import os
+import matplotlib.ticker as ticker
 
 def plot_log_data(file_path, experiment_number):
     # Read the log data from the file
     with open(file_path, 'r') as file:
         log_data = file.read()
 
-    # Define regex pattern to extract relevant data from each line
-    pattern = r"Background: (\d+). Benign: (\d+). Malicious: (\d+). Total labels: (\d+\.\d+). Score: (\d+\.\d+)"
+    # Regex pattern for the new log format
+    pattern = (
+        r"Total labels: ([\d\.]+), Background: (\d+). Benign: (\d+). Malicious: (\d+). Metrics: "
+        r"FPR=([\d\.]+), TNR=([\d\.]+), TPR=([\d\.]+), FNR=([\d\.]+), "
+        r"F1=([\d\.]+), Precision=([\d\.]+), Accuracy=([\d\.]+), MCC=([\d\.]+), Recall=([\d\.]+)\."
+    )
 
     # Parse the log file
     data = re.findall(pattern, log_data)
 
     # Convert data to a DataFrame
-    df = pd.DataFrame(data, columns=["Background", "Benign", "Malicious", "Total labels", "Score"])
+    columns = [
+        "Total labels", "Background", "Benign", "Malicious",
+        "FPR", "TNR", "TPR", "FNR", "F1", "Precision", "Accuracy", "MCC", "Recall"
+    ]
+    df = pd.DataFrame(data, columns=columns)
     df = df.astype({
+        "Total labels": float,
         "Background": int,
         "Benign": int,
         "Malicious": int,
-        "Total labels": float,
-        "Score": float
+        "FPR": float,
+        "TNR": float,
+        "TPR": float,
+        "FNR": float,
+        "F1": float,
+        "Precision": float,
+        "Accuracy": float,
+        "MCC": float,
+        "Recall": float,
     })
 
-    # Get the directory of the log file to store the plot in the same folder
     dir_name = os.path.dirname(file_path)
-    # Append experiment number to the filename
-    plot_file = os.path.join(dir_name, f'performance_metrics_training_{experiment_number}.png')
-
-    # Plotting the values
-    fig, ax1 = plt.subplots(figsize=(10, 6))
 
-    # Plotting Score on the left y-axis (with proper scaling from 0 to 1)
-    ax1.plot(df.index, df["Score"], label="Score", color='tab:blue')
+    # --- Plot 1: Number of labels (linear scale, no total labels) ---
+    fig1, ax1 = plt.subplots(figsize=(10, 6))
+    ax1.plot(df.index, df["Background"], label="Background", color='black')
+    ax1.plot(df.index, df["Benign"], label="Benign", color='cyan')
+    ax1.plot(df.index, df["Malicious"], label="Malicious", color='magenta')
     ax1.set_xlabel('Index')
-    ax1.set_ylabel('Score', color='tab:blue')
-    ax1.set_ylim(0, 1)  # Set y-axis for Score from 0 to 1
-    ax1.tick_params(axis='y', labelcolor='tab:blue')
-
-    # Create the second y-axis for the Background, Benign, Malicious
-    ax2 = ax1.twinx()
-    ax2.plot(df.index, df["Background"], label="Background Labels", color='tab:green', linestyle='--')
-    ax2.plot(df.index, df["Benign"], label="Benign Labels", color='tab:orange', linestyle='--')
-    ax2.plot(df.index, df["Malicious"], label="Malicious Labels", color='tab:pink', linestyle='--')
-    ax2.set_ylabel('Background, Benign, Malicious Labels', color='tab:red')
-    
-    # Set appropriate scale for right y-axis based on the data
-    ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max())
-    ax2.tick_params(axis='y', labelcolor='tab:red')
-
-    # Annotating Total labels as text on the plot
-    for i, value in enumerate(df["Total labels"]):
-        ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom')
-
-    # Adding title and legend with experiment number in title
-    plt.title(f'Training performance - Experiment {experiment_number}')
-    fig.tight_layout()
+    ax1.set_ylabel('Label Counts')
+    # No log scale here
+    ax1.set_title(f'Label Counts - Experiment {experiment_number}')
+    ax1.legend()
+    ax1.yaxis.set_major_locator(ticker.MaxNLocator(70))
+    plt.tight_layout()
+    plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_labels.png'))
+
+    # --- Plot 2: FNR and FPR (log scale) ---
+    fig2, ax2 = plt.subplots(figsize=(10, 6))
+    ax2.plot(df.index, df["FNR"], label="FNR", color='red')
+    ax2.plot(df.index, df["FPR"], label="FPR", color='blue')
+    ax2.set_xlabel('Index')
+    ax2.set_ylabel('Rate')
+    ax2.set_yscale('log')
+    ax2.set_title(f'FNR and FPR - Experiment {experiment_number}')
+    ax2.legend()
+    ax2.yaxis.set_major_locator(ticker.MaxNLocator(100))
+    plt.tight_layout()
+    plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_fnr_fpr.png'))
+
+    # --- Plot 3: Other metrics (log scale) ---
+    fig3, ax3 = plt.subplots(figsize=(12, 7))
+    metrics_rest = ["TNR", "TPR", "F1", "Precision", "Accuracy", "MCC", "Recall"]
+    colors_rest = [
+        'tab:blue', 'tab:green', 'tab:purple', 'tab:brown',
+        'tab:gray', 'tab:pink', 'tab:olive'
+    ]
+    for metric, color in zip(metrics_rest, colors_rest):
+        ax3.plot(df.index, df[metric], label=metric, color=color)
+    ax3.set_xlabel('Index')
+    ax3.set_ylabel('Metric Value')
+    ax3.set_yscale('log')
+    ax3.set_title(f'Performance Metrics (except FNR/FPR) - Experiment {experiment_number}')
+    ax3.legend()
+    ax3.yaxis.set_major_locator(ticker.MaxNLocator(50))
+    plt.tight_layout()
+    plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_other_metrics.png'))
 
-    # Move both legends further to the right
-    ax1.legend(loc='upper right', bbox_to_anchor=(1.3, 1), fontsize='small', ncol=1)
-    ax2.legend(loc='upper right', bbox_to_anchor=(1.3, 0.85), fontsize='small', ncol=1)
-
-    # Increase right margin for better readability of legend
-    plt.subplots_adjust(right=0.75)
-
-    # Save plot to the same folder as the log file with experiment number in filename
-    plt.savefig(plot_file)
-
-    # Display the plot
     plt.show()
 
+    # --- Print final values in terminal ---
+    print("\nFinal values at last training step:")
+    for col in ["Total labels", "Background", "Benign", "Malicious",
+                "FPR", "TNR", "TPR", "FNR", "F1", "Precision", "Accuracy", "MCC", "Recall"]:
+        print(f"{col}: {df[col].iloc[-1]}")
+
 def main():
-    # Parse command-line arguments
     parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.")
     parser.add_argument('-f', '--file', metavar='log_file', type=str, required=True, help="Path to the log file")
     parser.add_argument('-e', '--experiment', metavar='experiment_number', type=str, required=True, help="Experiment number to add to the filename")
-    
-    # Handle -h / --help
     args = parser.parse_args()
-
-    # Call the function to process the log file
     plot_log_data(args.file, args.experiment)
 
 if __name__ == "__main__":

From 4cca7685112dc012940248c7e647a56806fb5b83 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sat, 3 May 2025 21:16:01 +0000
Subject: [PATCH 439/455] Add performance metrics to the training evaluation

---
 modules/flowmldetection/flowmldetection.py | 58 +++++++++++++++++-----
 1 file changed, 46 insertions(+), 12 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index b17a1baaf0..2c60cd4034 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -10,7 +10,16 @@
 import json
 import traceback
 import warnings
-import os
+from sklearn.metrics import classification_report, confusion_matrix
+from sklearn.metrics import (
+    confusion_matrix,
+    f1_score,
+    precision_score,
+    accuracy_score,
+    matthews_corrcoef,
+    recall_score,
+)
+
 
 from slips_files.common.parsers.config_parser import ConfigParser
 from slips_files.common.slips_utils import utils
@@ -86,21 +95,21 @@ def write_to_training_log(self, message: str):
         except Exception as e:
             self.print(f"Error writing to training log: {e}", 0, 1)
 
-    def train(self, sum_labeled_flows):
+    def train(self, sum_labeled_flows, last_number_of_flows_when_trained):
         """
         Train a model based on the flows we receive and the labels
         """
         try:
+            # Create y_flow with the label
+            y_flow = numpy.full(self.flows.shape[0], self.flows.ground_truth_label)
             # Create X_flow with the current flows minus the label
             X_flow = self.flows.drop("ground_truth_label", axis=1)
             # Drop the detailed labels
             X_flow = X_flow.drop("detailed_ground_truth_label", axis=1)
             # Drop the module_labels
             X_flow = X_flow.drop("module_labels", axis=1)
-            # Create y_flow with the label
-            y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label)
 
-            # Normalize this batch of data so far. This can get progressivle slow
+            # Normalize this batch of data so far. This can get progressively slow
             X_flow = self.scaler.fit_transform(X_flow)
 
             # Count the number of labels of each type in this epoc
@@ -120,18 +129,43 @@ def train(self, sum_labeled_flows):
                 self.print("Error while calling clf.train()")
                 self.print(traceback.format_exc(), 0, 1)
 
-            # See score so far in training
-            score = self.clf.score(X_flow, y_flow)
+            # Predict on the training data
+            y_pred = self.clf.predict(X_flow)
 
-            #self.print(f"	Training Score: {score}", 1, 0)
-            #self.print(f'    Model Parameters: {self.clf.coef_}', 1, 0)
+            # For metrics, let's focus on Malicious vs Benign (ignore Background)
+            mask = (y_flow == "Malicious") | (y_flow == "Benign")
+            y_true_bin = y_flow[mask]
+            y_pred_bin = y_pred[mask]
+
+            # Map to binary: Malicious=1, Benign=0
+            y_true_bin = numpy.where(y_true_bin == "Malicious", 1, 0)
+            y_pred_bin = numpy.where(y_pred_bin == "Malicious", 1, 0)
+
+            # Compute confusion matrix: tn, fp, fn, tp
+            tn, fp, fn, tp = confusion_matrix(y_true_bin, y_pred_bin, labels=[0,1]).ravel() if len(set(y_true_bin)) > 1 else (0,0,0,0)
+
+            # Compute metrics
+            FPR = fp / (fp + tn) if (fp + tn) > 0 else 0
+            TNR = tn / (tn + fp) if (tn + fp) > 0 else 0
+            TPR = tp / (tp + fn) if (tp + fn) > 0 else 0
+            FNR = fn / (fn + tp) if (fn + tp) > 0 else 0
+            F1 = f1_score(y_true_bin, y_pred_bin, zero_division=0)
+            PREC = precision_score(y_true_bin, y_pred_bin, zero_division=0)
+            ACCU = accuracy_score(y_true_bin, y_pred_bin)
+            MCC = matthews_corrcoef(y_true_bin, y_pred_bin) if len(set(y_true_bin)) > 1 else 0
+            RECALL = recall_score(y_true_bin, y_pred_bin, zero_division=0)
 
             # Store the models on disk
             self.store_model()
 
             # Log training information
-            self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}")
-            #self.write_to_training_log(f"Model parameters: {self.clf.coef_}")
+            self.write_to_training_log(
+                f"Total labels: {sum_labeled_flows}, "
+                f"Background: {epoch_label_counts['Background']}. "
+                f"Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. "
+                f"Metrics: FPR={FPR:.4f}, TNR={TNR:.4f}, TPR={TPR:.4f}, FNR={FNR:.4f}, "
+                f"F1={F1:.4f}, Precision={PREC:.4f}, Accuracy={ACCU:.4f}, MCC={MCC:.4f}, Recall={RECALL:.4f}."
+            )
         except Exception:
             self.print("Error in train().", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
@@ -520,7 +554,7 @@ def main(self):
                         # for pandas
                         self.process_training_flows(self.last_number_of_flows_when_trained)
                         # Train an algorithm
-                        self.train(sum_labeled_flows)
+                        self.train(sum_labeled_flows, self.last_number_of_flows_when_trained)
                         self.last_number_of_flows_when_trained = sum_labeled_flows
 
             elif self.mode == "test":

From addd26bc0cf43e5426fd63b5dd73962c78b898dd Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Sun, 4 May 2025 12:50:46 +0000
Subject: [PATCH 440/455] Fix experiment names

---
 modules/flowmldetection/plot_train_performance.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/modules/flowmldetection/plot_train_performance.py b/modules/flowmldetection/plot_train_performance.py
index 5212dfeeaf..304f0f4ead 100644
--- a/modules/flowmldetection/plot_train_performance.py
+++ b/modules/flowmldetection/plot_train_performance.py
@@ -52,12 +52,12 @@ def plot_log_data(file_path, experiment_number):
     ax1.plot(df.index, df["Malicious"], label="Malicious", color='magenta')
     ax1.set_xlabel('Index')
     ax1.set_ylabel('Label Counts')
-    # No log scale here
     ax1.set_title(f'Label Counts - Experiment {experiment_number}')
     ax1.legend()
     ax1.yaxis.set_major_locator(ticker.MaxNLocator(70))
+    ax1.xaxis.set_major_locator(ticker.MaxNLocator(50))
     plt.tight_layout()
-    plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_labels.png'))
+    plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_labels.png'))
 
     # --- Plot 2: FNR and FPR (log scale) ---
     fig2, ax2 = plt.subplots(figsize=(10, 6))
@@ -69,8 +69,9 @@ def plot_log_data(file_path, experiment_number):
     ax2.set_title(f'FNR and FPR - Experiment {experiment_number}')
     ax2.legend()
     ax2.yaxis.set_major_locator(ticker.MaxNLocator(100))
+    ax2.xaxis.set_major_locator(ticker.MaxNLocator(50))
     plt.tight_layout()
-    plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_fnr_fpr.png'))
+    plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_fnr_fpr.png'))
 
     # --- Plot 3: Other metrics (log scale) ---
     fig3, ax3 = plt.subplots(figsize=(12, 7))
@@ -87,8 +88,9 @@ def plot_log_data(file_path, experiment_number):
     ax3.set_title(f'Performance Metrics (except FNR/FPR) - Experiment {experiment_number}')
     ax3.legend()
     ax3.yaxis.set_major_locator(ticker.MaxNLocator(50))
+    ax3.xaxis.set_major_locator(ticker.MaxNLocator(50))
     plt.tight_layout()
-    plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_other_metrics.png'))
+    plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_other_metrics.png'))
 
     plt.show()
 

From 01a6450fcf21b60387711cf5d2dc55800aabd5dc Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Mon, 5 May 2025 15:24:12 +0300
Subject: [PATCH 441/455] test_profiler: update unit tests

---
 tests/test_profiler.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/test_profiler.py b/tests/test_profiler.py
index 36733d2b8c..465bc5922b 100644
--- a/tests/test_profiler.py
+++ b/tests/test_profiler.py
@@ -467,7 +467,6 @@ def test_read_configuration(
     mock_conf.local_whitelist_path.return_value = "path/to/whitelist"
     mock_conf.ts_format.return_value = "unixtimestamp"
     mock_conf.analysis_direction.return_value = "all"
-    mock_conf.label.return_value = "malicious"
     mock_conf.get_tw_width_as_float.return_value = 1.0
     mock_conf.client_ips.return_value = ["192.168.1.1", "10.0.0.1"]
 
@@ -476,7 +475,6 @@ def test_read_configuration(
     assert profiler.local_whitelist_path == "path/to/whitelist"
     assert profiler.timeformat == "unixtimestamp"
     assert profiler.analysis_direction == "all"
-    assert profiler.label == "malicious"
     assert profiler.width == 1.0
     assert profiler.client_ips == ["192.168.1.1", "10.0.0.1"]
 

From 99a276f9caae1a8621146209d2bfdefa756a0297 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Mon, 5 May 2025 16:43:05 +0000
Subject: [PATCH 442/455] Fix that the training and testing log files were
 appended instead of rewritten

---
 modules/flowmldetection/flowmldetection.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 2c60cd4034..9a920b4e25 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -90,7 +90,7 @@ def write_to_training_log(self, message: str):
         Write a message to the training log file.
         """
         try:
-            with open(self.training_log_path, "a") as log_file:
+            with open(self.training_log_path, "w") as log_file:
                 log_file.write(message + "\n")
         except Exception as e:
             self.print(f"Error writing to training log: {e}", 0, 1)
@@ -610,8 +610,7 @@ def main(self):
 
                         testing_log_path = "./modules/flowmldetection/testing_performance.log"
                         try:
-                            with open(testing_log_path, "a") as log_file:
-                                log_file.write("Testing Performance Log Initialized\n")
+                            with open(testing_log_path, "w") as log_file:
                                 # Log the testing performance metrics
                                 log_file.write(f"TP: {self.tp}, TN: {self.tn}, FP: {self.fp}, FN: {self.fn}\n")
 

From cb22b3103a300fce293bf9ab34355d774f6a2b5d Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Mon, 5 May 2025 22:45:16 +0000
Subject: [PATCH 443/455] Fix an issue of storing the new log files

---
 modules/flowmldetection/flowmldetection.py | 49 ++++++++++------------
 1 file changed, 23 insertions(+), 26 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 9a920b4e25..9139066f08 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -72,11 +72,19 @@ def init(self):
         self.scaler = StandardScaler()
         self.model_path = "./modules/flowmldetection/model.bin"
         self.scaler_path = "./modules/flowmldetection/scaler.bin"
-
-        # Initialize the training log file
-        self.training_log_path = "./modules/flowmldetection/training.log"
-        with open(self.training_log_path, "w") as log_file:
-            log_file.write("Training Log Initialized\n")
+        self.init_log_file()
+    
+    def init_log_file(self):
+        """
+        Init the log file for training or testing
+        """
+        if self.mode == "train":
+            # Initialize the training log file
+            self.log_path = "./modules/flowmldetection/training.log"
+        elif self.mode == "test":
+            # Initialize the testing log file
+            self.log_path = "./modules/flowmldetection/testing.log"
+        self.log_file = open(self.log_path, "w")
 
     def read_configuration(self):
         conf = ConfigParser()
@@ -85,15 +93,14 @@ def read_configuration(self):
         # in case the flows do not have a label themselves
         self.label = conf.label()
 
-    def write_to_training_log(self, message: str):
+    def write_to_log(self, message: str):
         """
-        Write a message to the training log file.
+        Write a message to the local log file.
         """
         try:
-            with open(self.training_log_path, "w") as log_file:
-                log_file.write(message + "\n")
+            self.log_file.write(message + "\n")
         except Exception as e:
-            self.print(f"Error writing to training log: {e}", 0, 1)
+            self.print(f"Error writing to log: {e}", 0, 1)
 
     def train(self, sum_labeled_flows, last_number_of_flows_when_trained):
         """
@@ -159,7 +166,7 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained):
             self.store_model()
 
             # Log training information
-            self.write_to_training_log(
+            self.write_to_log(
                 f"Total labels: {sum_labeled_flows}, "
                 f"Background: {epoch_label_counts['Background']}. "
                 f"Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. "
@@ -169,7 +176,7 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained):
         except Exception:
             self.print("Error in train().", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-            self.write_to_training_log("Error occurred during training.")
+            self.write_to_log("Error occurred during training.")
 
     def process_features(self, dataset):
         """
@@ -597,7 +604,6 @@ def main(self):
                         if not hasattr(self, 'fn'):
                             self.fn = 0
 
-
                         # Update counters based on predictions and labels
                         if pred[0] == "Malicious" and original_label == "Malicious":
                             self.tp += 1
@@ -605,19 +611,10 @@ def main(self):
                             self.tn += 1
                         elif pred[0] == "Malicious" and original_label == "Benign":
                             self.fp += 1
+                            self.write_to_log(f"False Positive Flow: {self.flow}")
                         elif pred[0] == "Benign" and original_label == "Malicious":
                             self.fn += 1
+                            self.write_to_log(f"False Negative Flow: {self.flow}")
 
-                        testing_log_path = "./modules/flowmldetection/testing_performance.log"
-                        try:
-                            with open(testing_log_path, "w") as log_file:
-                                # Log the testing performance metrics
-                                log_file.write(f"TP: {self.tp}, TN: {self.tn}, FP: {self.fp}, FN: {self.fn}\n")
-
-                                # Log the original flow for false positives and false negatives
-                                if pred[0] == "Malicious" and original_label == "Benign":
-                                    log_file.write(f"False Positive Flow: {self.flow}\n")
-                                elif pred[0] == "Benign" and original_label == "Malicious":
-                                    log_file.write(f"False Negative Flow: {self.flow}\n")
-                        except Exception as e:
-                            self.print(f"Error initializing testing performance log: {e}", 0, 1)
\ No newline at end of file
+                        # Log the testing performance metrics
+                        self.write_to_log(f"TP: {self.tp}, TN: {self.tn}, FP: {self.fp}, FN: {self.fn}")
\ No newline at end of file

From e0cc7c2f946a8fb4db664bbbc42422e6c54458a7 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Sat, 10 May 2025 16:21:08 +0300
Subject: [PATCH 444/455] enable/disable training and testing.log with a param
 in the config file

---
 .secrets.baseline                           |   6 +-
 config/slips.yaml                           |   3 +
 modules/flowmldetection/flowmldetection.py  | 140 +++++++++++++-------
 modules/riskiq/riskiq.py                    |   2 +-
 modules/update_manager/update_manager.py    |   2 +-
 slips_files/common/parsers/config_parser.py |   7 +-
 6 files changed, 109 insertions(+), 51 deletions(-)

diff --git a/.secrets.baseline b/.secrets.baseline
index fc1ac4872e..aa5615109c 100644
--- a/.secrets.baseline
+++ b/.secrets.baseline
@@ -149,14 +149,14 @@
         "filename": "config/slips.yaml",
         "hashed_secret": "4cac50cee3ad8e462728e711eac3e670753d5016",
         "is_verified": false,
-        "line_number": 223
+        "line_number": 226
       },
       {
         "type": "Secret Keyword",
         "filename": "config/slips.yaml",
         "hashed_secret": "d033e22ae348aeb5660fc2140aec35850c4da997",
         "is_verified": false,
-        "line_number": 393
+        "line_number": 396
       }
     ],
     "dataset/test14-malicious-zeek-dir/http.log": [
@@ -7192,5 +7192,5 @@
       }
     ]
   },
-  "generated_at": "2025-05-08T14:51:28Z"
+  "generated_at": "2025-05-10T13:18:46Z"
 }
diff --git a/config/slips.yaml b/config/slips.yaml
index 1b73e7b549..ac2010e6b4 100644
--- a/config/slips.yaml
+++ b/config/slips.yaml
@@ -213,6 +213,9 @@ flowmldetection:
   # You should have trained at least once with 'Normal' data and once with
   # 'Malicious' data in order for the test to work.
   mode: test
+  # creates an extra log file called training.log/testing.log in the
+  # ouptput dir with performance metrics depending on the mode.
+  create_performance_metrics_log_files: False
 
 #############################
 virustotal:
diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 9139066f08..2a515d0cfa 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -10,9 +10,8 @@
 import json
 import traceback
 import warnings
-from sklearn.metrics import classification_report, confusion_matrix
+from sklearn.metrics import confusion_matrix
 from sklearn.metrics import (
-    confusion_matrix,
     f1_score,
     precision_score,
     accuracy_score,
@@ -37,6 +36,7 @@
     Method,
 )
 
+
 # This horrible hack is only to stop sklearn from printing those warnings
 def warn(*args, **kwargs):
     pass
@@ -73,7 +73,7 @@ def init(self):
         self.model_path = "./modules/flowmldetection/model.bin"
         self.scaler_path = "./modules/flowmldetection/scaler.bin"
         self.init_log_file()
-    
+
     def init_log_file(self):
         """
         Init the log file for training or testing
@@ -92,11 +92,16 @@ def read_configuration(self):
         # This is the global label in the configuration,
         # in case the flows do not have a label themselves
         self.label = conf.label()
+        self.enable_logs: bool = conf.create_performance_metrics_log_files()
 
     def write_to_log(self, message: str):
         """
-        Write a message to the local log file.
+        Write a message to the local log file if
+        create_performance_metrics_log_files is enabled in slips.yaml
         """
+        if not self.enable_logs:
+            return
+
         try:
             self.log_file.write(message + "\n")
         except Exception as e:
@@ -108,7 +113,9 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained):
         """
         try:
             # Create y_flow with the label
-            y_flow = numpy.full(self.flows.shape[0], self.flows.ground_truth_label)
+            y_flow = numpy.full(
+                self.flows.shape[0], self.flows.ground_truth_label
+            )
             # Create X_flow with the current flows minus the label
             X_flow = self.flows.drop("ground_truth_label", axis=1)
             # Drop the detailed labels
@@ -130,7 +137,9 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained):
             try:
                 # Online incremental learning
                 self.clf.partial_fit(
-                    X_flow, y_flow, classes=["Background", "Malicious", "Benign"]
+                    X_flow,
+                    y_flow,
+                    classes=["Background", "Malicious", "Benign"],
                 )
             except Exception:
                 self.print("Error while calling clf.train()")
@@ -149,7 +158,11 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained):
             y_pred_bin = numpy.where(y_pred_bin == "Malicious", 1, 0)
 
             # Compute confusion matrix: tn, fp, fn, tp
-            tn, fp, fn, tp = confusion_matrix(y_true_bin, y_pred_bin, labels=[0,1]).ravel() if len(set(y_true_bin)) > 1 else (0,0,0,0)
+            tn, fp, fn, tp = (
+                confusion_matrix(y_true_bin, y_pred_bin, labels=[0, 1]).ravel()
+                if len(set(y_true_bin)) > 1
+                else (0, 0, 0, 0)
+            )
 
             # Compute metrics
             FPR = fp / (fp + tn) if (fp + tn) > 0 else 0
@@ -159,7 +172,11 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained):
             F1 = f1_score(y_true_bin, y_pred_bin, zero_division=0)
             PREC = precision_score(y_true_bin, y_pred_bin, zero_division=0)
             ACCU = accuracy_score(y_true_bin, y_pred_bin)
-            MCC = matthews_corrcoef(y_true_bin, y_pred_bin) if len(set(y_true_bin)) > 1 else 0
+            MCC = (
+                matthews_corrcoef(y_true_bin, y_pred_bin)
+                if len(set(y_true_bin)) > 1
+                else 0
+            )
             RECALL = recall_score(y_true_bin, y_pred_bin, zero_division=0)
 
             # Store the models on disk
@@ -189,7 +206,8 @@ def process_features(self, dataset):
             for proto in to_discard:
                 dataset = dataset[dataset.proto != proto]
 
-            # If te proto is in the list to delete and there is only one flow, then the dataset will be empty
+            # If te proto is in the list to delete and there is only one flow,
+            # then the dataset will be empty
             if dataset.empty:
                 # DataFrame is empty now, so return empty
                 return dataset
@@ -295,7 +313,9 @@ def process_training_flows(self, last_number_of_flows_when_trained):
             if last_number_of_flows_when_trained is None:
                 last_number_of_flows_when_trained = 0
             else:
-                last_number_of_flows_when_trained = int(last_number_of_flows_when_trained)
+                last_number_of_flows_when_trained = int(
+                    last_number_of_flows_when_trained
+                )
 
             # We get all the flows so far
             flows = self.db.get_all_flows()
@@ -399,21 +419,21 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
             ]
             # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes.
             # Error
-            ''' [Flow ML Detection] Error in detect() while processing                                                                                                                                                                                                                         
-            dur proto  sport dport  state  pkts  spkts  dpkts  bytes  sbytes  dbytes  allbytes                                                                                                                                                                                    
-            0  63.822830     0  56119   981    0.0    15     15      0   8764    1887       0      1887                                                                                                                                                                                    
-            The feature names should match those that were passed during fit.                                                                                                                                                                                                              
-            Feature names unseen at fit time:                                                                                                                                                                                                                                              
-            - bytes     
-            '''
+            """ [Flow ML Detection] Error in detect() while processing
+            dur proto  sport dport  state  pkts  spkts  dpkts  bytes  sbytes  dbytes  allbytes
+            0  63.822830     0  56119   981    0.0    15     15      0   8764    1887       0      1887
+            The feature names should match those that were passed during fit.
+            Feature names unseen at fit time:
+            - bytes
+            """
 
             # IF we delete here the filed bytes the error is
-            # [Flow ML Detection] Error in detect() while processing 
-            # dur proto sport dport  state  pkts  spkts  dpkts  sbytes  dbytes allbytes                                                                                                                                                             
-            # 0  63.822830     0  56120   980    0.0    15     15      0    1887       0      1887                                                                                                                                                             
-            # The feature names should match those that were passed during fit.                                                                                                                                                                                
-            # Feature names must be in the same order as they were in fit.                                                                                                                                                                                     
-                                                                                                                                                                                                                                                 
+            # [Flow ML Detection] Error in detect() while processing
+            # dur proto sport dport  state  pkts  spkts  dpkts  sbytes  dbytes allbytes
+            # 0  63.822830     0  56120   980    0.0    15     15      0    1887       0      1887
+            # The feature names should match those that were passed during fit.
+            # Feature names must be in the same order as they were in fit.
+
             for field in fields_to_drop:
                 try:
                     x_flow = x_flow.drop(field, axis=1)
@@ -540,17 +560,19 @@ def main(self):
                 labels = self.db.get_labels()
                 sum_labeled_flows = sum(i[1] for i in labels)
 
-                # The min labels to retrain is the min number of flows 
+                # The min labels to retrain is the min number of flows
                 # we should have seen so far in this capture to start training
                 # This is so we dont _start_ training with only 1 flow
 
-                # Once we are over the start minimum, the second condition is 
+                # Once we are over the start minimum, the second condition is
                 # to force to retrain every a minimum_labels_to_retrain number
                 # of flows. So we dont retrain every 1 flow.
-                if (
-                    sum_labeled_flows >= self.minimum_labels_to_start_train
-                ):
-                    if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain):
+                if sum_labeled_flows >= self.minimum_labels_to_start_train:
+                    if (
+                        sum_labeled_flows
+                        - self.last_number_of_flows_when_trained
+                        >= self.minimum_labels_to_retrain
+                    ):
                         # So for example we retrain every 50 labels and only when
                         # we have at least 50 labels
                         self.print(
@@ -559,10 +581,17 @@ def main(self):
                         )
                         # Process all flows in the DB and make them ready
                         # for pandas
-                        self.process_training_flows(self.last_number_of_flows_when_trained)
+                        self.process_training_flows(
+                            self.last_number_of_flows_when_trained
+                        )
                         # Train an algorithm
-                        self.train(sum_labeled_flows, self.last_number_of_flows_when_trained)
-                        self.last_number_of_flows_when_trained = sum_labeled_flows
+                        self.train(
+                            sum_labeled_flows,
+                            self.last_number_of_flows_when_trained,
+                        )
+                        self.last_number_of_flows_when_trained = (
+                            sum_labeled_flows
+                        )
 
             elif self.mode == "test":
                 # We are testing, which means using the model to detect
@@ -570,7 +599,9 @@ def main(self):
                 # After processing the flow, it may happen that we
                 # delete icmp/arp/etc so the dataframe can be empty
                 if processed_flow is not None and not processed_flow.empty:
-                    original_label = processed_flow["ground_truth_label"].iloc[0]
+                    original_label = processed_flow["ground_truth_label"].iloc[
+                        0
+                    ]
                     # Predict
                     pred: numpy.ndarray = self.detect(processed_flow)
                     if not pred:
@@ -591,30 +622,49 @@ def main(self):
                             2,
                         )
 
-                    # So you can disable this code easily. Since it is used only for evaluating a testing
+                    # So you can disable this code easily. Since it is used
+                    # only for evaluating a testing
                     log_testing_data = True
                     if log_testing_data:
                         # Initialize counters if not already done
-                        if not hasattr(self, 'tp'):
+                        if not hasattr(self, "tp"):
                             self.tp = 0
-                        if not hasattr(self, 'tn'):
+                        if not hasattr(self, "tn"):
                             self.tn = 0
-                        if not hasattr(self, 'fp'):
+                        if not hasattr(self, "fp"):
                             self.fp = 0
-                        if not hasattr(self, 'fn'):
+                        if not hasattr(self, "fn"):
                             self.fn = 0
 
                         # Update counters based on predictions and labels
-                        if pred[0] == "Malicious" and original_label == "Malicious":
+                        if (
+                            pred[0] == "Malicious"
+                            and original_label == "Malicious"
+                        ):
                             self.tp += 1
-                        elif pred[0] == "Benign" and original_label == "Benign":
+                        elif (
+                            pred[0] == "Benign" and original_label == "Benign"
+                        ):
                             self.tn += 1
-                        elif pred[0] == "Malicious" and original_label == "Benign":
+                        elif (
+                            pred[0] == "Malicious"
+                            and original_label == "Benign"
+                        ):
                             self.fp += 1
-                            self.write_to_log(f"False Positive Flow: {self.flow}")
-                        elif pred[0] == "Benign" and original_label == "Malicious":
+                            self.write_to_log(
+                                f"False Positive Flow: {self.flow}"
+                            )
+                        elif (
+                            pred[0] == "Benign"
+                            and original_label == "Malicious"
+                        ):
                             self.fn += 1
-                            self.write_to_log(f"False Negative Flow: {self.flow}")
+                            self.write_to_log(
+                                f"False Negative Flow: {self.flow}"
+                            )
 
                         # Log the testing performance metrics
-                        self.write_to_log(f"TP: {self.tp}, TN: {self.tn}, FP: {self.fp}, FN: {self.fn}")
\ No newline at end of file
+                        self.write_to_log(
+                            f"TP: {self.tp}, TN: {self.tn},"
+                            f" FP: {self.fp}, FN: {self.fn}"
+                        )
diff --git a/modules/riskiq/riskiq.py b/modules/riskiq/riskiq.py
index 5abf2ddb19..7b5653997e 100644
--- a/modules/riskiq/riskiq.py
+++ b/modules/riskiq/riskiq.py
@@ -25,7 +25,7 @@ def init(self):
 
     def read_configuration(self):
         conf = ConfigParser()
-        risk_iq_credentials_path = conf.RiskIQ_credentials_path()
+        risk_iq_credentials_path = conf.risk_iq_credentials_path()
         try:
             with open(risk_iq_credentials_path, "r") as f:
                 self.riskiq_email = f.readline().replace("\n", "")
diff --git a/modules/update_manager/update_manager.py b/modules/update_manager/update_manager.py
index ba8106aa5c..b791bfc137 100644
--- a/modules/update_manager/update_manager.py
+++ b/modules/update_manager/update_manager.py
@@ -119,7 +119,7 @@ def read_riskiq_creds(risk_iq_credentials_path):
         self.ssl_feeds_path = conf.ssl_feeds()
         self.ssl_feeds = self.get_feed_details(self.ssl_feeds_path)
 
-        risk_iq_credentials_path = conf.RiskIQ_credentials_path()
+        risk_iq_credentials_path = conf.risk_iq_credentials_path()
         read_riskiq_creds(risk_iq_credentials_path)
         self.riskiq_update_period = conf.riskiq_update_period()
 
diff --git a/slips_files/common/parsers/config_parser.py b/slips_files/common/parsers/config_parser.py
index 40f1b044bc..e208f78816 100644
--- a/slips_files/common/parsers/config_parser.py
+++ b/slips_files/common/parsers/config_parser.py
@@ -418,7 +418,12 @@ def data_exfiltration_threshold(self):
     def get_ml_mode(self):
         return self.read_configuration("flowmldetection", "mode", "test")
 
-    def RiskIQ_credentials_path(self):
+    def create_performance_metrics_log_files(self) -> bool:
+        return self.read_configuration(
+            "flowmldetection", "create_performance_metrics_log_files", False
+        )
+
+    def risk_iq_credentials_path(self):
         return self.read_configuration(
             "threatintelligence", "RiskIQ_credentials_path", ""
         )

From adcbafd997d538cf7d8041f6317dd48f3cef0f54 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Sat, 10 May 2025 16:23:58 +0300
Subject: [PATCH 445/455] don't create an empty logfile when
 create_performance_metrics_log_files is set to false

---
 modules/flowmldetection/flowmldetection.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 2a515d0cfa..9305197d3e 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -78,6 +78,9 @@ def init_log_file(self):
         """
         Init the log file for training or testing
         """
+        if not self.enable_logs:
+            return
+
         if self.mode == "train":
             # Initialize the training log file
             self.log_path = "./modules/flowmldetection/training.log"

From c45e77594002748fdd1e2c5ddd559c92416eb3f5 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Sat, 10 May 2025 16:29:30 +0300
Subject: [PATCH 446/455] when enabled, create testing.log or training.log in
 the current output dir

---
 modules/flowmldetection/flowmldetection.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 9305197d3e..f618195bce 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -3,6 +3,7 @@
 
 # SPDX-License-Identifier: GPL-2.0-only
 import numpy
+import os
 from sklearn.linear_model import SGDClassifier
 from sklearn.preprocessing import StandardScaler
 import pickle
@@ -83,10 +84,10 @@ def init_log_file(self):
 
         if self.mode == "train":
             # Initialize the training log file
-            self.log_path = "./modules/flowmldetection/training.log"
+            self.log_path = os.path.join(self.output_dir, "training.log")
         elif self.mode == "test":
             # Initialize the testing log file
-            self.log_path = "./modules/flowmldetection/testing.log"
+            self.log_path = os.path.join(self.output_dir, "testing.log")
         self.log_file = open(self.log_path, "w")
 
     def read_configuration(self):

From b2452494a0d32f394b5ddc15e5cb6afc47df2855 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Sat, 10 May 2025 16:43:32 +0300
Subject: [PATCH 447/455] Add an enum called labels with either Benign or
 Malicious so the labels are unified.

---
 modules/flowmldetection/flowmldetection.py | 65 +++++++++++-----------
 slips_files/core/structures/labels.py      | 11 ++++
 2 files changed, 43 insertions(+), 33 deletions(-)
 create mode 100644 slips_files/core/structures/labels.py

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index f618195bce..e828058ee4 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -19,11 +19,10 @@
     matthews_corrcoef,
     recall_score,
 )
-
-
 from slips_files.common.parsers.config_parser import ConfigParser
 from slips_files.common.slips_utils import utils
 from slips_files.common.abstracts.module import IModule
+from slips_files.core.structures.labels import Label
 from slips_files.core.structures.evidence import (
     Evidence,
     ProfileID,
@@ -45,6 +44,10 @@ def warn(*args, **kwargs):
 
 warnings.warn = warn
 
+BACKGROUND = Label.BACKGROUND.name
+BENIGN = Label.BENIGN.name
+MALICIOUS = Label.MALICIOUS.name
+
 
 class FlowMLDetection(IModule):
     # Name: short name of the module. Do not use spaces
@@ -132,9 +135,9 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained):
 
             # Count the number of labels of each type in this epoc
             epoch_label_counts = {
-                "Background": (y_flow == "Background").sum(),
-                "Malicious": (y_flow == "Malicious").sum(),
-                "Benign": (y_flow == "Benign").sum(),
+                BACKGROUND: (y_flow == BACKGROUND).sum(),
+                MALICIOUS: (y_flow == MALICIOUS).sum(),
+                BENIGN: (y_flow == BENIGN).sum(),
             }
 
             # Train
@@ -143,7 +146,7 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained):
                 self.clf.partial_fit(
                     X_flow,
                     y_flow,
-                    classes=["Background", "Malicious", "Benign"],
+                    classes=[BACKGROUND, MALICIOUS, BENIGN],
                 )
             except Exception:
                 self.print("Error while calling clf.train()")
@@ -153,13 +156,13 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained):
             y_pred = self.clf.predict(X_flow)
 
             # For metrics, let's focus on Malicious vs Benign (ignore Background)
-            mask = (y_flow == "Malicious") | (y_flow == "Benign")
+            mask = (y_flow == MALICIOUS) | (y_flow == BENIGN)
             y_true_bin = y_flow[mask]
             y_pred_bin = y_pred[mask]
 
             # Map to binary: Malicious=1, Benign=0
-            y_true_bin = numpy.where(y_true_bin == "Malicious", 1, 0)
-            y_pred_bin = numpy.where(y_pred_bin == "Malicious", 1, 0)
+            y_true_bin = numpy.where(y_true_bin == MALICIOUS, 1, 0)
+            y_pred_bin = numpy.where(y_pred_bin == MALICIOUS, 1, 0)
 
             # Compute confusion matrix: tn, fp, fn, tp
             tn, fp, fn, tp = (
@@ -190,9 +193,12 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained):
             self.write_to_log(
                 f"Total labels: {sum_labeled_flows}, "
                 f"Background: {epoch_label_counts['Background']}. "
-                f"Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. "
-                f"Metrics: FPR={FPR:.4f}, TNR={TNR:.4f}, TPR={TPR:.4f}, FNR={FNR:.4f}, "
-                f"F1={F1:.4f}, Precision={PREC:.4f}, Accuracy={ACCU:.4f}, MCC={MCC:.4f}, Recall={RECALL:.4f}."
+                f"Benign: {epoch_label_counts['Benign']}. "
+                f"Malicious: {epoch_label_counts[MALICIOUS]}. "
+                f"Metrics: FPR={FPR:.4f}, TNR={TNR:.4f}, "
+                f"TPR={TPR:.4f}, FNR={FNR:.4f}, "
+                f"F1={F1:.4f}, Precision={PREC:.4f}, "
+                f"Accuracy={ACCU:.4f}, MCC={MCC:.4f}, Recall={RECALL:.4f}."
             )
         except Exception:
             self.print("Error in train().", 0, 1)
@@ -345,9 +351,9 @@ def process_training_flows(self, last_number_of_flows_when_trained):
                         "sbytes": 25517,
                         "dbytes": 17247,
                         "appproto": "ssl",
-                        "ground_truth_label": "Malicious",
+                        "ground_truth_label": MALICIOUS,
                         "module_labels": {
-                            "flowalerts-long-connection": "Malicious"
+                            "flowalerts-long-connection": MALICIOUS
                         },
                     }
                 )
@@ -366,9 +372,9 @@ def process_training_flows(self, last_number_of_flows_when_trained):
                         "sbytes": 100,
                         "dbytes": 67596,
                         "appproto": "http",
-                        "ground_truth_label": "Benign",
+                        "ground_truth_label": BENIGN,
                         "module_labels": {
-                            "flowalerts-long-connection": "Benign"
+                            "flowalerts-long-connection": BENIGN
                         },
                     }
                 )
@@ -421,7 +427,8 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
                 "ground_truth_label",
                 "detailed_ground_truth_label",
             ]
-            # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes.
+            # For argus binetflows this fails because ther is a field calle
+            # bytes that was not in other flows. It should be called allbytes.
             # Error
             """ [Flow ML Detection] Error in detect() while processing
             dur proto  sport dport  state  pkts  spkts  dpkts  bytes  sbytes  dbytes  allbytes
@@ -546,8 +553,8 @@ def main(self):
             self.twid = msg["twid"]
             self.profileid = msg["profileid"]
             self.flow = msg["flow"]
-            # These following extra fields are expected in testing. update the original
-            # flow dict to have them
+            # These following extra fields are expected in testing.
+            # update the original flow dict to have them
             self.flow.update(
                 {
                     "state": msg["interpreted_state"],
@@ -612,7 +619,7 @@ def main(self):
                         # an error occurred
                         return
 
-                    if pred[0] == "Malicious":
+                    if pred[0] == MALICIOUS:
                         # Generate an alert
                         self.set_evidence_malicious_flow(self.flow, self.twid)
                         self.print(
@@ -642,26 +649,18 @@ def main(self):
 
                         # Update counters based on predictions and labels
                         if (
-                            pred[0] == "Malicious"
-                            and original_label == "Malicious"
+                            pred[0] == MALICIOUS
+                            and original_label == MALICIOUS
                         ):
                             self.tp += 1
-                        elif (
-                            pred[0] == "Benign" and original_label == "Benign"
-                        ):
+                        elif pred[0] == BENIGN and original_label == BENIGN:
                             self.tn += 1
-                        elif (
-                            pred[0] == "Malicious"
-                            and original_label == "Benign"
-                        ):
+                        elif pred[0] == MALICIOUS and original_label == BENIGN:
                             self.fp += 1
                             self.write_to_log(
                                 f"False Positive Flow: {self.flow}"
                             )
-                        elif (
-                            pred[0] == "Benign"
-                            and original_label == "Malicious"
-                        ):
+                        elif pred[0] == BENIGN and original_label == MALICIOUS:
                             self.fn += 1
                             self.write_to_log(
                                 f"False Negative Flow: {self.flow}"
diff --git a/slips_files/core/structures/labels.py b/slips_files/core/structures/labels.py
new file mode 100644
index 0000000000..b1dc64234e
--- /dev/null
+++ b/slips_files/core/structures/labels.py
@@ -0,0 +1,11 @@
+from enum import Enum
+
+
+class Label(Enum):
+    """
+    label of flows should be one of the following
+    """
+
+    MALICIOUS = "Malicious"
+    BENIGN = "Benign"
+    BACKGROUND = "Background"

From 31a49bdefd4bbb1cfe7834b59dbfc9e137a66418 Mon Sep 17 00:00:00 2001
From: alya <alyaggomaa@gmail.com>
Date: Mon, 12 May 2025 20:57:25 +0300
Subject: [PATCH 448/455] set the config label as the GT label if not found in
 the given file

---
 modules/flowmldetection/flowmldetection.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index e828058ee4..c2b184cb10 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -98,7 +98,7 @@ def read_configuration(self):
         self.mode = conf.get_ml_mode()
         # This is the global label in the configuration,
         # in case the flows do not have a label themselves
-        self.label = conf.label()
+        self.ground_truth_config_label = conf.label()
         self.enable_logs: bool = conf.create_performance_metrics_log_files()
 
     def write_to_log(self, message: str):
@@ -610,9 +610,15 @@ def main(self):
                 # After processing the flow, it may happen that we
                 # delete icmp/arp/etc so the dataframe can be empty
                 if processed_flow is not None and not processed_flow.empty:
-                    original_label = processed_flow["ground_truth_label"].iloc[
-                        0
-                    ]
+                    try:
+                        original_label = processed_flow[
+                            "ground_truth_label"
+                        ].iloc[0]
+                    except KeyError:
+                        # If there are no labels in the flows, the default
+                        # label should be the one in the config file.
+                        original_label = self.ground_truth_config_label
+
                     # Predict
                     pred: numpy.ndarray = self.detect(processed_flow)
                     if not pred:

From a6ad940c2b134f1dd220e07f4f2d16419d545f08 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Tue, 20 May 2025 11:13:27 +0000
Subject: [PATCH 449/455] By default train and store logs

---
 config/slips.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/config/slips.yaml b/config/slips.yaml
index ac2010e6b4..635df7f918 100644
--- a/config/slips.yaml
+++ b/config/slips.yaml
@@ -212,10 +212,10 @@ flowmldetection:
   # training the models, to test in unknown data.
   # You should have trained at least once with 'Normal' data and once with
   # 'Malicious' data in order for the test to work.
-  mode: test
+  mode: train
   # creates an extra log file called training.log/testing.log in the
   # ouptput dir with performance metrics depending on the mode.
-  create_performance_metrics_log_files: False
+  create_performance_metrics_log_files: True
 
 #############################
 virustotal:

From c7ab0a2c2ee14ddc5b009bc4011095b0ae2044f4 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Tue, 20 May 2025 11:13:56 +0000
Subject: [PATCH 450/455] Fix the labels to .value

---
 modules/flowmldetection/flowmldetection.py | 28 ++++------------------
 1 file changed, 5 insertions(+), 23 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index c2b184cb10..4ef661146e 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -44,9 +44,9 @@ def warn(*args, **kwargs):
 
 warnings.warn = warn
 
-BACKGROUND = Label.BACKGROUND.name
-BENIGN = Label.BENIGN.name
-MALICIOUS = Label.MALICIOUS.name
+BACKGROUND = Label.BACKGROUND.value
+BENIGN = Label.BENIGN.value
+MALICIOUS = Label.MALICIOUS.value
 
 
 class FlowMLDetection(IModule):
@@ -287,7 +287,7 @@ def process_features(self, dataset):
                 r"(^.*arp.*$)", "4", regex=True
             )
 
-            dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"]
+            dataset["bytes"] = dataset["sbytes"] + dataset["dbytes"]
             dataset["pkts"] = dataset["spkts"] + dataset["dpkts"]
 
             fields_to_convert_to_float = [
@@ -297,7 +297,7 @@ def process_features(self, dataset):
                 dataset.dur,
                 dataset.pkts,
                 dataset.spkts,
-                dataset.allbytes,
+                dataset.bytes,
                 dataset.sbytes,
                 dataset.state,
             ]
@@ -427,24 +427,6 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
                 "ground_truth_label",
                 "detailed_ground_truth_label",
             ]
-            # For argus binetflows this fails because ther is a field calle
-            # bytes that was not in other flows. It should be called allbytes.
-            # Error
-            """ [Flow ML Detection] Error in detect() while processing
-            dur proto  sport dport  state  pkts  spkts  dpkts  bytes  sbytes  dbytes  allbytes
-            0  63.822830     0  56119   981    0.0    15     15      0   8764    1887       0      1887
-            The feature names should match those that were passed during fit.
-            Feature names unseen at fit time:
-            - bytes
-            """
-
-            # IF we delete here the filed bytes the error is
-            # [Flow ML Detection] Error in detect() while processing
-            # dur proto sport dport  state  pkts  spkts  dpkts  sbytes  dbytes allbytes
-            # 0  63.822830     0  56120   980    0.0    15     15      0    1887       0      1887
-            # The feature names should match those that were passed during fit.
-            # Feature names must be in the same order as they were in fit.
-
             for field in fields_to_drop:
                 try:
                     x_flow = x_flow.drop(field, axis=1)

From 992496bc616e00620b7f37f5043c7941da5b9505 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Thu, 29 May 2025 11:32:43 +0000
Subject: [PATCH 451/455] Fix plot testing bug with zeros

---
 modules/flowmldetection/plot_testing_performance.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index 6865415cdf..dc649b5996 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -121,13 +121,15 @@ def plot_single_group(metrics_dict, output_filename, experiment_number, is_close
     if is_close_to_0:
         min_val = min(min(metrics_dict['FPR']), min(metrics_dict['FNR']))
         max_val = max(max(metrics_dict['FPR']), max(metrics_dict['FNR']))
-        
+
         # Avoid log(0), so set the minimum limit a little higher than zero
         if min_val == 0:
             min_val = 1e-4  # Avoid zero values on the logarithmic scale
 
         plt.ylim(min_val, max_val)  # Set Y-axis limits based on the data range
-        plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=60))  # Set ticks logarithmically
+        # Ensure ticks are within the valid range
+        if min_val > 0 and max_val > 0:
+            plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6))  # Set ticks logarithmically
 
     # Add the experiment number to the plot title
     plt.xlabel('Index')

From fe02fc0e4e476f560e96ad1d2d2d9d99333b0854 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Fri, 11 Jul 2025 09:53:56 +0000
Subject: [PATCH 452/455] Improve the testing of performance

---
 .../plot_testing_performance.py               | 87 ++++++++++++-------
 1 file changed, 55 insertions(+), 32 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index dc649b5996..f0f9b8f2d0 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -15,42 +15,59 @@ def process_file(file_path):
     MCC_values = []
     recall_values = []
     
+    # Counters for error tracking
+    total_lines = 0
+    error_lines = 0
+    unusual_lines = 0
+
     # Read the file and extract the data
     with open(file_path, 'r') as file:
         for line in file:
+            total_lines += 1
             if "TP:" in line:
-                # Extract the values from the line
-                parts = line.split(',')
-                TP = int(parts[0].split(':')[1].strip())
-                TN = int(parts[1].split(':')[1].strip())
-                FP = int(parts[2].split(':')[1].strip())
-                FN = int(parts[3].split(':')[1].strip())
-
-                # Calculate metrics
-                FPR = FP / (FP + TN) if (FP + TN) != 0 else 0
-                FNR = FN / (FN + TP) if (FN + TP) != 0 else 0
-                TNR = TN / (TN + FP) if (TN + FP) != 0 else 0
-                TPR = TP / (TP + FN) if (TP + FN) != 0 else 0
-                Precision = TP / (TP + FP) if (TP + FP) != 0 else 0
-                Recall = TPR  # Recall is the same as TPR
-                F1 = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) != 0 else 0
-                Accuracy = (TP + TN) / (TP + TN + FP + FN)
-                MCC = ((TP * TN) - (FP * FN)) / np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) if ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) != 0 else 0
+                try:
+                    # Extract the values from the line
+                    parts = line.split(',')
+                    TP = int(parts[0].split(':')[1].strip())
+                    TN = int(parts[1].split(':')[1].strip())
+                    FP = int(parts[2].split(':')[1].strip())
+                    FN = int(parts[3].split(':')[1].strip())
+
+                    # Calculate metrics
+                    FPR = FP / (FP + TN) if (FP + TN) != 0 else 0
+                    FNR = FN / (FN + TP) if (FN + TP) != 0 else 0
+                    TNR = TN / (TN + FP) if (TN + FP) != 0 else 0
+                    TPR = TP / (TP + FN) if (TP + FN) != 0 else 0
+                    Precision = TP / (TP + FP) if (TP + FP) != 0 else 0
+                    Recall = TPR  # Recall is the same as TPR
+                    F1 = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) != 0 else 0
+                    Accuracy = (TP + TN) / (TP + TN + FP + FN)
+                    MCC = ((TP * TN) - (FP * FN)) / np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) if ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) != 0 else 0
                 
-                # Append the values to the respective lists
-                FPR_values.append(FPR)
-                FNR_values.append(FNR)
-                TNR_values.append(TNR)
-                TPR_values.append(TPR)
-                F1_values.append(F1)
-                accuracy_values.append(Accuracy)
-                precision_values.append(Precision)
-                MCC_values.append(MCC)
-                recall_values.append(Recall)
-    
-    return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values
+                    # Append the values to the respective lists
+                    FPR_values.append(FPR)
+                    FNR_values.append(FNR)
+                    TNR_values.append(TNR)
+                    TPR_values.append(TPR)
+                    F1_values.append(F1)
+                    accuracy_values.append(Accuracy)
+                    precision_values.append(Precision)
+                    MCC_values.append(MCC)
+                    recall_values.append(Recall)
 
-def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, experiment_number):
+                except Exception as e:
+                    error_lines += 1
+                    print(f"Error in line {total_lines}: {e}")
+                    continue
+                
+                # Check for any unusual cases
+                if any(np.isnan([FPR, FNR, TNR, TPR, F1, Accuracy, Precision, MCC, Recall])):
+                    unusual_lines += 1
+                    print(f"Unusual values in line {total_lines}: NaN values found")
+
+    return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, total_lines, error_lines, unusual_lines
+
+def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, experiment_number, total_lines, error_lines, unusual_lines):
     # Separate the values into two groups based on their proximity to 0 or 1
     close_to_0 = {
         'FPR': [], 'FNR': []
@@ -90,6 +107,12 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu
     print(f"Final MCC: {MCC_values[-1]:.4f}")
     print(f"Final Recall: {recall_values[-1]:.4f}")
 
+    # Print summary statistics
+    print(f"\nSummary for Experiment {experiment_number}:")
+    print(f"Total lines read: {total_lines}")
+    print(f"Lines with errors: {error_lines}")
+    print(f"Unusual lines (NaN values): {unusual_lines}")
+
 def plot_single_group(metrics_dict, output_filename, experiment_number, is_close_to_0=False):
     plt.figure(figsize=(12, 8))
     
@@ -152,8 +175,8 @@ def main():
     file_path = args.file
     experiment_number = args.experiment
     
-    FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path)
-    plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, experiment_number)
+    FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, total_lines, error_lines, unusual_lines = process_file(file_path)
+    plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, experiment_number, total_lines, error_lines, unusual_lines)
 
 if __name__ == "__main__":
     main()

From 480c398dc578d11863e23c8d6a11f22f920286e1 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Fri, 3 Oct 2025 08:30:48 +0000
Subject: [PATCH 453/455] Update temporary model after latest train

---
 modules/flowmldetection/model.bin | Bin 1124 -> 1376 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index a6648cf72179520975b0e9ad1164f7d574e87140..bb6d471c12e299775e308ecffe6231d20a68614d 100644
GIT binary patch
delta 468
zcmV;_0W1FG2;d3@fCQCQ1+fJq0$BrvlxQQ21!Q!2aAlN}l#ByaF));gh=-I?lqgFB
zi~&59PEJlsC`$xO2y~QUi9>io0001C0001E0001M0001I0001T0001Q0001W0001P
z0001F0001!SOO%IcmgN^0F#{pG64gV&;lq47ytkO0002+Ka=?aE(8Nh3x<=?0w#Y#
z2mt^9VH!esD<I>)l#8~zU<=kitBd)tjQifd=%5y{8nf}gTK#q0ZbP0w1eN4Z6B^3D
zdT>qBmRh|(&ib<JeQfu?v{9t;m9*r)hzF4zuYl`6V<C2;1bVW+oG%1LQ8wm3v*JIM
za3TOekvB|kvqjFo_JJ;|s8I1gS5|-c<<BkNKdEy@7rN5yzc<GgJ_@thzYel~d_v6_
zK*u6C2R$F`KSH_1HNpcBz>EGe#&}E6zhWwhH_|!Xzdg^FWtRK}z^k-B5~rHQKeI~2
z#*byUzf-~~nj}@lKeZDj7-4AfKMO=*3#Sg@KLd69ceIuFzi*DKYLgSvzsU|S%gVDl
z<iGbIXy-zNy_2c~Bp?HYlxRkYLl~g3btOFJ0>GljIUu^^-@k!@qUochmOqo;15^Y-
K6~ly+HUut&JHl=N

delta 217
zcmV;~04D$73gid{fCQCU1hEAp0x$xFlxQQ21!Q!2aAlN}l#BvZIh2Wrhm=y3C`$v3
z0X&pWPEJcCO9V>@bd+L=Lr9Z>0wj~E0wj~s0xtmqlK}%L0T7ci11<pplVJlXSwmQ*
z&P*9hb`!w*mV~|wLl?l5mFM4w$NRqlOAcXmpyfaAG1();YA3+ulRv$a;~zkIU#E>o
zcI-ba#L|>%Hv~ZN`Y<G(@KnDch(0mn<s!fm-ctwS<Lr}`1SAjvg_LMJi9-k>9Z*;7
TCnUg=+5}VtK<@DXlQIP^UwKv4


From d4675c0a4f4304e4bcb6720a4371da90560d38e7 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Fri, 3 Oct 2025 08:30:57 +0000
Subject: [PATCH 454/455] Update temporary scaler after latest train

---
 modules/flowmldetection/scaler.bin | Bin 890 -> 887 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
index 17115724b9536f6093f9d72f3b58a5c22c562a9a..a62890d0b0b019a02a84ad2b0ff861c378dba82a 100644
GIT binary patch
delta 302
zcmeyx_MMHTfn{pWMivJ~M%Kw5jM+@g43oDq`iEzv)G}OnDCEEZ1*=!BT9qaG!XXp{
zd}l!DP-O>@_^MSJV08^!kAV4E{CA-GKG=hli<&-$^35hkGpU7NJYc>4w~{ebjDg`q
zq0DhrK0U``nd5~&w|Y7Pxj^LV@Ytg7u?1f|$V{Qizq1{IqSrhQ9d~4af+CsYu?JKR
zIfC8bRXq6ylScT-MS}l2>II=@f3QElNKZzGx!>VV#JT@}*UfQ|l(tvmOOtZwy)EWZ
u@1^G8G%-@4Tl1*HUdK1qhbG;I>tj5<_++uf4Tp96OU;%{G@KmHtPB8KS8H7W

delta 305
zcmV-10nYyS2KojBfCQCqu>?Q?1PEblY?DO+YXW-(ld=I%SGXcJ>b7ZMK%g&y^bA{P
zKSt)3zw#dLK%p_vF&##uK+P0N9`1!QK(D}{{TAjcK;bH{z@TriKo{mo(f{JIK;I}}
zgOE7xKYWP%NKmQ4K$1F*&U`#TlUM>KSKJP<820ERK@bAa4gpGMKR>OMx}hRxK{M=6
zv(jsGK{@t2q?-|%KnIzEbxq24Kq}lf4=@0pLA97c_0;#zK?}1s3)$hSKi+W2x_dU#
zL4x+q5;T^nlhpz!SCCH4+`&DGK%YiiXq`-}KS?vLDu6YsK$Vk(#kQcaKz&31K1^&x
zK*urfpL`fIKz5|BUcczYK%idWH|qP}K%c;h+h$bFKXB%NfD#hnK+$a-p+B8WlT-sE
DDj1Bc


From 9c6ae9162ad8df4f556165612aadb3b6f9a6ee24 Mon Sep 17 00:00:00 2001
From: Seba Garcia <sebastian.garcia@agents.fel.cvut.cz>
Date: Wed, 5 Nov 2025 10:25:24 +0000
Subject: [PATCH 455/455] New model.bin model

---
 modules/flowmldetection/model.bin | Bin 1376 -> 1376 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
index bb6d471c12e299775e308ecffe6231d20a68614d..5c305d38340560647ab0e165c12a91177a9d41c4 100644
GIT binary patch
delta 312
zcmV-80muH}3g8N`j{|?BD=C{t`QN{#yJevcJj_2ybu_XJg4w^76fCY!tn9xXHX@t1
z{iHwPHgJQ>Y`VYL@|+q%rm{a3Ii`iVs`$Uu?0UhEDc-*yHfRqPN9R8_+6GX)Wy8O2
zC$u)|R@*-n0CRdz_4_}n-!$F1yw$%4YG6}T-r_$rb|o2CA>@BQ5AIQ~iIwfY$MbSJ
zVV2mxXQf&Dc#If8uJsVkVUyiI|5cp@RGtvP;S0ro<we84=c!v~l|SLXa4XD2?pFlB
zgPd=Gk-W4&j?(2VvnhnX52_+i1Dw%6%aFMUgjexD{>!nI!mH#zrC>~QO7Hf+Gf|1O
zaKg*K)D2>CGBOF_zXq44Qa#zwla>T57vA?Iw?vHrz|C3+)w#UkztYwS^oJV3li37U
K0+<++G6gQ4zMo+L

delta 312
zcmV-80muH}3g8N`j{|>U8bWw0AmhK3i?+L93)VlYi}|pO``*9kpcb(jv+=)L{dL@K
zL!LhbmE=zo8p^+Va81&dTD?Ec`m*bNZ1=ykQKa&fwB)~t2az1Ffa^bFA$Fn!da}Qq
zF9bzVHs(LG;y;yeA^<><H%xA`Mb5wWfiA14Q1L%kR`})5E#7}WsdGjby3*{wH^&w}
z3bWb24zhlHLd_UJ$09ceJs<2pLb=5?!UGV%i~cgkcuUZ~Vk(I@(mCC~J<peAmiz_4
ztF%87r<%n-vr5Fqk7c*NQ^F~lBvr*fwG$;6VQBF`3q)ZHrw-vi19kg%w3YY2Z;q>K
zlM~Xv$uG;wvpNanzxN<$=R$<Nla>T57of3qB|PQ=z@o=FAiCt=zkz|G>7%8Vli37U
K0>gxpG6gPim7N*@