From 3070c023f49abc6fce93af36a6740cd8034a09ca Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 001/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 155 ++++++++++++++++++++- 1 file changed, 152 insertions(+), 3 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index b3e98c5150..124ec61f91 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -5,6 +5,7 @@ import json import datetime import traceback +import sys from slips_files.common.imports import * from slips_files.core.evidence_structure.evidence import ( @@ -112,6 +113,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -124,14 +260,18 @@ def process_features(self, dataset): for proto in to_discard: dataset = dataset[dataset.proto != proto] - # For now, discard the ports + # For now, discard these to_drop = [ "appproto", "daddr", "saddr", "ts", "origstate", - "flow_type", + "type_", + "dir_", + "history", + "dbytes", + "dpkts", "smac", "dmac", ] @@ -141,13 +281,22 @@ def process_features(self, dataset): except ValueError: pass + # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # So transform here + #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) + dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) + + #dataset.state = new_state_column + # Convert state to categorical dataset.state = dataset.state.str.replace( - r"(^.*NotEstablished.*$)", "0", regex=True + r"(^.*Not Established.*$)", "0", regex=True ) dataset.state = dataset.state.str.replace( r"(^.*Established.*$)", "1", regex=True ) + + # Convert categories to floats dataset.state = dataset.state.astype("float64") # Convert proto to categorical. For now we only have few states, so we can hardcode... From df6e9196532d0ba050f4922a36ed1d2b1a2638b5 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:35:21 +0300 Subject: [PATCH 002/455] state_handler: split get_final_state_from_flags() into smaller functions --- slips_files/common/state_handler.py | 179 ++++++++++++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 slips_files/common/state_handler.py diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py new file mode 100644 index 0000000000..b671a09a28 --- /dev/null +++ b/slips_files/common/state_handler.py @@ -0,0 +1,179 @@ +from typing import Optional +import sys +import traceback + + +def check_suricata_states(state) -> Optional[str]: + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for + UDP. For TCP, + these are: New, Established and Closed,for UDP only new and + established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + +def check_zeek_states(state) -> Optional[str]: + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + +def check_argus_states(state) -> Optional[str]: + pre = state.split("_")[0] + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + + +def check_tcp_states(state, pkts) -> Optional[str]: + pre = state.split("_")[0] + if "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. + # Most connections are reseted when finished and therefore are + # established + # It can happen that is reseted being not established, but we + # can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is + # not established because the OS retries 3 times. + return "Not Established" if int(pkts) <= 3 else "Established" + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. + # Most connections are finished with FIN when finished and + # therefore are established + # It can happen that is finished being not established, but we + # can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is + # not established because the OS retries 3 times. + return "Not Established" if int(pkts) <= 3 else "Established" + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + + +def check_udp_states(state) -> Optional[str]: + pre = state.split("_")[0] + if "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also + # NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + + +def check_icmp_states(state) -> Optional[str]: + pre = state.split("_")[0] + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + + +def get_final_state_from_flags(self, state, pkts) -> str: + """ + Analyze the flags given and return a summary of the state. + Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + if state := check_suricata_states(state): + return state + if state := check_zeek_states(state): + return state + if state := check_argus_states(state): + return state + except IndexError: + # suf does not exist, which means that this is some ICMP or + # no response was sent for UDP or TCP + if state := check_icmp_states(state): + return state + if state := check_udp_states(state): + return state + if state := check_tcp_states(state, pkts): + return state + + return "Not Established" + + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() " f"line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) From 1497681f7a9b2f7d20a1c1e570646ca3b2c2bdbc Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:48:16 +0300 Subject: [PATCH 003/455] state_handler: refactor get_final_state_from_flags() --- slips_files/common/state_handler.py | 67 +++++++++++++---------------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index b671a09a28..d0a05115bd 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,9 +1,7 @@ from typing import Optional -import sys -import traceback -def check_suricata_states(state) -> Optional[str]: +def interpret_suricata_states(state) -> Optional[str]: """ There are different states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]: return "Not Established" -def check_zeek_states(state) -> Optional[str]: +def interpret_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. # For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]: return "Established" -def check_argus_states(state) -> Optional[str]: +def interpret_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - suf = state.split("_")[1] + try: + suf = state.split("_")[1] + except IndexError: + return + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]: return "Not Established" -def check_tcp_states(state, pkts) -> Optional[str]: +def interpret_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def check_udp_states(state) -> Optional[str]: +def interpret_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]: return "Not Established" -def check_icmp_states(state) -> Optional[str]: +def interpret_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(self, state, pkts) -> str: +def get_final_state_from_flags(state, pkts) -> str: """ - Analyze the flags given and return a summary of the state. - Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections + Converts the original flags from the flow, to a state that slips + understands + Works with Argus, suricata, and Bro flags + We receive the packets to distinguish some Reset connections """ - try: - if state := check_suricata_states(state): - return state - if state := check_zeek_states(state): - return state - if state := check_argus_states(state): - return state - except IndexError: - # suf does not exist, which means that this is some ICMP or - # no response was sent for UDP or TCP - if state := check_icmp_states(state): - return state - if state := check_udp_states(state): - return state - if state := check_tcp_states(state, pkts): - return state - return "Not Established" + for interpreter in ( + interpret_suricata_states, + interpret_zeek_states, + interpret_argus_states, + interpret_icmp_states, + interpret_udp_states, + ): + if interpreted_state := interpreter(state): + return interpreted_state + + if interpreted_state := interpret_tcp_states(state, pkts): + return interpreted_state - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() " f"line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) + return "Not Established" From 2eeb3ceb889625d179e07beb5e01e589d553ccf2 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 004/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 169 ++---------------- slips_files/core/database/database_manager.py | 3 - .../core/database/redis_db/profile_handler.py | 169 ++---------------- .../core/database/sqlite_db/database.py | 6 +- 4 files changed, 41 insertions(+), 306 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 124ec61f91..c57a7a3581 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -5,9 +5,13 @@ import json import datetime import traceback -import sys +import warnings + -from slips_files.common.imports import * +from slips_files.common.state_handler import get_final_state_from_flags +from slips_files.common.parsers.config_parser import ConfigParser +from slips_files.common.slips_utils import utils +from slips_files.common.abstracts.module import IModule from slips_files.core.evidence_structure.evidence import ( Evidence, ProfileID, @@ -29,8 +33,6 @@ def warn(*args, **kwargs): pass -import warnings - warnings.warn = warn @@ -113,141 +115,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -281,12 +148,17 @@ def process_features(self, dataset): except ValueError: pass - # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others # So transform here - #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) - dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) - - #dataset.state = new_state_column + dataset["state"] = dataset.apply( + lambda row: get_final_state_from_flags( + row["state"], row["pkts"] + ), + axis=1, + ) + # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -370,7 +242,7 @@ def process_flows(self): try: # We get all the flows so far # because this retraining happens in batches - flows = self.db.get_all_flows() + flows: list = self.db.get_all_flows() # Check how many different labels are in the DB # We need both normal and malware @@ -464,7 +336,7 @@ def detect(self): """ try: # Store the real label if there is one - y_flow = self.flow["label"] + # y_flow = self.flow["label"] # remove the real label column self.flow = self.flow.drop("label", axis=1) # remove the label predictions column of the other modules @@ -568,13 +440,10 @@ def pre_main(self): def main(self): if msg := self.get_msg("new_flow"): data = msg["data"] - # Convert from json to dict data = json.loads(data) - profileid = data["profileid"] + # profileid = data["profileid"] twid = data["twid"] - # Get flow that is now in json format flow = data["flow"] - # Convert flow to a dict flow = json.loads(flow) # Convert the common fields to something that can # be interpreted diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index fe7b02d046..f1ef1290c7 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -569,9 +569,6 @@ def add_out_dns(self, *args, **kwargs): def add_port(self, *args, **kwargs): return self.rdb.add_port(*args, **kwargs) - def get_final_state_from_flags(self, *args, **kwargs): - return self.rdb.get_final_state_from_flags(*args, **kwargs) - def add_ips(self, *args, **kwargs): return self.rdb.add_ips(*args, **kwargs) diff --git a/slips_files/core/database/redis_db/profile_handler.py b/slips_files/core/database/redis_db/profile_handler.py index d785b51c98..23c23d3d42 100644 --- a/slips_files/core/database/redis_db/profile_handler.py +++ b/slips_files/core/database/redis_db/profile_handler.py @@ -14,7 +14,7 @@ import redis import validators - +from slips_files.common.state_handler import get_final_state_from_flags from slips_files.common.abstracts.observer import IObservable from slips_files.core.output import Output @@ -324,14 +324,15 @@ def add_port( state_hist = flow.state_hist if hasattr(flow, "state_hist") else "" if "^" in state_hist: - # The majority of the FP with horizontal port scan detection happen because a - # benign computer changes wifi, and many not established conns are redone, - # which look like a port scan to 10 webpages. To avoid this, we IGNORE all - # the flows that have in the history of flags (field history in zeek), the ^, + # The majority of the FP with horizontal port scan detection + # happen because a benign computer changes wifi, and many not + # established conns are redone, which look like a port scan to + # 10 webpages. To avoid this, we IGNORE all the flows that have + # in the history of flags (field history in zeek), the ^, # that means that the flow was swapped/flipped. - # The below key_name is only used by the portscan module to check for horizontal - # portscan, which means we can safely ignore it here and it won't affect the rest - # of slips + # The below key_name is only used by the portscan module to + # check for horizontal portscan, which means we can safely + # ignore it here and it won't affect the rest of slips return False # Choose which port to use based if we were asked Dst or Src @@ -342,10 +343,10 @@ def add_port( ip_key = "srcips" if role == "Server" else "dstips" # Get the state. Established, NotEstablished - summaryState = self.get_final_state_from_flags(state, pkts) + state = get_final_state_from_flags(state, pkts) old_profileid_twid_data = self.get_data_from_profile_tw( - profileid, twid, port_type, summaryState, proto, role, "Ports" + profileid, twid, port_type, state, proto, role, "Ports" ) try: @@ -355,7 +356,8 @@ def add_port( port_data["totalpkt"] += pkts port_data["totalbytes"] += totbytes - # if there's a conn from this ip on this port, update the pkts of this conn + # if there's a conn from this ip on this port, update the pkts + # of this conn if ip in port_data[ip_key]: port_data[ip_key][ip]["pkts"] += pkts port_data[ip_key][ip]["spkts"] += spkts @@ -386,145 +388,10 @@ def add_port( old_profileid_twid_data[port] = port_data data = json.dumps(old_profileid_twid_data) hash_key = f"{profileid}{self.separator}{twid}" - key_name = f"{port_type}Ports{role}{proto}{summaryState}" + key_name = f"{port_type}Ports{role}{proto}{state}" self.r.hset(hash_key, key_name, str(data)) self.mark_profile_tw_as_modified(profileid, twid, starttime) - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in getFinalStateFromFlags() in database.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) - def get_data_from_profile_tw( self, profileid: str, @@ -722,14 +589,14 @@ def add_ips(self, profileid, twid, flow, role): self.update_times_contacted(ip, direction, profileid, twid) # Get the state. Established, NotEstablished - summaryState = self.get_final_state_from_flags(flow.state, flow.pkts) - key_name = f"{direction}IPs{role}{flow.proto.upper()}{summaryState}" + state = get_final_state_from_flags(flow.state, flow.pkts) + key_name = f"{direction}IPs{role}{flow.proto.upper()}{state}" # Get the previous data about this key old_profileid_twid_data = self.get_data_from_profile_tw( profileid, twid, direction, - summaryState, + state, flow.proto, role, "IPs", @@ -806,7 +673,7 @@ def add_flow( The profileid is the main profile that this flow is related too. : param new_profile_added : is set to True for everytime we see a new srcaddr """ - summary_state = self.get_final_state_from_flags(flow.state, flow.pkts) + summary_state = get_final_state_from_flags(flow.state, flow.pkts) flow_dict = { "ts": flow.starttime, "dur": flow.dur, diff --git a/slips_files/core/database/sqlite_db/database.py b/slips_files/core/database/sqlite_db/database.py index 4792ea67c9..4dd52dbfc1 100644 --- a/slips_files/core/database/sqlite_db/database.py +++ b/slips_files/core/database/sqlite_db/database.py @@ -31,11 +31,13 @@ def connect(self): """ db_newly_created = False if not os.path.exists(self._flows_db): - # db not created, mark it as first time accessing it so we can init tables once we connect + # db not created, mark it as first time accessing it so we can + # init tables once we connect db_newly_created = True self._init_db() - # you can get multithreaded access on a single pysqlite connection by passing "check_same_thread=False" + # you can get multithreaded access on a single pysqlite connection + # by passing "check_same_thread=False" self.conn = sqlite3.connect( self._flows_db, check_same_thread=False, timeout=20 ) From f0eb12f0053b15d98e426a5459374d82f2919807 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:36:55 +0200 Subject: [PATCH 005/455] mlflow. Ignore UID column --- modules/flowmldetection/flowmldetection.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index c57a7a3581..e2aa1e0ee3 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -141,6 +141,7 @@ def process_features(self, dataset): "dpkts", "smac", "dmac", + "uid", ] for field in to_drop: try: From 6bc8351cf891c12bf16f4d298a8f3f50c0506850 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 006/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 154 ++++++++++++++++++++- 1 file changed, 150 insertions(+), 4 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 9e0aa772cd..c8226368c7 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -11,6 +11,7 @@ import datetime import traceback import warnings +import sys from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils @@ -121,6 +122,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -133,7 +269,7 @@ def process_features(self, dataset): for proto in to_discard: dataset = dataset[dataset.proto != proto] - # For now, discard the ports + # For now, discard these to_drop = [ "appproto", "daddr", @@ -156,15 +292,25 @@ def process_features(self, dataset): except (ValueError, KeyError): pass + # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # So transform here + #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) + dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) + + #dataset.state = new_state_column + # Convert state to categorical dataset.state = dataset.state.str.replace( - r"(^.*NotEstablished.*$)", "0", regex=True + r"(^.*Not Established.*$)", "0", regex=True ) dataset.state = dataset.state.str.replace( r"(^.*Established.*$)", "1", regex=True ) - # Convert proto to categorical. For now we only have few states, - # so we can hardcode... + + # Convert categories to floats + dataset.state = dataset.state.astype("float64") + + # Convert proto to categorical. For now we only have few states, so we can hardcode... # We dont use the data to create categories because in testing mode # we dont see all the protocols # Also we dont store the Categorizer because the user can retrain From 5489ab209a6c96f03f8afd73c7ce7f31a78382f2 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:35:21 +0300 Subject: [PATCH 007/455] state_handler: split get_final_state_from_flags() into smaller functions --- slips_files/common/state_handler.py | 179 ++++++++++++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 slips_files/common/state_handler.py diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py new file mode 100644 index 0000000000..b671a09a28 --- /dev/null +++ b/slips_files/common/state_handler.py @@ -0,0 +1,179 @@ +from typing import Optional +import sys +import traceback + + +def check_suricata_states(state) -> Optional[str]: + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for + UDP. For TCP, + these are: New, Established and Closed,for UDP only new and + established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + +def check_zeek_states(state) -> Optional[str]: + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + +def check_argus_states(state) -> Optional[str]: + pre = state.split("_")[0] + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + + +def check_tcp_states(state, pkts) -> Optional[str]: + pre = state.split("_")[0] + if "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. + # Most connections are reseted when finished and therefore are + # established + # It can happen that is reseted being not established, but we + # can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is + # not established because the OS retries 3 times. + return "Not Established" if int(pkts) <= 3 else "Established" + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. + # Most connections are finished with FIN when finished and + # therefore are established + # It can happen that is finished being not established, but we + # can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is + # not established because the OS retries 3 times. + return "Not Established" if int(pkts) <= 3 else "Established" + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + + +def check_udp_states(state) -> Optional[str]: + pre = state.split("_")[0] + if "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also + # NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + + +def check_icmp_states(state) -> Optional[str]: + pre = state.split("_")[0] + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + + +def get_final_state_from_flags(self, state, pkts) -> str: + """ + Analyze the flags given and return a summary of the state. + Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + if state := check_suricata_states(state): + return state + if state := check_zeek_states(state): + return state + if state := check_argus_states(state): + return state + except IndexError: + # suf does not exist, which means that this is some ICMP or + # no response was sent for UDP or TCP + if state := check_icmp_states(state): + return state + if state := check_udp_states(state): + return state + if state := check_tcp_states(state, pkts): + return state + + return "Not Established" + + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() " f"line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) From 3776649a496c4b2b40962752b19f961e047f21bd Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:48:16 +0300 Subject: [PATCH 008/455] state_handler: refactor get_final_state_from_flags() --- slips_files/common/state_handler.py | 67 +++++++++++++---------------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index b671a09a28..d0a05115bd 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,9 +1,7 @@ from typing import Optional -import sys -import traceback -def check_suricata_states(state) -> Optional[str]: +def interpret_suricata_states(state) -> Optional[str]: """ There are different states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]: return "Not Established" -def check_zeek_states(state) -> Optional[str]: +def interpret_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. # For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]: return "Established" -def check_argus_states(state) -> Optional[str]: +def interpret_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - suf = state.split("_")[1] + try: + suf = state.split("_")[1] + except IndexError: + return + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]: return "Not Established" -def check_tcp_states(state, pkts) -> Optional[str]: +def interpret_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def check_udp_states(state) -> Optional[str]: +def interpret_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]: return "Not Established" -def check_icmp_states(state) -> Optional[str]: +def interpret_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(self, state, pkts) -> str: +def get_final_state_from_flags(state, pkts) -> str: """ - Analyze the flags given and return a summary of the state. - Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections + Converts the original flags from the flow, to a state that slips + understands + Works with Argus, suricata, and Bro flags + We receive the packets to distinguish some Reset connections """ - try: - if state := check_suricata_states(state): - return state - if state := check_zeek_states(state): - return state - if state := check_argus_states(state): - return state - except IndexError: - # suf does not exist, which means that this is some ICMP or - # no response was sent for UDP or TCP - if state := check_icmp_states(state): - return state - if state := check_udp_states(state): - return state - if state := check_tcp_states(state, pkts): - return state - return "Not Established" + for interpreter in ( + interpret_suricata_states, + interpret_zeek_states, + interpret_argus_states, + interpret_icmp_states, + interpret_udp_states, + ): + if interpreted_state := interpreter(state): + return interpreted_state + + if interpreted_state := interpret_tcp_states(state, pkts): + return interpreted_state - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() " f"line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) + return "Not Established" From f75e88b9a312b22ca6b14af438bd43a0a428a36c Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 009/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 150 ++---------------- slips_files/core/database/database_manager.py | 3 - 2 files changed, 10 insertions(+), 143 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index c8226368c7..9af514a709 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -122,141 +122,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -292,12 +157,17 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others # So transform here - #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) - dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) - - #dataset.state = new_state_column + dataset["state"] = dataset.apply( + lambda row: get_final_state_from_flags( + row["state"], row["pkts"] + ), + axis=1, + ) + # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index 98651fd08c..2f81ecd8ef 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -607,9 +607,6 @@ def add_out_dns(self, *args, **kwargs): def add_port(self, *args, **kwargs): return self.rdb.add_port(*args, **kwargs) - def get_final_state_from_flags(self, *args, **kwargs): - return self.rdb.get_final_state_from_flags(*args, **kwargs) - def add_ips(self, *args, **kwargs): return self.rdb.add_ips(*args, **kwargs) From 47193d79912875918ab9e5612b617b3c4ec42886 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 010/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 154 ++++++++++++++++++++- 1 file changed, 150 insertions(+), 4 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 9e0aa772cd..c8226368c7 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -11,6 +11,7 @@ import datetime import traceback import warnings +import sys from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils @@ -121,6 +122,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -133,7 +269,7 @@ def process_features(self, dataset): for proto in to_discard: dataset = dataset[dataset.proto != proto] - # For now, discard the ports + # For now, discard these to_drop = [ "appproto", "daddr", @@ -156,15 +292,25 @@ def process_features(self, dataset): except (ValueError, KeyError): pass + # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # So transform here + #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) + dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) + + #dataset.state = new_state_column + # Convert state to categorical dataset.state = dataset.state.str.replace( - r"(^.*NotEstablished.*$)", "0", regex=True + r"(^.*Not Established.*$)", "0", regex=True ) dataset.state = dataset.state.str.replace( r"(^.*Established.*$)", "1", regex=True ) - # Convert proto to categorical. For now we only have few states, - # so we can hardcode... + + # Convert categories to floats + dataset.state = dataset.state.astype("float64") + + # Convert proto to categorical. For now we only have few states, so we can hardcode... # We dont use the data to create categories because in testing mode # we dont see all the protocols # Also we dont store the Categorizer because the user can retrain From 0de55cb022a1c84cc642febf6383e9d314510a23 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:35:21 +0300 Subject: [PATCH 011/455] state_handler: split get_final_state_from_flags() into smaller functions --- slips_files/common/state_handler.py | 179 ++++++++++++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 slips_files/common/state_handler.py diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py new file mode 100644 index 0000000000..b671a09a28 --- /dev/null +++ b/slips_files/common/state_handler.py @@ -0,0 +1,179 @@ +from typing import Optional +import sys +import traceback + + +def check_suricata_states(state) -> Optional[str]: + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for + UDP. For TCP, + these are: New, Established and Closed,for UDP only new and + established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + +def check_zeek_states(state) -> Optional[str]: + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + +def check_argus_states(state) -> Optional[str]: + pre = state.split("_")[0] + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + + +def check_tcp_states(state, pkts) -> Optional[str]: + pre = state.split("_")[0] + if "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. + # Most connections are reseted when finished and therefore are + # established + # It can happen that is reseted being not established, but we + # can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is + # not established because the OS retries 3 times. + return "Not Established" if int(pkts) <= 3 else "Established" + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. + # Most connections are finished with FIN when finished and + # therefore are established + # It can happen that is finished being not established, but we + # can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is + # not established because the OS retries 3 times. + return "Not Established" if int(pkts) <= 3 else "Established" + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + + +def check_udp_states(state) -> Optional[str]: + pre = state.split("_")[0] + if "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also + # NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + + +def check_icmp_states(state) -> Optional[str]: + pre = state.split("_")[0] + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + + +def get_final_state_from_flags(self, state, pkts) -> str: + """ + Analyze the flags given and return a summary of the state. + Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + if state := check_suricata_states(state): + return state + if state := check_zeek_states(state): + return state + if state := check_argus_states(state): + return state + except IndexError: + # suf does not exist, which means that this is some ICMP or + # no response was sent for UDP or TCP + if state := check_icmp_states(state): + return state + if state := check_udp_states(state): + return state + if state := check_tcp_states(state, pkts): + return state + + return "Not Established" + + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() " f"line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) From cfa52224d7aee90e9ce0cf5e68625360564b3181 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:48:16 +0300 Subject: [PATCH 012/455] state_handler: refactor get_final_state_from_flags() --- slips_files/common/state_handler.py | 67 +++++++++++++---------------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index b671a09a28..d0a05115bd 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,9 +1,7 @@ from typing import Optional -import sys -import traceback -def check_suricata_states(state) -> Optional[str]: +def interpret_suricata_states(state) -> Optional[str]: """ There are different states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]: return "Not Established" -def check_zeek_states(state) -> Optional[str]: +def interpret_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. # For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]: return "Established" -def check_argus_states(state) -> Optional[str]: +def interpret_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - suf = state.split("_")[1] + try: + suf = state.split("_")[1] + except IndexError: + return + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]: return "Not Established" -def check_tcp_states(state, pkts) -> Optional[str]: +def interpret_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def check_udp_states(state) -> Optional[str]: +def interpret_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]: return "Not Established" -def check_icmp_states(state) -> Optional[str]: +def interpret_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(self, state, pkts) -> str: +def get_final_state_from_flags(state, pkts) -> str: """ - Analyze the flags given and return a summary of the state. - Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections + Converts the original flags from the flow, to a state that slips + understands + Works with Argus, suricata, and Bro flags + We receive the packets to distinguish some Reset connections """ - try: - if state := check_suricata_states(state): - return state - if state := check_zeek_states(state): - return state - if state := check_argus_states(state): - return state - except IndexError: - # suf does not exist, which means that this is some ICMP or - # no response was sent for UDP or TCP - if state := check_icmp_states(state): - return state - if state := check_udp_states(state): - return state - if state := check_tcp_states(state, pkts): - return state - return "Not Established" + for interpreter in ( + interpret_suricata_states, + interpret_zeek_states, + interpret_argus_states, + interpret_icmp_states, + interpret_udp_states, + ): + if interpreted_state := interpreter(state): + return interpreted_state + + if interpreted_state := interpret_tcp_states(state, pkts): + return interpreted_state - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() " f"line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) + return "Not Established" From 8d14ef8a2803807e785f1bc4222ea2f391dd46e1 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 013/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 150 ++---------------- slips_files/core/database/database_manager.py | 3 - 2 files changed, 10 insertions(+), 143 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index c8226368c7..9af514a709 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -122,141 +122,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -292,12 +157,17 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others # So transform here - #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) - dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) - - #dataset.state = new_state_column + dataset["state"] = dataset.apply( + lambda row: get_final_state_from_flags( + row["state"], row["pkts"] + ), + axis=1, + ) + # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index 9ecc16635f..4de72c756f 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -607,9 +607,6 @@ def add_out_dns(self, *args, **kwargs): def add_port(self, *args, **kwargs): return self.rdb.add_port(*args, **kwargs) - def get_final_state_from_flags(self, *args, **kwargs): - return self.rdb.get_final_state_from_flags(*args, **kwargs) - def add_ips(self, *args, **kwargs): return self.rdb.add_ips(*args, **kwargs) From fc735e7374e409de67c16e6a4b6e392efbc5d603 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 014/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 9af514a709..94eb27afdf 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -122,6 +122,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 45ea08b585d956c0fb483cf789ec974111f0d6b5 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 015/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 94eb27afdf..9af514a709 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -122,141 +122,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From a3ff70d540ab30bd35686eb5c4b338bbff17aa25 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 016/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 154 ++++++++++++++++++++- 1 file changed, 150 insertions(+), 4 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 3a3b0a72fe..8917fef6a5 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -10,6 +10,7 @@ import json import traceback import warnings +import sys from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils @@ -120,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -132,7 +268,7 @@ def process_features(self, dataset): for proto in to_discard: dataset = dataset[dataset.proto != proto] - # For now, discard the ports + # For now, discard these to_drop = [ "appproto", "daddr", @@ -155,15 +291,25 @@ def process_features(self, dataset): except (ValueError, KeyError): pass + # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # So transform here + #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) + dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) + + #dataset.state = new_state_column + # Convert state to categorical dataset.state = dataset.state.str.replace( - r"(^.*NotEstablished.*$)", "0", regex=True + r"(^.*Not Established.*$)", "0", regex=True ) dataset.state = dataset.state.str.replace( r"(^.*Established.*$)", "1", regex=True ) - # Convert proto to categorical. For now we only have few states, - # so we can hardcode... + + # Convert categories to floats + dataset.state = dataset.state.astype("float64") + + # Convert proto to categorical. For now we only have few states, so we can hardcode... # We dont use the data to create categories because in testing mode # we dont see all the protocols # Also we dont store the Categorizer because the user can retrain From 385d1e2cf142e677602dc80c94d9ecd5c6c0896b Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:35:21 +0300 Subject: [PATCH 017/455] state_handler: split get_final_state_from_flags() into smaller functions --- slips_files/common/state_handler.py | 179 ++++++++++++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 slips_files/common/state_handler.py diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py new file mode 100644 index 0000000000..b671a09a28 --- /dev/null +++ b/slips_files/common/state_handler.py @@ -0,0 +1,179 @@ +from typing import Optional +import sys +import traceback + + +def check_suricata_states(state) -> Optional[str]: + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for + UDP. For TCP, + these are: New, Established and Closed,for UDP only new and + established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + +def check_zeek_states(state) -> Optional[str]: + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + +def check_argus_states(state) -> Optional[str]: + pre = state.split("_")[0] + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + + +def check_tcp_states(state, pkts) -> Optional[str]: + pre = state.split("_")[0] + if "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. + # Most connections are reseted when finished and therefore are + # established + # It can happen that is reseted being not established, but we + # can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is + # not established because the OS retries 3 times. + return "Not Established" if int(pkts) <= 3 else "Established" + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. + # Most connections are finished with FIN when finished and + # therefore are established + # It can happen that is finished being not established, but we + # can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is + # not established because the OS retries 3 times. + return "Not Established" if int(pkts) <= 3 else "Established" + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + + +def check_udp_states(state) -> Optional[str]: + pre = state.split("_")[0] + if "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also + # NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + + +def check_icmp_states(state) -> Optional[str]: + pre = state.split("_")[0] + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + + +def get_final_state_from_flags(self, state, pkts) -> str: + """ + Analyze the flags given and return a summary of the state. + Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + if state := check_suricata_states(state): + return state + if state := check_zeek_states(state): + return state + if state := check_argus_states(state): + return state + except IndexError: + # suf does not exist, which means that this is some ICMP or + # no response was sent for UDP or TCP + if state := check_icmp_states(state): + return state + if state := check_udp_states(state): + return state + if state := check_tcp_states(state, pkts): + return state + + return "Not Established" + + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() " f"line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) From d8783fd1d1b85e1e39aa4d2b05520a95463e52da Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:48:16 +0300 Subject: [PATCH 018/455] state_handler: refactor get_final_state_from_flags() --- slips_files/common/state_handler.py | 67 +++++++++++++---------------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index b671a09a28..d0a05115bd 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,9 +1,7 @@ from typing import Optional -import sys -import traceback -def check_suricata_states(state) -> Optional[str]: +def interpret_suricata_states(state) -> Optional[str]: """ There are different states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]: return "Not Established" -def check_zeek_states(state) -> Optional[str]: +def interpret_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. # For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]: return "Established" -def check_argus_states(state) -> Optional[str]: +def interpret_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - suf = state.split("_")[1] + try: + suf = state.split("_")[1] + except IndexError: + return + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]: return "Not Established" -def check_tcp_states(state, pkts) -> Optional[str]: +def interpret_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def check_udp_states(state) -> Optional[str]: +def interpret_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]: return "Not Established" -def check_icmp_states(state) -> Optional[str]: +def interpret_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(self, state, pkts) -> str: +def get_final_state_from_flags(state, pkts) -> str: """ - Analyze the flags given and return a summary of the state. - Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections + Converts the original flags from the flow, to a state that slips + understands + Works with Argus, suricata, and Bro flags + We receive the packets to distinguish some Reset connections """ - try: - if state := check_suricata_states(state): - return state - if state := check_zeek_states(state): - return state - if state := check_argus_states(state): - return state - except IndexError: - # suf does not exist, which means that this is some ICMP or - # no response was sent for UDP or TCP - if state := check_icmp_states(state): - return state - if state := check_udp_states(state): - return state - if state := check_tcp_states(state, pkts): - return state - return "Not Established" + for interpreter in ( + interpret_suricata_states, + interpret_zeek_states, + interpret_argus_states, + interpret_icmp_states, + interpret_udp_states, + ): + if interpreted_state := interpreter(state): + return interpreted_state + + if interpreted_state := interpret_tcp_states(state, pkts): + return interpreted_state - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() " f"line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) + return "Not Established" From 242ab4633538e6632d8418cf5df33469d8dfc585 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 019/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 150 ++---------------- slips_files/core/database/database_manager.py | 3 - 2 files changed, 10 insertions(+), 143 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 8917fef6a5..fb17b57f23 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -291,12 +156,17 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others # So transform here - #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) - dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) - - #dataset.state = new_state_column + dataset["state"] = dataset.apply( + lambda row: get_final_state_from_flags( + row["state"], row["pkts"] + ), + axis=1, + ) + # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index d0d586c4c0..e0028e813b 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -610,9 +610,6 @@ def add_out_dns(self, *args, **kwargs): def add_port(self, *args, **kwargs): return self.rdb.add_port(*args, **kwargs) - def get_final_state_from_flags(self, *args, **kwargs): - return self.rdb.get_final_state_from_flags(*args, **kwargs) - def add_ips(self, *args, **kwargs): return self.rdb.add_ips(*args, **kwargs) From 06d18ac0a03710092ed0be96eeec10cf89cb2ecf Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 020/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index fb17b57f23..12c3589edc 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 22731c9987bee24e4848658b095918ebd40ffdc0 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 021/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 12c3589edc..fb17b57f23 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 88dd4e6a6a527f021269be0c022f403b1ba23961 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 022/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index fb17b57f23..12c3589edc 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 5b81164e3e7be0145975fbe7016021614bdeafd5 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 023/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 12c3589edc..fb17b57f23 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From c0c5e537c723578ceae0ec4002b25d882d37ec36 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 024/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index fb17b57f23..12c3589edc 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 48cf9d05e63b9d09e44536dc77da6553118561ed Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 025/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 12c3589edc..fb17b57f23 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From e2aaf16170aefd3350c32122988f29633454c260 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:23:29 +0100 Subject: [PATCH 026/455] Re add function that alya added --- slips_files/core/database/database_manager.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index e0028e813b..d0d586c4c0 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -610,6 +610,9 @@ def add_out_dns(self, *args, **kwargs): def add_port(self, *args, **kwargs): return self.rdb.add_port(*args, **kwargs) + def get_final_state_from_flags(self, *args, **kwargs): + return self.rdb.get_final_state_from_flags(*args, **kwargs) + def add_ips(self, *args, **kwargs): return self.rdb.add_ips(*args, **kwargs) From 5b87d35ad971e343d73daa846350d6277682e3ba Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:25:03 +0100 Subject: [PATCH 027/455] delete sys --- modules/flowmldetection/flowmldetection.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index fb17b57f23..286a397eff 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -10,7 +10,6 @@ import json import traceback import warnings -import sys from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils From 08dec989d4d1bb54ecf4922f294bcbee5c264ab3 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:27:23 +0100 Subject: [PATCH 028/455] Delete file that was deleted from develop --- slips_files/common/state_handler.py | 170 ---------------------------- 1 file changed, 170 deletions(-) delete mode 100644 slips_files/common/state_handler.py diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py deleted file mode 100644 index d0a05115bd..0000000000 --- a/slips_files/common/state_handler.py +++ /dev/null @@ -1,170 +0,0 @@ -from typing import Optional - - -def interpret_suricata_states(state) -> Optional[str]: - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for - UDP. For TCP, - these are: New, Established and Closed,for UDP only new and - established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - -def interpret_zeek_states(state) -> Optional[str]: - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - -def interpret_argus_states(state) -> Optional[str]: - pre = state.split("_")[0] - try: - suf = state.split("_")[1] - except IndexError: - return - - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - - -def interpret_tcp_states(state, pkts) -> Optional[str]: - pre = state.split("_")[0] - if "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. - # Most connections are reseted when finished and therefore are - # established - # It can happen that is reseted being not established, but we - # can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is - # not established because the OS retries 3 times. - return "Not Established" if int(pkts) <= 3 else "Established" - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. - # Most connections are finished with FIN when finished and - # therefore are established - # It can happen that is finished being not established, but we - # can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is - # not established because the OS retries 3 times. - return "Not Established" if int(pkts) <= 3 else "Established" - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - - -def interpret_udp_states(state) -> Optional[str]: - pre = state.split("_")[0] - if "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also - # NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - - -def interpret_icmp_states(state) -> Optional[str]: - pre = state.split("_")[0] - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - - -def get_final_state_from_flags(state, pkts) -> str: - """ - Converts the original flags from the flow, to a state that slips - understands - Works with Argus, suricata, and Bro flags - We receive the packets to distinguish some Reset connections - """ - - for interpreter in ( - interpret_suricata_states, - interpret_zeek_states, - interpret_argus_states, - interpret_icmp_states, - interpret_udp_states, - ): - if interpreted_state := interpreter(state): - return interpreted_state - - if interpreted_state := interpret_tcp_states(state, pkts): - return interpreted_state - - return "Not Established" From 0c4455c108509246993c8aa081310f9c0ce5a240 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:32:01 +0100 Subject: [PATCH 029/455] Flowmldetection. Fix missing db reference --- modules/flowmldetection/flowmldetection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 286a397eff..fac5e674f9 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -160,7 +160,7 @@ def process_features(self, dataset): # 'Not Established', it is still 'S0' and others # So transform here dataset["state"] = dataset.apply( - lambda row: get_final_state_from_flags( + lambda row: self.db.get_final_state_from_flags( row["state"], row["pkts"] ), axis=1, From 3429549c6326c9c7d7b9bc299fef48d6b754fb48 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Tue, 18 Mar 2025 12:08:08 +0100 Subject: [PATCH 030/455] Fix the training of flows with ML in new version --- modules/flowmldetection/flowmldetection.py | 144 +++++++++++---------- 1 file changed, 77 insertions(+), 67 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index fac5e674f9..e6ea0b5171 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -55,8 +55,12 @@ def init(self): # Set the output queue of our database instance # Read the configuration self.read_configuration() - # Minum amount of new lables needed to trigger the train - self.minimum_lables_to_retrain = 50 + # Minum amount of new labels needed to start the train + self.minimum_labels_to_start_train = 50 + # Minum amount of new labels needed to retrain + self.minimum_labels_to_retrain = 50 + # The number of flows when last trained + self.last_number_of_flows_when_trained = 0 # To plot the scores of training # self.scores = [] # The scaler trained during training and to use during testing @@ -67,26 +71,25 @@ def init(self): def read_configuration(self): conf = ConfigParser() self.mode = conf.get_ml_mode() + self.label = conf.label() def train(self): """ Train a model based on the flows we receive and the labels """ try: - # Process the labels to have only Normal and Malware - self.flows.label = self.flows.label.str.replace( - r"(^.*ormal.*$)", "Normal", regex=True - ) - self.flows.label = self.flows.label.str.replace( - r"(^.*alware.*$)", "Malware", regex=True - ) - self.flows.label = self.flows.label.str.replace( - r"(^.*alicious.*$)", "Malware", regex=True - ) + # Get the flows from the DB + # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid) + # Convert to pandas df + # self.flows = pd.DataFrame(self.flows) + # Process the features + # X_flow = self.process_features(self.flows) - # Separate - y_flow = self.flows["label"] + # Create X_flow with the current flows minus the label X_flow = self.flows.drop("label", axis=1) + # Create y_flow with the label + y_flow = numpy.full(X_flow.shape[0], self.label) + # Drop the module_labels X_flow = X_flow.drop("module_labels", axis=1) # Normalize this batch of data so far. This can get progressivle slow @@ -95,7 +98,7 @@ def train(self): # Train try: self.clf.partial_fit( - X_flow, y_flow, classes=["Malware", "Normal"] + X_flow, y_flow, classes=["Malicious", "Benign"] ) except Exception: self.print("Error while calling clf.train()") @@ -118,7 +121,7 @@ def train(self): self.store_model() except Exception: - self.print("Error in train()", 0, 1) + self.print("Error in train().", 0, 1) self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): @@ -144,9 +147,7 @@ def process_features(self, dataset): "history", "uid", "dir_", - "dbytes", "endtime", - "bytes", "flow_source", ] for field in to_drop: @@ -161,11 +162,10 @@ def process_features(self, dataset): # So transform here dataset["state"] = dataset.apply( lambda row: self.db.get_final_state_from_flags( - row["state"], row["pkts"] + row["state"], (row["spkts"] + row["dpkts"]) ), axis=1, ) - # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -199,7 +199,11 @@ def process_features(self, dataset): dataset.proto = dataset.proto.str.replace( r"(^.*arp.*$)", "4", regex=True ) - fields_to_convert_to_flow = [ + + dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"] + dataset["pkts"] = dataset["spkts"] + dataset["dpkts"] + + fields_to_convert_to_float = [ dataset.proto, dataset.dport, dataset.sport, @@ -210,10 +214,10 @@ def process_features(self, dataset): dataset.sbytes, dataset.state, ] - for field in fields_to_convert_to_flow: + for field in fields_to_convert_to_float: try: field = field.astype("float64") - except ValueError: + except (ValueError, AttributeError): pass return dataset @@ -222,9 +226,9 @@ def process_features(self, dataset): self.print("Error in process_features()") self.print(traceback.format_exc(), 0, 1) - def process_flows(self): + def process_training_flows(self): """ - Process all the flwos in the DB + Process all the flows in the DB Store the pandas df in self.flows """ try: @@ -240,44 +244,48 @@ def process_flows(self): # that are fake but representative of a normal and malware flow # they are only for the training process # At least 1 flow of each label is required - # self.print(f'Amount of labeled flows: {labels}', 0, 1) + + # These flows should be in the same format as the ones in the DB. + # Which means the satate is still SF, S0, etc. flows.append( { - "ts": 1594417039.029793, + "starttime": 1594417039.029793, "dur": "1.9424750804901123", "saddr": "10.7.10.101", "sport": "49733", "daddr": "40.70.224.145", "dport": "443", "proto": "tcp", - "state": "Established", - "allbytes": 42764, - "spkts": 37, + "state": "SF", + "spkts": 17, + "dpkts": 27, "sbytes": 25517, + "dbytes": 17247, "appproto": "ssl", - "label": "Malware", + "label": "Malicious", "module_labels": { - "flowalerts-long-connection": "Malware" + "flowalerts-long-connection": "Malicious" }, } ) flows.append( { - "ts": 1382355032.706468, + "starttime": 1382355032.706468, "dur": "10.896695", "saddr": "147.32.83.52", "sport": "47956", "daddr": "80.242.138.72", "dport": "80", "proto": "tcp", - "state": "Established", - "allbytes": 67696, + "state": "SF", "spkts": 1, + "dpkts": 0, "sbytes": 100, + "dbytes": 67596, "appproto": "http", - "label": "Normal", + "label": "Benign", "module_labels": { - "flowalerts-long-connection": "Normal" + "flowalerts-long-connection": "Benign" }, } ) @@ -318,7 +326,6 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: and returns the predection array """ try: - given_x_flow = x_flow # clean the flow fields_to_drop = [ "label", @@ -326,10 +333,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "uid", "history", "dir_", - "dbytes", - "dpkts", "endtime", - "bytes", "flow_source", ] for field in fields_to_drop: @@ -343,7 +347,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: return pred except Exception as e: self.print( - f"Error in detect() while processing " f"\n{given_x_flow}\n{e}" + f"Error in detect() while processing " f"\n{x_flow}\n{e}" ) self.print(traceback.format_exc(), 0, 1) @@ -435,18 +439,16 @@ def pre_main(self): def main(self): if msg := self.get_msg("new_flow"): + # When a new flow arrives msg = json.loads(msg["data"]) - twid = msg["twid"] + self.twid = msg["twid"] + self.profileid = msg["profileid"] self.flow = msg["flow"] - # these fields are expected in testing. update the original + # These following extra fields are expected in testing. update the original # flow dict to have them self.flow.update( { - "allbytes": (self.flow["sbytes"] + self.flow["dbytes"]), - # the flow["state"] is the origstate, we dont need that here - # we need the interpreted state "state": msg["interpreted_state"], - "pkts": self.flow["spkts"] + self.flow["dpkts"], "label": msg["label"], "module_labels": msg["module_labels"], } @@ -459,23 +461,31 @@ def main(self): # Use labeled flows labels = self.db.get_labels() sum_labeled_flows = sum(i[1] for i in labels) + + # The min labels to retrain is the min number of flows + # we should have seen so far in this capture to start training + # This is so we dont _start_ training with only 1 flow + + # Once we are over the start minimum, the second condition is + # to force to retrain every a minimum_labels_to_retrain number + # of flows. So we dont retrain every 1 flow. if ( - sum_labeled_flows >= self.minimum_lables_to_retrain - and sum_labeled_flows % self.minimum_lables_to_retrain == 1 + sum_labeled_flows >= self.minimum_labels_to_start_train ): - # We get here every 'self.minimum_lables_to_retrain' - # amount of labels - # So for example we retrain every 100 labels and only when - # we have at least 100 labels - self.print( - f"Training the model with the last group of " - f"flows and labels. Total flows: {sum_labeled_flows}." - ) - # Process all flows in the DB and make them ready - # for pandas - self.process_flows() - # Train an algorithm - self.train() + if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain): + # So for example we retrain every 50 labels and only when + # we have at least 50 labels + self.print( + f"Training the model with the last group of " + f"flows and labels. Total flows: {sum_labeled_flows}." + ) + # Process all flows in the DB and make them ready + # for pandas + self.process_training_flows() + # Train an algorithm + self.train() + self.last_number_of_flows_when_trained = sum_labeled_flows + elif self.mode == "test": # We are testing, which means using the model to detect processed_flow = self.process_flow(self.flow) @@ -495,8 +505,8 @@ def main(self): # and the label is diff from the prediction, # print in debug mode self.print( - f"Report Prediction {pred[0]} for label" - f' {label} flow {self.flow["saddr"]}:' + f"Predicted {pred[0]} for ground-truth label" + f' {label}. Flow {self.flow["saddr"]}:' f'{self.flow["sport"]} ->' f' {self.flow["daddr"]}:' f'{self.flow["dport"]}/' @@ -504,9 +514,9 @@ def main(self): 0, 3, ) - if pred[0] == "Malware": + if pred[0] == "Malicious": # Generate an alert - self.set_evidence_malicious_flow(self.flow, twid) + self.set_evidence_malicious_flow(self.flow, self.twid) self.print( f"Prediction {pred[0]} for label {label}" f' flow {self.flow["saddr"]}:' From a779358bb3a6f8d72446c45c8b3feaf1406c87f4 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Tue, 18 Mar 2025 12:08:29 +0100 Subject: [PATCH 031/455] Fix the profiler handler for cases of nan in state --- .../core/database/redis_db/profile_handler.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/slips_files/core/database/redis_db/profile_handler.py b/slips_files/core/database/redis_db/profile_handler.py index 0489372cdc..1ea7644648 100644 --- a/slips_files/core/database/redis_db/profile_handler.py +++ b/slips_files/core/database/redis_db/profile_handler.py @@ -379,7 +379,12 @@ def get_final_state_from_flags(self, state, pkts): We receive the pakets to distinguish some Reset connections """ try: - pre = state.split("_")[0] + # In some flows the state is a nan + try: + pre = state.split("_")[0] + except AttributeError: + pre = '' + try: # Try suricata states """ @@ -401,7 +406,11 @@ def get_final_state_from_flags(self, state, pkts): return "Established" # For Argus - suf = state.split("_")[1] + # In some flows the state is a nan + try: + suf = state.split("_")[1] + except AttributeError: + suf = '' if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -502,7 +511,7 @@ def get_final_state_from_flags(self, state, pkts): except Exception: exception_line = sys.exc_info()[2].tb_lineno self.print( - f"Error in getFinalStateFromFlags() in database.py line {exception_line}", + f"Error in get_final_state_from_flags() in profile_handler.py line {exception_line}", 0, 1, ) From 223d72d0948098bb30f3a0992ac978f2249a9c35 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:20:42 +0100 Subject: [PATCH 032/455] slips.yaml. Update to have correct labels. By default test. Defaul training lbel is benign --- config/slips.yaml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/config/slips.yaml b/config/slips.yaml index 34f41e7109..31847a6df4 100644 --- a/config/slips.yaml +++ b/config/slips.yaml @@ -106,13 +106,12 @@ parameters: deletePrevdb: true # Set the label for all the flows that are being read. - # For now only normal and malware directly. No option for setting labels - # with a filter + # For now only Benign and Malicious (Capitalized) # The purpose is to be used in the training of ML models and to output # flows with labels for other tools. - # label: malicious - # label: unknown - label: normal + # label: Malicious + # label: Benign + label: Benign # If Zeek files are rotated or not to avoid running out of disk. # Zeek rotation is enabled by default when using an interface, # which means Slips will delete all Zeek log files after 1 day From 18b9a9559b08a4675248e2437eae7b271ab9ec94 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:21:21 +0100 Subject: [PATCH 033/455] First ipython to tst ML flow related models --- modules/flowmldetection/flowmlanalysis.ipynb | 76 ++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 modules/flowmldetection/flowmlanalysis.ipynb diff --git a/modules/flowmldetection/flowmlanalysis.ipynb b/modules/flowmldetection/flowmlanalysis.ipynb new file mode 100644 index 0000000000..d726cd2805 --- /dev/null +++ b/modules/flowmldetection/flowmlanalysis.ipynb @@ -0,0 +1,76 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Analysis of Flows with Machine Learning for Slips" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Analysis of a fixed list of flows to try techniques and find parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy\n", + "from sklearn.linear_model import SGDClassifier\n", + "from sklearn.preprocessing import StandardScaler\n", + "import pickle\n", + "import pandas as pd\n", + "import json\n", + "import traceback\n", + "import warnings" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "slips-new", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 5c89e4db5a40fda5b1cce21996c684d36c93d667 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:22:38 +0100 Subject: [PATCH 034/455] flowml. If the dataset has one flow and that is deleted, then return empty fast. --- modules/flowmldetection/flowmldetection.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index e6ea0b5171..0fa1e4d767 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -135,6 +135,11 @@ def process_features(self, dataset): for proto in to_discard: dataset = dataset[dataset.proto != proto] + # If te proto is in the list to delete and there is only one flow, then the dataset will be empty + if dataset.empty: + # DataFrame is empty now, so return empty + return dataset + # For now, discard these to_drop = [ "appproto", From ebbfd953cb028cf9ef0b75cd17168fc70f6921b0 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:23:05 +0100 Subject: [PATCH 035/455] flowml. If the datasert is empty. Return none --- modules/flowmldetection/flowmldetection.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 0fa1e4d767..5c5f9943f1 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -318,6 +318,8 @@ def process_flow(self, flow_to_process: dict): # Convert the flow to a pandas dataframe raw_flow = pd.DataFrame(flow_to_process, index=[0]) dflow = self.process_features(raw_flow) + if dflow.empty: + return None # Update the flow to the processed version return dflow except Exception: From 13287d134eb09ac30dcb0e056d5465544d545591 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:26:42 +0100 Subject: [PATCH 036/455] profile_handler. Small bug in how we handled the profiles, we were using 'in' instead of == for established. Some not established MAY not have been correctly captured --- slips_files/core/database/redis_db/profile_handler.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/slips_files/core/database/redis_db/profile_handler.py b/slips_files/core/database/redis_db/profile_handler.py index 1ea7644648..85fdec5a63 100644 --- a/slips_files/core/database/redis_db/profile_handler.py +++ b/slips_files/core/database/redis_db/profile_handler.py @@ -393,9 +393,10 @@ def get_final_state_from_flags(self, state, pkts): these are: New, Established and Closed,for UDP only new and established. For each of these states Suricata can employ different timeouts. """ - if "new" in state or "established" in state: + # This is controversial, but if we dont have a good state, we consider it not established for now + if "new" in state or state.lower() == "established": return "Established" - elif "closed" in state: + elif "closed" in state or state.lower() == 'not established': return "Not Established" # We have varius type of states depending on the type of flow. @@ -406,7 +407,6 @@ def get_final_state_from_flags(self, state, pkts): return "Established" # For Argus - # In some flows the state is a nan try: suf = state.split("_")[1] except AttributeError: From 9588762aa88da736012b2b6f5844f3ca0c39f15c Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:27:16 +0100 Subject: [PATCH 037/455] First new version of the model and scaler. Not good yet, but working. --- modules/flowmldetection/model.bin | Bin 1073 -> 1090 bytes modules/flowmldetection/scaler.bin | Bin 666 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index aef4cba35b7b18287b2be11df2c45e9187d053e0..7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f 100644 GIT binary patch delta 411 zcmdnUafpMZfn}=sMwVTSLVS7gX{m`NrA4X5@tJw?Q@pt+?`4dsSMFg=DXA<-oice! z4^yb+l+KR!DM3>-yqS9#ZKnA7`FX2&vv_lqOiAi=_HbfgU`PexJRr^l;&dn*NWpQXRJ|Emr(~!@tY-9P0&;YE zn9?i|&dD&Ce2~eg9z!XkH+S2V4C_wkh?WyPJF_Dc9WKfLlU4d@b|Cws{>t}{)DLw1 z+&}AzmCAv&TP(zH?>}O{hvWaISvjr_FKvFL2{5=iNW7SLz3Rc`{Vb{$-4P-p4qe>8 zsxB_naJaLsc#$rfl>^wdT$y<#sYS_psRbqRQ!+4J0dz3X4IB|u^h2FOBOrSC<0o%q b_GNYGp25H{nVChIi`7uiOwVviX_6iQocV_5 delta 380 zcmX@av5|wNfn}=NMwVTSlP@wxCo1)@rj%3`q)wSUrH3ihd`f3W`;?$58s5x3j5bsJ z{QSISyji?CN~R=rI$QWMFfb$naSjldL)k?@Iu(fhfH)tBLHt}GhN)wiJeNsPO)*2o zo1t||h8o0hMsFq{N4tk9%>rRzhW_LyOh)w>N*TR5+ps7Gy3Ol95P%792Cxe~W+qIF za^V+n06G8#1cs%Z;j-Fjug&$3=gY2%`}>OzXw^50I4Ib2ygQ=8?eISD+{&-(1rNNN zW*@ diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin index 9292bda6a6f307edcbe83de0ccfac2437950fa6f..bfba4d107224e5e6e5a1e8c8f4d463b48131d111 100644 GIT binary patch literal 890 zcmZo*nVQec00uqM#o0NjiA8yO1x2X^Mfu68#l@L<>3Z=gi6x0sdia7%67y0Li&BD< z6LV6Frc9oq**K+kibfA-d1gsQd~RxD-jt3W4lt*|B)5DrlQdy7+(!%6#F{QHuBFo&v zSTn`X&(G^W5P%79`;sY1ov}cNGpCdm0WAP}wIm-%7Z>Cgl>q4!C|z8TT~Z9@gK4(n zq{@<1pa5G6l)+kDl30>DrH3OiCkHCT0@jlXw4sMDFCOY|h?l*&d-(I>ixYDTa)5zQ zoSF&@h78Ugwqjs#CW3-9L$rq}6UF5=Xf9^}n(NGw#R!S7DQ!Ui_ppKj9%vGG1|KL~ zG6axA1!!U#l8G7ms38F|uhTi=d{gv8|4E7tk+E%iFU@~!UvXb5a&PSo2flmT-L8E; z8bB?zwe6@e~Ltx1rmTHK2M45x=S>Y_k5fcAZRq4P``?^7;27=%1`Z_~Q$U{zGVUTcbp z!_f@A=ox0(4vo(kWe&dF=y3Qof57>#YaOosUa>d6=$d`i{)GMC=j?EBVaVAqgWVBg dy+AxPN5z+=78Pga=K-V3P|r-ya7t;C9st`jc!2-_ delta 525 zcmeyxHj9S2q>4Eng}#5gR6%pz9=<0Kd-o?s5H3*s5pan;tH91&ekaz{5?!*7E?Mq z+JPE0yqPnE{rvpA{sRG+@Mb8PlGN$!ajPlrTH5mhSAF{H=RwvmLf> z0f8s>T))|-e^~LtKJ)mxuX}VF9GJI$RyS%Ec8u45-?98hw!_K^*XJ>X{BW28G_HrG zEU^gaDyjMm8E*!l6XYOvGkSyEqXM*T;RVCs$??LDiHthAxexX@M(tU$UF%YfQ^>gH&y>a4SnPazubB5!cqc1YnCGT))m9-o7^A!Lv41W!_hYO<*VziI$XJV#W|*OtHau9f5TQzy=Sjs!`l%1;+FkJ z-k|Geeq3|-Iw{uqQ%0WS6}i^>);WhA{C@WaM)rg_g6$KCFV4d&dkpP QMw@}2k%^wsl+q+U0P2|NZU6uP From 531946f0f0d880cc68dd95991a08b387b6a78c39 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Thu, 20 Mar 2025 13:16:06 +0100 Subject: [PATCH 038/455] model and scaler with 1 malicious and 1 benign --- modules/flowmldetection/model.bin | Bin 1090 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f..0fac693b39f8e2f0e826471e72a52010709a2a4a 100644 GIT binary patch delta 132 zcmX@a@q~k=fn{psMix!x$(NZ_BO`RCb>5jSYK$N!g2|oC+8oRs(l>gtTsw(FScJ$7xocP4|5 zn$$WZz^$}67u&zYKfDP4vYKo~KfmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r!tJEWTtnm`Ac zf^|*Gc0elJHxDoXoPgl=^Tj5WVAwXGOhP~3C<3D)c8d<%4*gy{6w?@|U z%s@r+gj&Fu5J2_kY{!Qn^*`?T5Tx7$dq73=gj&E@c0d))6KVlt!ao`IN`KC-h(JOu zU`*oR6_Z;6CRfo>BRz3ectB?@J_D&+&OhD+nN>A{w?PryCGc Date: Thu, 20 Mar 2025 13:16:27 +0100 Subject: [PATCH 039/455] cleaner jupyter --- modules/flowmldetection/flowmldetection.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 5c5f9943f1..fe950ed4bb 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -343,6 +343,23 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "endtime", "flow_source", ] + # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. + # Error + ''' [Flow ML Detection] Error in detect() while processing + dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes + 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 + The feature names should match those that were passed during fit. + Feature names unseen at fit time: + - bytes + ''' + + # IF we delete here the filed bytes the error is + # [Flow ML Detection] Error in detect() while processing + # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes + # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 + # The feature names should match those that were passed during fit. + # Feature names must be in the same order as they were in fit. + for field in fields_to_drop: try: x_flow = x_flow.drop(field, axis=1) From 473b0958153803624f757ac7b3bb85ffb9d68930 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Thu, 20 Mar 2025 22:26:27 +0100 Subject: [PATCH 040/455] New models after 3rd ttrain --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 0fac693b39f8e2f0e826471e72a52010709a2a4a..5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8 100644 GIT binary patch delta 99 zcmaFD@q}YTFtfkevYaFSxkd-_3;rCozh!)2lYmpEbEdGvA?N`Qv**dmWkY#6EnbNU>V$pfHc5?(6z~90v14#>uEKZXcoY2J@~Q; zm@vT916v-#h9y9THMMtpw*Eia3Y9($*ABq#7l@Rev)(_(4lO-NtuB+I1CayN#ekDG F1TN2UERg^J diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin index 758909b289238ff282b2e056a9b3e83768b8472a..821344a0c69d116622b02e2a0daa1554cb5d308e 100644 GIT binary patch delta 43 zcmV+`0M!5b2KolDfdU!c4zU>a=p#W80?-ZtN@qVmt(3Za=p#W80?-ZtN@qVmt(3Z Date: Wed, 26 Mar 2025 00:08:50 +0100 Subject: [PATCH 041/455] Models after 4th train --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8..3ab5a240bb45f88d026d1d9d1959cfa384e2473b 100644 GIT binary patch delta 120 zcmV-;0EhqN2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv* zMA;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P as{<PghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9 ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}DxjML6cVlBO2|D9RL6T delta 290 zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRg4Lu^9H~BS8=X&<+7gXFor!l)9lJXF)RTPqWf%b3r!tJEWTtnm`Ac zf^|*Gc0elJHxDoXoty4FCWD From 9efd09bf3ca9fc3de4899135fd286db07b8df3d8 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 26 Mar 2025 08:28:59 +0100 Subject: [PATCH 042/455] Models of ml flow with the first good performance in small tests --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 3ab5a240bb45f88d026d1d9d1959cfa384e2473b..a6648cf72179520975b0e9ad1164f7d574e87140 100644 GIT binary patch delta 121 zcmV-<0EYkM2;>N`Qv+C~&P*9hb`!w*mV~|wLl?l5mFM4w$NRqlOAcXmpyfaAG1(); zYA3+ulRv$a;~zkIU#E>ocI-ba#L|>%Hv~ZN`Y?b6^limYW1McwvlQsk{8#y@u delta 121 zcmV-<0EYkM2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv* zMA;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P bs{<fmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r-wJEWTtnm`Ac zf^|*Gc0elJHxDoXoPghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9 ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}DxjML6cVlBO2|D9RL6T From 978f87cb89f5ec6dfdea380afb76aa952b77bb38 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 043/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 154 ++++++++++++++++++++- 1 file changed, 150 insertions(+), 4 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index e44ac83f4d..16b67e9038 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -10,6 +10,7 @@ import json import traceback import warnings +import sys from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils @@ -120,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -132,7 +268,7 @@ def process_features(self, dataset): for proto in to_discard: dataset = dataset[dataset.proto != proto] - # For now, discard the ports + # For now, discard these to_drop = [ "appproto", "daddr", @@ -155,15 +291,25 @@ def process_features(self, dataset): except (ValueError, KeyError): pass + # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # So transform here + #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) + dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) + + #dataset.state = new_state_column + # Convert state to categorical dataset.state = dataset.state.str.replace( - r"(^.*NotEstablished.*$)", "0", regex=True + r"(^.*Not Established.*$)", "0", regex=True ) dataset.state = dataset.state.str.replace( r"(^.*Established.*$)", "1", regex=True ) - # Convert proto to categorical. For now we only have few states, - # so we can hardcode... + + # Convert categories to floats + dataset.state = dataset.state.astype("float64") + + # Convert proto to categorical. For now we only have few states, so we can hardcode... # We dont use the data to create categories because in testing mode # we dont see all the protocols # Also we dont store the Categorizer because the user can retrain From a5dd40500fc88636982bcbb9dd8bf05803dbb3cc Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:35:21 +0300 Subject: [PATCH 044/455] state_handler: split get_final_state_from_flags() into smaller functions --- slips_files/common/state_handler.py | 179 ++++++++++++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 slips_files/common/state_handler.py diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py new file mode 100644 index 0000000000..b671a09a28 --- /dev/null +++ b/slips_files/common/state_handler.py @@ -0,0 +1,179 @@ +from typing import Optional +import sys +import traceback + + +def check_suricata_states(state) -> Optional[str]: + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for + UDP. For TCP, + these are: New, Established and Closed,for UDP only new and + established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + +def check_zeek_states(state) -> Optional[str]: + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + +def check_argus_states(state) -> Optional[str]: + pre = state.split("_")[0] + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + + +def check_tcp_states(state, pkts) -> Optional[str]: + pre = state.split("_")[0] + if "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. + # Most connections are reseted when finished and therefore are + # established + # It can happen that is reseted being not established, but we + # can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is + # not established because the OS retries 3 times. + return "Not Established" if int(pkts) <= 3 else "Established" + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. + # Most connections are finished with FIN when finished and + # therefore are established + # It can happen that is finished being not established, but we + # can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is + # not established because the OS retries 3 times. + return "Not Established" if int(pkts) <= 3 else "Established" + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + + +def check_udp_states(state) -> Optional[str]: + pre = state.split("_")[0] + if "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also + # NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + + +def check_icmp_states(state) -> Optional[str]: + pre = state.split("_")[0] + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + + +def get_final_state_from_flags(self, state, pkts) -> str: + """ + Analyze the flags given and return a summary of the state. + Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + if state := check_suricata_states(state): + return state + if state := check_zeek_states(state): + return state + if state := check_argus_states(state): + return state + except IndexError: + # suf does not exist, which means that this is some ICMP or + # no response was sent for UDP or TCP + if state := check_icmp_states(state): + return state + if state := check_udp_states(state): + return state + if state := check_tcp_states(state, pkts): + return state + + return "Not Established" + + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() " f"line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) From 3579edc92ec3832c3116a3180af419029cb89b66 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:48:16 +0300 Subject: [PATCH 045/455] state_handler: refactor get_final_state_from_flags() --- slips_files/common/state_handler.py | 67 +++++++++++++---------------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index b671a09a28..d0a05115bd 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,9 +1,7 @@ from typing import Optional -import sys -import traceback -def check_suricata_states(state) -> Optional[str]: +def interpret_suricata_states(state) -> Optional[str]: """ There are different states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]: return "Not Established" -def check_zeek_states(state) -> Optional[str]: +def interpret_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. # For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]: return "Established" -def check_argus_states(state) -> Optional[str]: +def interpret_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - suf = state.split("_")[1] + try: + suf = state.split("_")[1] + except IndexError: + return + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]: return "Not Established" -def check_tcp_states(state, pkts) -> Optional[str]: +def interpret_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def check_udp_states(state) -> Optional[str]: +def interpret_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]: return "Not Established" -def check_icmp_states(state) -> Optional[str]: +def interpret_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(self, state, pkts) -> str: +def get_final_state_from_flags(state, pkts) -> str: """ - Analyze the flags given and return a summary of the state. - Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections + Converts the original flags from the flow, to a state that slips + understands + Works with Argus, suricata, and Bro flags + We receive the packets to distinguish some Reset connections """ - try: - if state := check_suricata_states(state): - return state - if state := check_zeek_states(state): - return state - if state := check_argus_states(state): - return state - except IndexError: - # suf does not exist, which means that this is some ICMP or - # no response was sent for UDP or TCP - if state := check_icmp_states(state): - return state - if state := check_udp_states(state): - return state - if state := check_tcp_states(state, pkts): - return state - return "Not Established" + for interpreter in ( + interpret_suricata_states, + interpret_zeek_states, + interpret_argus_states, + interpret_icmp_states, + interpret_udp_states, + ): + if interpreted_state := interpreter(state): + return interpreted_state + + if interpreted_state := interpret_tcp_states(state, pkts): + return interpreted_state - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() " f"line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) + return "Not Established" From 47d65ed1ef5545777e0aef73e13ba14dd231b51c Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 046/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 150 ++---------------- slips_files/core/database/database_manager.py | 3 - 2 files changed, 10 insertions(+), 143 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 16b67e9038..3379f5077f 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -291,12 +156,17 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others # So transform here - #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) - dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) - - #dataset.state = new_state_column + dataset["state"] = dataset.apply( + lambda row: get_final_state_from_flags( + row["state"], row["pkts"] + ), + axis=1, + ) + # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index e8ca3aaf62..b4b2128d3d 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -613,9 +613,6 @@ def add_out_dns(self, *args, **kwargs): def add_port(self, *args, **kwargs): return self.rdb.add_port(*args, **kwargs) - def get_final_state_from_flags(self, *args, **kwargs): - return self.rdb.get_final_state_from_flags(*args, **kwargs) - def add_ips(self, *args, **kwargs): return self.rdb.add_ips(*args, **kwargs) From 55ce0bbf1fdb8ae5ebeea066fd3efe07cab9a0b8 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 047/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 3379f5077f..f052931c89 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From a4446a54dfcb2299392a2e3a59d0d755de693153 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 048/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index f052931c89..3379f5077f 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From eead7b56753dba0923ef0ac41be1e3361ec70cd3 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 049/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 3379f5077f..f052931c89 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 43ab23bbd9a7699efed3d731d83df88e53afa451 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 050/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index f052931c89..3379f5077f 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From df9417d840129cee1864a0a86c6ef33e82db1038 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 051/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 3379f5077f..f052931c89 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From f5c4e0c67f148ad3f312ecea5801af9fd28a1877 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 052/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index f052931c89..3379f5077f 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From eda2d83e77f223bc8e436aae6bd17b7eb6c83ece Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 053/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 3379f5077f..f052931c89 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From ed1997e45fbed8609b9ffb8787dbb61059d5d7a2 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 054/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index f052931c89..3379f5077f 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From c0d8b16d7fc7c2a404c5d0f3c18768bebd49aa0f Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 055/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 3379f5077f..f052931c89 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 00e2ab175a7d00fca76d09433be4df6141ff4316 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 056/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index f052931c89..3379f5077f 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From ee417b90a570747d57c2bffc75ec54f0c3e22c73 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:23:29 +0100 Subject: [PATCH 057/455] Re add function that alya added --- slips_files/core/database/database_manager.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index b4b2128d3d..e8ca3aaf62 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -613,6 +613,9 @@ def add_out_dns(self, *args, **kwargs): def add_port(self, *args, **kwargs): return self.rdb.add_port(*args, **kwargs) + def get_final_state_from_flags(self, *args, **kwargs): + return self.rdb.get_final_state_from_flags(*args, **kwargs) + def add_ips(self, *args, **kwargs): return self.rdb.add_ips(*args, **kwargs) From 774e03dc5598ccd9627fd1e1aece3b9e883f38fa Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:25:03 +0100 Subject: [PATCH 058/455] delete sys --- modules/flowmldetection/flowmldetection.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 3379f5077f..c06755a599 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -10,7 +10,6 @@ import json import traceback import warnings -import sys from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils From 6220c230c86e0cbfd8148829e684335cc62f2a8e Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:27:23 +0100 Subject: [PATCH 059/455] Delete file that was deleted from develop --- slips_files/common/state_handler.py | 170 ---------------------------- 1 file changed, 170 deletions(-) delete mode 100644 slips_files/common/state_handler.py diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py deleted file mode 100644 index d0a05115bd..0000000000 --- a/slips_files/common/state_handler.py +++ /dev/null @@ -1,170 +0,0 @@ -from typing import Optional - - -def interpret_suricata_states(state) -> Optional[str]: - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for - UDP. For TCP, - these are: New, Established and Closed,for UDP only new and - established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - -def interpret_zeek_states(state) -> Optional[str]: - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - -def interpret_argus_states(state) -> Optional[str]: - pre = state.split("_")[0] - try: - suf = state.split("_")[1] - except IndexError: - return - - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - - -def interpret_tcp_states(state, pkts) -> Optional[str]: - pre = state.split("_")[0] - if "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. - # Most connections are reseted when finished and therefore are - # established - # It can happen that is reseted being not established, but we - # can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is - # not established because the OS retries 3 times. - return "Not Established" if int(pkts) <= 3 else "Established" - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. - # Most connections are finished with FIN when finished and - # therefore are established - # It can happen that is finished being not established, but we - # can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is - # not established because the OS retries 3 times. - return "Not Established" if int(pkts) <= 3 else "Established" - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - - -def interpret_udp_states(state) -> Optional[str]: - pre = state.split("_")[0] - if "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also - # NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - - -def interpret_icmp_states(state) -> Optional[str]: - pre = state.split("_")[0] - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - - -def get_final_state_from_flags(state, pkts) -> str: - """ - Converts the original flags from the flow, to a state that slips - understands - Works with Argus, suricata, and Bro flags - We receive the packets to distinguish some Reset connections - """ - - for interpreter in ( - interpret_suricata_states, - interpret_zeek_states, - interpret_argus_states, - interpret_icmp_states, - interpret_udp_states, - ): - if interpreted_state := interpreter(state): - return interpreted_state - - if interpreted_state := interpret_tcp_states(state, pkts): - return interpreted_state - - return "Not Established" From 4d2dd99cbec81de085e35ce087ab8ac634908768 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:32:01 +0100 Subject: [PATCH 060/455] Flowmldetection. Fix missing db reference --- modules/flowmldetection/flowmldetection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index c06755a599..87e07c7592 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -160,7 +160,7 @@ def process_features(self, dataset): # 'Not Established', it is still 'S0' and others # So transform here dataset["state"] = dataset.apply( - lambda row: get_final_state_from_flags( + lambda row: self.db.get_final_state_from_flags( row["state"], row["pkts"] ), axis=1, From f0e53cfc658c31e6046ed2cf4741819c89517576 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Tue, 18 Mar 2025 12:08:08 +0100 Subject: [PATCH 061/455] Fix the training of flows with ML in new version --- modules/flowmldetection/flowmldetection.py | 144 +++++++++++---------- 1 file changed, 77 insertions(+), 67 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 87e07c7592..e91495d649 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -55,8 +55,12 @@ def init(self): # Set the output queue of our database instance # Read the configuration self.read_configuration() - # Minum amount of new lables needed to trigger the train - self.minimum_lables_to_retrain = 50 + # Minum amount of new labels needed to start the train + self.minimum_labels_to_start_train = 50 + # Minum amount of new labels needed to retrain + self.minimum_labels_to_retrain = 50 + # The number of flows when last trained + self.last_number_of_flows_when_trained = 0 # To plot the scores of training # self.scores = [] # The scaler trained during training and to use during testing @@ -67,26 +71,25 @@ def init(self): def read_configuration(self): conf = ConfigParser() self.mode = conf.get_ml_mode() + self.label = conf.label() def train(self): """ Train a model based on the flows we receive and the labels """ try: - # Process the labels to have only Normal and Malware - self.flows.label = self.flows.label.str.replace( - r"(^.*ormal.*$)", "Normal", regex=True - ) - self.flows.label = self.flows.label.str.replace( - r"(^.*alware.*$)", "Malware", regex=True - ) - self.flows.label = self.flows.label.str.replace( - r"(^.*alicious.*$)", "Malware", regex=True - ) + # Get the flows from the DB + # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid) + # Convert to pandas df + # self.flows = pd.DataFrame(self.flows) + # Process the features + # X_flow = self.process_features(self.flows) - # Separate - y_flow = self.flows["label"] + # Create X_flow with the current flows minus the label X_flow = self.flows.drop("label", axis=1) + # Create y_flow with the label + y_flow = numpy.full(X_flow.shape[0], self.label) + # Drop the module_labels X_flow = X_flow.drop("module_labels", axis=1) # Normalize this batch of data so far. This can get progressivle slow @@ -95,7 +98,7 @@ def train(self): # Train try: self.clf.partial_fit( - X_flow, y_flow, classes=["Malware", "Normal"] + X_flow, y_flow, classes=["Malicious", "Benign"] ) except Exception: self.print("Error while calling clf.train()") @@ -118,7 +121,7 @@ def train(self): self.store_model() except Exception: - self.print("Error in train()", 0, 1) + self.print("Error in train().", 0, 1) self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): @@ -144,9 +147,7 @@ def process_features(self, dataset): "history", "uid", "dir_", - "dbytes", "endtime", - "bytes", "flow_source", ] for field in to_drop: @@ -161,11 +162,10 @@ def process_features(self, dataset): # So transform here dataset["state"] = dataset.apply( lambda row: self.db.get_final_state_from_flags( - row["state"], row["pkts"] + row["state"], (row["spkts"] + row["dpkts"]) ), axis=1, ) - # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -199,7 +199,11 @@ def process_features(self, dataset): dataset.proto = dataset.proto.str.replace( r"(^.*arp.*$)", "4", regex=True ) - fields_to_convert_to_flow = [ + + dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"] + dataset["pkts"] = dataset["spkts"] + dataset["dpkts"] + + fields_to_convert_to_float = [ dataset.proto, dataset.dport, dataset.sport, @@ -210,10 +214,10 @@ def process_features(self, dataset): dataset.sbytes, dataset.state, ] - for field in fields_to_convert_to_flow: + for field in fields_to_convert_to_float: try: field = field.astype("float64") - except ValueError: + except (ValueError, AttributeError): pass return dataset @@ -222,9 +226,9 @@ def process_features(self, dataset): self.print("Error in process_features()") self.print(traceback.format_exc(), 0, 1) - def process_flows(self): + def process_training_flows(self): """ - Process all the flwos in the DB + Process all the flows in the DB Store the pandas df in self.flows """ try: @@ -240,44 +244,48 @@ def process_flows(self): # that are fake but representative of a normal and malware flow # they are only for the training process # At least 1 flow of each label is required - # self.print(f'Amount of labeled flows: {labels}', 0, 1) + + # These flows should be in the same format as the ones in the DB. + # Which means the satate is still SF, S0, etc. flows.append( { - "ts": 1594417039.029793, + "starttime": 1594417039.029793, "dur": "1.9424750804901123", "saddr": "10.7.10.101", "sport": "49733", "daddr": "40.70.224.145", "dport": "443", "proto": "tcp", - "state": "Established", - "allbytes": 42764, - "spkts": 37, + "state": "SF", + "spkts": 17, + "dpkts": 27, "sbytes": 25517, + "dbytes": 17247, "appproto": "ssl", - "label": "Malware", + "label": "Malicious", "module_labels": { - "flowalerts-long-connection": "Malware" + "flowalerts-long-connection": "Malicious" }, } ) flows.append( { - "ts": 1382355032.706468, + "starttime": 1382355032.706468, "dur": "10.896695", "saddr": "147.32.83.52", "sport": "47956", "daddr": "80.242.138.72", "dport": "80", "proto": "tcp", - "state": "Established", - "allbytes": 67696, + "state": "SF", "spkts": 1, + "dpkts": 0, "sbytes": 100, + "dbytes": 67596, "appproto": "http", - "label": "Normal", + "label": "Benign", "module_labels": { - "flowalerts-long-connection": "Normal" + "flowalerts-long-connection": "Benign" }, } ) @@ -318,7 +326,6 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: and returns the predection array """ try: - given_x_flow = x_flow # clean the flow fields_to_drop = [ "label", @@ -326,10 +333,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "uid", "history", "dir_", - "dbytes", - "dpkts", "endtime", - "bytes", "flow_source", "ground_truth_label", # todo now we can use them "detailed_ground_truth_label", @@ -345,7 +349,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: return pred except Exception as e: self.print( - f"Error in detect() while processing " f"\n{given_x_flow}\n{e}" + f"Error in detect() while processing " f"\n{x_flow}\n{e}" ) self.print(traceback.format_exc(), 0, 1) @@ -437,18 +441,16 @@ def pre_main(self): def main(self): if msg := self.get_msg("new_flow"): + # When a new flow arrives msg = json.loads(msg["data"]) - twid = msg["twid"] + self.twid = msg["twid"] + self.profileid = msg["profileid"] self.flow = msg["flow"] - # these fields are expected in testing. update the original + # These following extra fields are expected in testing. update the original # flow dict to have them self.flow.update( { - "allbytes": (self.flow["sbytes"] + self.flow["dbytes"]), - # the flow["state"] is the origstate, we dont need that here - # we need the interpreted state "state": msg["interpreted_state"], - "pkts": self.flow["spkts"] + self.flow["dpkts"], "label": msg["label"], "module_labels": msg["module_labels"], } @@ -461,23 +463,31 @@ def main(self): # Use labeled flows labels = self.db.get_labels() sum_labeled_flows = sum(i[1] for i in labels) + + # The min labels to retrain is the min number of flows + # we should have seen so far in this capture to start training + # This is so we dont _start_ training with only 1 flow + + # Once we are over the start minimum, the second condition is + # to force to retrain every a minimum_labels_to_retrain number + # of flows. So we dont retrain every 1 flow. if ( - sum_labeled_flows >= self.minimum_lables_to_retrain - and sum_labeled_flows % self.minimum_lables_to_retrain == 1 + sum_labeled_flows >= self.minimum_labels_to_start_train ): - # We get here every 'self.minimum_lables_to_retrain' - # amount of labels - # So for example we retrain every 100 labels and only when - # we have at least 100 labels - self.print( - f"Training the model with the last group of " - f"flows and labels. Total flows: {sum_labeled_flows}." - ) - # Process all flows in the DB and make them ready - # for pandas - self.process_flows() - # Train an algorithm - self.train() + if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain): + # So for example we retrain every 50 labels and only when + # we have at least 50 labels + self.print( + f"Training the model with the last group of " + f"flows and labels. Total flows: {sum_labeled_flows}." + ) + # Process all flows in the DB and make them ready + # for pandas + self.process_training_flows() + # Train an algorithm + self.train() + self.last_number_of_flows_when_trained = sum_labeled_flows + elif self.mode == "test": # We are testing, which means using the model to detect processed_flow = self.process_flow(self.flow) @@ -497,8 +507,8 @@ def main(self): # and the label is diff from the prediction, # print in debug mode self.print( - f"Report Prediction {pred[0]} for label" - f' {label} flow {self.flow["saddr"]}:' + f"Predicted {pred[0]} for ground-truth label" + f' {label}. Flow {self.flow["saddr"]}:' f'{self.flow["sport"]} ->' f' {self.flow["daddr"]}:' f'{self.flow["dport"]}/' @@ -506,9 +516,9 @@ def main(self): 0, 3, ) - if pred[0] == "Malware": + if pred[0] == "Malicious": # Generate an alert - self.set_evidence_malicious_flow(self.flow, twid) + self.set_evidence_malicious_flow(self.flow, self.twid) self.print( f"Prediction {pred[0]} for label {label}" f' flow {self.flow["saddr"]}:' From b7e82cf985596d60b66ee7ac7d2a7052a0b986dc Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Tue, 18 Mar 2025 12:08:29 +0100 Subject: [PATCH 062/455] Fix the profiler handler for cases of nan in state --- .../core/database/redis_db/profile_handler.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/slips_files/core/database/redis_db/profile_handler.py b/slips_files/core/database/redis_db/profile_handler.py index 0489372cdc..1ea7644648 100644 --- a/slips_files/core/database/redis_db/profile_handler.py +++ b/slips_files/core/database/redis_db/profile_handler.py @@ -379,7 +379,12 @@ def get_final_state_from_flags(self, state, pkts): We receive the pakets to distinguish some Reset connections """ try: - pre = state.split("_")[0] + # In some flows the state is a nan + try: + pre = state.split("_")[0] + except AttributeError: + pre = '' + try: # Try suricata states """ @@ -401,7 +406,11 @@ def get_final_state_from_flags(self, state, pkts): return "Established" # For Argus - suf = state.split("_")[1] + # In some flows the state is a nan + try: + suf = state.split("_")[1] + except AttributeError: + suf = '' if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -502,7 +511,7 @@ def get_final_state_from_flags(self, state, pkts): except Exception: exception_line = sys.exc_info()[2].tb_lineno self.print( - f"Error in getFinalStateFromFlags() in database.py line {exception_line}", + f"Error in get_final_state_from_flags() in profile_handler.py line {exception_line}", 0, 1, ) From ccde23ede2ac27f38809ce5f1bf2e5518c1d73c1 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:20:42 +0100 Subject: [PATCH 063/455] slips.yaml. Update to have correct labels. By default test. Defaul training lbel is benign --- config/slips.yaml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/config/slips.yaml b/config/slips.yaml index f7089b41af..8736eaf511 100644 --- a/config/slips.yaml +++ b/config/slips.yaml @@ -106,13 +106,12 @@ parameters: deletePrevdb: true # Set the label for all the flows that are being read. - # For now only normal and malware directly. No option for setting labels - # with a filter + # For now only Benign and Malicious (Capitalized) # The purpose is to be used in the training of ML models and to output # flows with labels for other tools. - # label: malicious - # label: unknown - label: normal + # label: Malicious + # label: Benign + label: Benign # If Zeek files are rotated or not to avoid running out of disk. # Zeek rotation is enabled by default when using an interface, # which means Slips will delete all Zeek log files after 1 day From 667faa3f1bc572053f530e0e8b3e8ca40ef19976 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:21:21 +0100 Subject: [PATCH 064/455] First ipython to tst ML flow related models --- modules/flowmldetection/flowmlanalysis.ipynb | 76 ++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 modules/flowmldetection/flowmlanalysis.ipynb diff --git a/modules/flowmldetection/flowmlanalysis.ipynb b/modules/flowmldetection/flowmlanalysis.ipynb new file mode 100644 index 0000000000..d726cd2805 --- /dev/null +++ b/modules/flowmldetection/flowmlanalysis.ipynb @@ -0,0 +1,76 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Analysis of Flows with Machine Learning for Slips" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Analysis of a fixed list of flows to try techniques and find parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy\n", + "from sklearn.linear_model import SGDClassifier\n", + "from sklearn.preprocessing import StandardScaler\n", + "import pickle\n", + "import pandas as pd\n", + "import json\n", + "import traceback\n", + "import warnings" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "slips-new", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From ad1488054068bc9d5bc3b596f04248523ef42a83 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:22:38 +0100 Subject: [PATCH 065/455] flowml. If the dataset has one flow and that is deleted, then return empty fast. --- modules/flowmldetection/flowmldetection.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index e91495d649..58b4ce1e4c 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -135,6 +135,11 @@ def process_features(self, dataset): for proto in to_discard: dataset = dataset[dataset.proto != proto] + # If te proto is in the list to delete and there is only one flow, then the dataset will be empty + if dataset.empty: + # DataFrame is empty now, so return empty + return dataset + # For now, discard these to_drop = [ "appproto", From 02804ca94b80f7a24374b36ec073af55aa272c3c Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:23:05 +0100 Subject: [PATCH 066/455] flowml. If the datasert is empty. Return none --- modules/flowmldetection/flowmldetection.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 58b4ce1e4c..4a4d46e376 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -318,6 +318,8 @@ def process_flow(self, flow_to_process: dict): # Convert the flow to a pandas dataframe raw_flow = pd.DataFrame(flow_to_process, index=[0]) dflow = self.process_features(raw_flow) + if dflow.empty: + return None # Update the flow to the processed version return dflow except Exception: From dea7702d8b5518fc4fc2d2fd5262e45c0ddec65d Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:26:42 +0100 Subject: [PATCH 067/455] profile_handler. Small bug in how we handled the profiles, we were using 'in' instead of == for established. Some not established MAY not have been correctly captured --- slips_files/core/database/redis_db/profile_handler.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/slips_files/core/database/redis_db/profile_handler.py b/slips_files/core/database/redis_db/profile_handler.py index 1ea7644648..85fdec5a63 100644 --- a/slips_files/core/database/redis_db/profile_handler.py +++ b/slips_files/core/database/redis_db/profile_handler.py @@ -393,9 +393,10 @@ def get_final_state_from_flags(self, state, pkts): these are: New, Established and Closed,for UDP only new and established. For each of these states Suricata can employ different timeouts. """ - if "new" in state or "established" in state: + # This is controversial, but if we dont have a good state, we consider it not established for now + if "new" in state or state.lower() == "established": return "Established" - elif "closed" in state: + elif "closed" in state or state.lower() == 'not established': return "Not Established" # We have varius type of states depending on the type of flow. @@ -406,7 +407,6 @@ def get_final_state_from_flags(self, state, pkts): return "Established" # For Argus - # In some flows the state is a nan try: suf = state.split("_")[1] except AttributeError: From f7f2eb3b80d90e0dc31d3fcfe7394d11650f84f4 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:27:16 +0100 Subject: [PATCH 068/455] First new version of the model and scaler. Not good yet, but working. --- modules/flowmldetection/model.bin | Bin 1073 -> 1090 bytes modules/flowmldetection/scaler.bin | Bin 666 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index aef4cba35b7b18287b2be11df2c45e9187d053e0..7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f 100644 GIT binary patch delta 411 zcmdnUafpMZfn}=sMwVTSLVS7gX{m`NrA4X5@tJw?Q@pt+?`4dsSMFg=DXA<-oice! z4^yb+l+KR!DM3>-yqS9#ZKnA7`FX2&vv_lqOiAi=_HbfgU`PexJRr^l;&dn*NWpQXRJ|Emr(~!@tY-9P0&;YE zn9?i|&dD&Ce2~eg9z!XkH+S2V4C_wkh?WyPJF_Dc9WKfLlU4d@b|Cws{>t}{)DLw1 z+&}AzmCAv&TP(zH?>}O{hvWaISvjr_FKvFL2{5=iNW7SLz3Rc`{Vb{$-4P-p4qe>8 zsxB_naJaLsc#$rfl>^wdT$y<#sYS_psRbqRQ!+4J0dz3X4IB|u^h2FOBOrSC<0o%q b_GNYGp25H{nVChIi`7uiOwVviX_6iQocV_5 delta 380 zcmX@av5|wNfn}=NMwVTSlP@wxCo1)@rj%3`q)wSUrH3ihd`f3W`;?$58s5x3j5bsJ z{QSISyji?CN~R=rI$QWMFfb$naSjldL)k?@Iu(fhfH)tBLHt}GhN)wiJeNsPO)*2o zo1t||h8o0hMsFq{N4tk9%>rRzhW_LyOh)w>N*TR5+ps7Gy3Ol95P%792Cxe~W+qIF za^V+n06G8#1cs%Z;j-Fjug&$3=gY2%`}>OzXw^50I4Ib2ygQ=8?eISD+{&-(1rNNN zW*@ diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin index 9292bda6a6f307edcbe83de0ccfac2437950fa6f..bfba4d107224e5e6e5a1e8c8f4d463b48131d111 100644 GIT binary patch literal 890 zcmZo*nVQec00uqM#o0NjiA8yO1x2X^Mfu68#l@L<>3Z=gi6x0sdia7%67y0Li&BD< z6LV6Frc9oq**K+kibfA-d1gsQd~RxD-jt3W4lt*|B)5DrlQdy7+(!%6#F{QHuBFo&v zSTn`X&(G^W5P%79`;sY1ov}cNGpCdm0WAP}wIm-%7Z>Cgl>q4!C|z8TT~Z9@gK4(n zq{@<1pa5G6l)+kDl30>DrH3OiCkHCT0@jlXw4sMDFCOY|h?l*&d-(I>ixYDTa)5zQ zoSF&@h78Ugwqjs#CW3-9L$rq}6UF5=Xf9^}n(NGw#R!S7DQ!Ui_ppKj9%vGG1|KL~ zG6axA1!!U#l8G7ms38F|uhTi=d{gv8|4E7tk+E%iFU@~!UvXb5a&PSo2flmT-L8E; z8bB?zwe6@e~Ltx1rmTHK2M45x=S>Y_k5fcAZRq4P``?^7;27=%1`Z_~Q$U{zGVUTcbp z!_f@A=ox0(4vo(kWe&dF=y3Qof57>#YaOosUa>d6=$d`i{)GMC=j?EBVaVAqgWVBg dy+AxPN5z+=78Pga=K-V3P|r-ya7t;C9st`jc!2-_ delta 525 zcmeyxHj9S2q>4Eng}#5gR6%pz9=<0Kd-o?s5H3*s5pan;tH91&ekaz{5?!*7E?Mq z+JPE0yqPnE{rvpA{sRG+@Mb8PlGN$!ajPlrTH5mhSAF{H=RwvmLf> z0f8s>T))|-e^~LtKJ)mxuX}VF9GJI$RyS%Ec8u45-?98hw!_K^*XJ>X{BW28G_HrG zEU^gaDyjMm8E*!l6XYOvGkSyEqXM*T;RVCs$??LDiHthAxexX@M(tU$UF%YfQ^>gH&y>a4SnPazubB5!cqc1YnCGT))m9-o7^A!Lv41W!_hYO<*VziI$XJV#W|*OtHau9f5TQzy=Sjs!`l%1;+FkJ z-k|Geeq3|-Iw{uqQ%0WS6}i^>);WhA{C@WaM)rg_g6$KCFV4d&dkpP QMw@}2k%^wsl+q+U0P2|NZU6uP From 81b103d0dd8ef69f3cadec4ff92e8e6bbe2c0027 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Thu, 20 Mar 2025 13:16:06 +0100 Subject: [PATCH 069/455] model and scaler with 1 malicious and 1 benign --- modules/flowmldetection/model.bin | Bin 1090 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f..0fac693b39f8e2f0e826471e72a52010709a2a4a 100644 GIT binary patch delta 132 zcmX@a@q~k=fn{psMix!x$(NZ_BO`RCb>5jSYK$N!g2|oC+8oRs(l>gtTsw(FScJ$7xocP4|5 zn$$WZz^$}67u&zYKfDP4vYKo~KfmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r!tJEWTtnm`Ac zf^|*Gc0elJHxDoXoPgl=^Tj5WVAwXGOhP~3C<3D)c8d<%4*gy{6w?@|U z%s@r+gj&Fu5J2_kY{!Qn^*`?T5Tx7$dq73=gj&E@c0d))6KVlt!ao`IN`KC-h(JOu zU`*oR6_Z;6CRfo>BRz3ectB?@J_D&+&OhD+nN>A{w?PryCGc Date: Thu, 20 Mar 2025 13:16:27 +0100 Subject: [PATCH 070/455] cleaner jupyter --- modules/flowmldetection/flowmldetection.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 4a4d46e376..d8e9ada27c 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -345,6 +345,23 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "ground_truth_label", # todo now we can use them "detailed_ground_truth_label", ] + # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. + # Error + ''' [Flow ML Detection] Error in detect() while processing + dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes + 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 + The feature names should match those that were passed during fit. + Feature names unseen at fit time: + - bytes + ''' + + # IF we delete here the filed bytes the error is + # [Flow ML Detection] Error in detect() while processing + # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes + # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 + # The feature names should match those that were passed during fit. + # Feature names must be in the same order as they were in fit. + for field in fields_to_drop: try: x_flow = x_flow.drop(field, axis=1) From 10fee830a3fecf11002d3037e75d8c094d72b4c8 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Thu, 20 Mar 2025 22:26:27 +0100 Subject: [PATCH 071/455] New models after 3rd ttrain --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 0fac693b39f8e2f0e826471e72a52010709a2a4a..5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8 100644 GIT binary patch delta 99 zcmaFD@q}YTFtfkevYaFSxkd-_3;rCozh!)2lYmpEbEdGvA?N`Qv**dmWkY#6EnbNU>V$pfHc5?(6z~90v14#>uEKZXcoY2J@~Q; zm@vT916v-#h9y9THMMtpw*Eia3Y9($*ABq#7l@Rev)(_(4lO-NtuB+I1CayN#ekDG F1TN2UERg^J diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin index 758909b289238ff282b2e056a9b3e83768b8472a..821344a0c69d116622b02e2a0daa1554cb5d308e 100644 GIT binary patch delta 43 zcmV+`0M!5b2KolDfdU!c4zU>a=p#W80?-ZtN@qVmt(3Za=p#W80?-ZtN@qVmt(3Z Date: Wed, 26 Mar 2025 00:08:50 +0100 Subject: [PATCH 072/455] Models after 4th train --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8..3ab5a240bb45f88d026d1d9d1959cfa384e2473b 100644 GIT binary patch delta 120 zcmV-;0EhqN2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv* zMA;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P as{<PghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9 ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}DxjML6cVlBO2|D9RL6T delta 290 zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRg4Lu^9H~BS8=X&<+7gXFor!l)9lJXF)RTPqWf%b3r!tJEWTtnm`Ac zf^|*Gc0elJHxDoXoty4FCWD From cbe0718e114b9413874ab6ccccb42da441dee2c4 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 26 Mar 2025 08:28:59 +0100 Subject: [PATCH 073/455] Models of ml flow with the first good performance in small tests --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 3ab5a240bb45f88d026d1d9d1959cfa384e2473b..a6648cf72179520975b0e9ad1164f7d574e87140 100644 GIT binary patch delta 121 zcmV-<0EYkM2;>N`Qv+C~&P*9hb`!w*mV~|wLl?l5mFM4w$NRqlOAcXmpyfaAG1(); zYA3+ulRv$a;~zkIU#E>ocI-ba#L|>%Hv~ZN`Y?b6^limYW1McwvlQsk{8#y@u delta 121 zcmV-<0EYkM2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv* zMA;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P bs{<fmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r-wJEWTtnm`Ac zf^|*Gc0elJHxDoXoPghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9 ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}DxjML6cVlBO2|D9RL6T From cf6b939823f5d935a8afa647bb21c3d86d353aa9 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:49:23 +0000 Subject: [PATCH 074/455] Add plot for flowml train scores --- modules/flowmldetection/plot_train_score.py | 56 +++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 modules/flowmldetection/plot_train_score.py diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py new file mode 100644 index 0000000000..0b5b5b72ba --- /dev/null +++ b/modules/flowmldetection/plot_train_score.py @@ -0,0 +1,56 @@ +import pandas as pd +import matplotlib.pyplot as plt +import re +import sys + +def plot_log_data(file_path): + # Read the log data from the file + with open(file_path, 'r') as file: + log_data = file.read() + + # Define regex pattern to extract relevant data from each line + pattern = r"Background: (\d+). Benign: (\d+). Malicious: (\d+). Total labels: (\d+\.\d+). Score: (\d+\.\d+)" + + # Parse the log file + data = re.findall(pattern, log_data) + + # Convert data to a DataFrame + df = pd.DataFrame(data, columns=["Background", "Benign", "Malicious", "Total labels", "Score"]) + df = df.astype({ + "Background": int, + "Benign": int, + "Malicious": int, + "Total labels": float, + "Score": float + }) + + # Plotting the values + fig, ax1 = plt.subplots(figsize=(10, 6)) + + # Plotting Score on the left y-axis + ax1.plot(df.index, df["Score"], label="Score", color='tab:blue') + ax1.set_xlabel('Index') + ax1.set_ylabel('Score', color='tab:blue') + ax1.tick_params(axis='y', labelcolor='tab:blue') + + # Create the second y-axis for the Total labels + ax2 = ax1.twinx() + ax2.plot(df.index, df["Total labels"], label="Total labels", color='tab:red') + ax2.set_ylabel('Total labels', color='tab:red') + ax2.tick_params(axis='y', labelcolor='tab:red') + + # Adding title and legend + plt.title('Log Data Visualization') + fig.tight_layout() + + # Save plot to a PNG file + plt.savefig('log_data_plot_with_two_scales.png') + + # Display the plot + plt.show() + +# Make sure the file path is passed as an argument +if len(sys.argv) < 2: + print("Please provide the path to the log file as a parameter.") +else: + plot_log_data(sys.argv[1]) From 2966b1497c0fef94d4f9daccecfd1d5a9fd66691 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:50:04 +0000 Subject: [PATCH 075/455] Add a log file to store the training data output --- modules/flowmldetection/flowmldetection.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index d8e9ada27c..f9a303c1ba 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -68,12 +68,29 @@ def init(self): self.model_path = "./modules/flowmldetection/model.bin" self.scaler_path = "./modules/flowmldetection/scaler.bin" + # Initialize the training log file + self.training_log_path = "./modules/flowmldetection/training.log" + with open(self.training_log_path, "w") as log_file: + log_file.write("Training Log Initialized\n") + def read_configuration(self): conf = ConfigParser() self.mode = conf.get_ml_mode() + # This is the global label in the configuration, + # in case the flows do not have a label themselves self.label = conf.label() - def train(self): + def write_to_training_log(self, message: str): + """ + Write a message to the training log file. + """ + try: + with open(self.training_log_path, "a") as log_file: + log_file.write(message + "\n") + except Exception as e: + self.print(f"Error writing to training log: {e}", 0, 1) + + def train(self, sum_labeled_flows): """ Train a model based on the flows we receive and the labels """ From f817b6dcfb79c98ca770649447d788ad7bf0f50f Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:50:32 +0000 Subject: [PATCH 076/455] Store data in the log file of training --- modules/flowmldetection/flowmldetection.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index f9a303c1ba..e97f4de535 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -137,9 +137,13 @@ def train(self, sum_labeled_flows): # Store the models on disk self.store_model() + # Log training information + self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}") + #self.write_to_training_log(f"Model parameters: {self.clf.coef_}") except Exception: self.print("Error in train().", 0, 1) self.print(traceback.format_exc(), 0, 1) + self.write_to_training_log("Error occurred during training.") def process_features(self, dataset): """ From 656264d4ddf93b7f3588202b0f88394a5fae4ca4 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:50:53 +0000 Subject: [PATCH 077/455] better comments --- modules/flowmldetection/flowmldetection.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index e97f4de535..3aa030790f 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -59,10 +59,9 @@ def init(self): self.minimum_labels_to_start_train = 50 # Minum amount of new labels needed to retrain self.minimum_labels_to_retrain = 50 - # The number of flows when last trained + # The number of flows when last trained. Used internally only to know + # when to retrain self.last_number_of_flows_when_trained = 0 - # To plot the scores of training - # self.scores = [] # The scaler trained during training and to use during testing self.scaler = StandardScaler() self.model_path = "./modules/flowmldetection/model.bin" From e33862c2792b964556310abe33b938ca6864d9e1 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:51:30 +0000 Subject: [PATCH 078/455] Fix issue not dropping detailed labels --- modules/flowmldetection/flowmldetection.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 3aa030790f..4b05c9b47a 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -94,23 +94,19 @@ def train(self, sum_labeled_flows): Train a model based on the flows we receive and the labels """ try: - # Get the flows from the DB - # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid) - # Convert to pandas df - # self.flows = pd.DataFrame(self.flows) - # Process the features - # X_flow = self.process_features(self.flows) - # Create X_flow with the current flows minus the label - X_flow = self.flows.drop("label", axis=1) - # Create y_flow with the label - y_flow = numpy.full(X_flow.shape[0], self.label) + X_flow = self.flows.drop("ground_truth_label", axis=1) + # Drop the detailed labels + X_flow = X_flow.drop("detailed_ground_truth_label", axis=1) # Drop the module_labels X_flow = X_flow.drop("module_labels", axis=1) + # Create y_flow with the label + y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label) # Normalize this batch of data so far. This can get progressivle slow X_flow = self.scaler.fit_transform(X_flow) + # Train try: self.clf.partial_fit( From ce583a878fa64066e593cb802aae70057db81122 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:51:53 +0000 Subject: [PATCH 079/455] Fix issue that not all labels sere given to the partial fit --- modules/flowmldetection/flowmldetection.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 4b05c9b47a..f12bfaaa66 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -109,8 +109,9 @@ def train(self, sum_labeled_flows): # Train try: + # Online incremental learning self.clf.partial_fit( - X_flow, y_flow, classes=["Malicious", "Benign"] + X_flow, y_flow, classes=["Background", "Malicious", "Benign"] ) except Exception: self.print("Error while calling clf.train()") From 8ec673f039e86599b1260e3a97d7658c0aa81ac5 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:52:08 +0000 Subject: [PATCH 080/455] count partial labels in this epoch --- modules/flowmldetection/flowmldetection.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index f12bfaaa66..0fffda271b 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -106,6 +106,12 @@ def train(self, sum_labeled_flows): # Normalize this batch of data so far. This can get progressivle slow X_flow = self.scaler.fit_transform(X_flow) + # Count the number of labels of each type in this epoc + epoch_label_counts = { + "Background": (y_flow == "Background").sum(), + "Malicious": (y_flow == "Malicious").sum(), + "Benign": (y_flow == "Benign").sum(), + } # Train try: From d4b39eea28c5ff30c1a5ee10ec7c3e874cbaa5bf Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:55:09 +0000 Subject: [PATCH 081/455] Dont print training in screen --- modules/flowmldetection/flowmldetection.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 0fffda271b..f374c2926f 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -126,15 +126,8 @@ def train(self, sum_labeled_flows): # See score so far in training score = self.clf.score(X_flow, y_flow) - # To debug the training score - # self.scores.append(score) - - self.print(f" Training Score: {score}", 0, 1) - # self.print(f' Model Parameters: {self.clf.coef_}') - - # Debug code to store a plot in a png of the scores - # plt.plot(self.scores) - # plt.savefig('train-scores.png') + #self.print(f" Training Score: {score}", 1, 0) + #self.print(f' Model Parameters: {self.clf.coef_}', 1, 0) # Store the models on disk self.store_model() From a2d50c96523ce3f3e344e813286b0653854007cf Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:55:28 +0000 Subject: [PATCH 082/455] Add function to write to train log --- modules/flowmldetection/flowmldetection.py | 28 +++++++++++----------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index f374c2926f..679e7c0cc9 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -247,28 +247,28 @@ def process_features(self, dataset): self.print("Error in process_features()") self.print(traceback.format_exc(), 0, 1) - def process_training_flows(self): + def process_training_flows(self, last_number_of_flows_when_trained): """ - Process all the flows in the DB + Process only the new flows in the DB since the last training. Store the pandas df in self.flows """ try: + # Ensure the index is an integer + if last_number_of_flows_when_trained is None: + last_number_of_flows_when_trained = 0 + else: + last_number_of_flows_when_trained = int(last_number_of_flows_when_trained) + # We get all the flows so far - # because this retraining happens in batches flows = self.db.get_all_flows() - # Check how many different labels are in the DB - # We need both normal and malware + # Only process new flows since last training + new_flows = flows[last_number_of_flows_when_trained:] + + # Check how many **different** labels are in the DB labels = self.db.get_labels() if len(labels) == 1: - # Only 1 label has flows - # There are not enough different labels, so insert two flows - # that are fake but representative of a normal and malware flow - # they are only for the training process - # At least 1 flow of each label is required - - # These flows should be in the same format as the ones in the DB. - # Which means the satate is still SF, S0, etc. - flows.append( + # Insert fake flows for both classes if needed + new_flows.append( { "starttime": 1594417039.029793, "dur": "1.9424750804901123", From 7e6325dab56e081fbb88ec996572fa4bea30e464 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:57:27 +0000 Subject: [PATCH 083/455] Fix label in dummy flow --- modules/flowmldetection/flowmldetection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 679e7c0cc9..95c9b82a74 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -283,13 +283,13 @@ def process_training_flows(self, last_number_of_flows_when_trained): "sbytes": 25517, "dbytes": 17247, "appproto": "ssl", - "label": "Malicious", + "ground_truth_label": "Malicious", "module_labels": { "flowalerts-long-connection": "Malicious" }, } ) - flows.append( + new_flows.append( { "starttime": 1382355032.706468, "dur": "10.896695", From 683d7c17e081820b4df383742c7d481442801188 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:57:39 +0000 Subject: [PATCH 084/455] Fix dummy flow --- modules/flowmldetection/flowmldetection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 95c9b82a74..5ea48fbc40 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -304,7 +304,7 @@ def process_training_flows(self, last_number_of_flows_when_trained): "sbytes": 100, "dbytes": 67596, "appproto": "http", - "label": "Benign", + "ground_truth_label": "Benign", "module_labels": { "flowalerts-long-connection": "Benign" }, From 26a1482c18bdc02ed46b815b60ba720200fafa8e Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:58:28 +0000 Subject: [PATCH 085/455] Rename variable --- modules/flowmldetection/flowmldetection.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 5ea48fbc40..ff68b8a270 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -310,10 +310,9 @@ def process_training_flows(self, last_number_of_flows_when_trained): }, } ) - # If there are enough flows, we dont insert them anymore # Convert to pandas df - df_flows = pd.DataFrame(flows) + df_flows = pd.DataFrame(new_flows) # Process features df_flows = self.process_features(df_flows) @@ -321,7 +320,6 @@ def process_training_flows(self, last_number_of_flows_when_trained): # Update the flow to the processed version self.flows = df_flows except Exception: - # Stop the timer self.print("Error in process_flows()") self.print(traceback.format_exc(), 0, 1) From 34b754a257ce29bc1abe83c883dcd9b6a4076e35 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:00:32 +0000 Subject: [PATCH 086/455] Fix dummy flow label --- modules/flowmldetection/flowmldetection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index ff68b8a270..6b41b40298 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -356,7 +356,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "dir_", "endtime", "flow_source", - "ground_truth_label", # todo now we can use them + "ground_truth_label", "detailed_ground_truth_label", ] # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. From 54f958d42542ed7041fbe43584deb422ac46c591 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:00:47 +0000 Subject: [PATCH 087/455] Pass values to train function --- modules/flowmldetection/flowmldetection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 6b41b40298..4d66aab855 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -521,9 +521,9 @@ def main(self): ) # Process all flows in the DB and make them ready # for pandas - self.process_training_flows() + self.process_training_flows(self.last_number_of_flows_when_trained) # Train an algorithm - self.train() + self.train(sum_labeled_flows) self.last_number_of_flows_when_trained = sum_labeled_flows elif self.mode == "test": From a9236e6297c888d029bab09ffecb7270c0c9914a Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:01:47 +0000 Subject: [PATCH 088/455] import os --- modules/flowmldetection/flowmldetection.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 4d66aab855..766178e127 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -10,6 +10,7 @@ import json import traceback import warnings +import os from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils From 3fe1eaf3d4d4a7446dfee38749eb7349254e38ae Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:02:15 +0000 Subject: [PATCH 089/455] Get issue of total flows zero --- slips_files/core/database/database_manager.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index e8ca3aaf62..892b923b4a 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -661,7 +661,8 @@ def add_software_to_profile(self, *args, **kwargs): return self.rdb.add_software_to_profile(*args, **kwargs) def get_total_flows(self, *args, **kwargs): - return int(self.rdb.get_total_flows(*args, **kwargs)) + total_flows = self.rdb.get_total_flows(*args, **kwargs) + return int(total_flows) if total_flows is not None else 0 def increment_processed_flows(self, *args, **kwargs): return self.rdb.increment_processed_flows(*args, **kwargs) From 73a19e5a500dd615b7e991733a471a3e9ec9aa6c Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:02:32 +0000 Subject: [PATCH 090/455] Add comments --- slips_files/core/database/database_manager.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index 892b923b4a..6dd1d9952e 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -879,7 +879,10 @@ def get_flow(self, *args, **kwargs): """returns the raw flow as read from the log file""" return self.sqlite.get_flow(*args, **kwargs) - def add_flow(self, flow, profileid: str, twid: str, label="benign"): + def add_flow(self, flow, profileid: str, twid: str, label="Benign"): + """ + Just in case, by default if there are no labels in the flow, we consider it Benign + """ # stores it in the db self.sqlite.add_flow(flow, profileid, twid, label=label) # handles the channels and labels etc. From 3e3443af1b2ec4c6d91acf4ed69c76a6928696b7 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:02:51 +0000 Subject: [PATCH 091/455] Rename var name to be more clear --- slips_files/core/profiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slips_files/core/profiler.py b/slips_files/core/profiler.py index a05557b9f2..c0a4261891 100644 --- a/slips_files/core/profiler.py +++ b/slips_files/core/profiler.py @@ -119,7 +119,7 @@ def read_configuration(self): self.local_whitelist_path = conf.local_whitelist_path() self.timeformat = conf.ts_format() self.analysis_direction = conf.analysis_direction() - self.label = conf.label() + self.configuration_label = conf.label() self.width = conf.get_tw_width_as_float() self.client_ips: List[ Union[IPv4Network, IPv6Network, IPv4Address, IPv6Address] From 4c3c3149d67b1dcf2d573a4879fcaab0078f971f Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:03:10 +0000 Subject: [PATCH 092/455] Rename var name --- slips_files/core/profiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slips_files/core/profiler.py b/slips_files/core/profiler.py index c0a4261891..42bf3355e2 100644 --- a/slips_files/core/profiler.py +++ b/slips_files/core/profiler.py @@ -377,7 +377,7 @@ def store_features_going_in(self, profileid: str, twid: str, flow): flow, profileid=profileid, twid=twid, - label=self.label, + label=self.configuration_label, ) self.db.mark_profile_tw_as_modified(profileid, twid, "") From 18b7544ce9c6554b1bf95c4d7d19458df01f4105 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:03:31 +0000 Subject: [PATCH 093/455] Fix processeed flows being zero --- slips/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slips/main.py b/slips/main.py index bd7890f5bc..d960ce318b 100644 --- a/slips/main.py +++ b/slips/main.py @@ -414,7 +414,7 @@ def get_analyzed_flows_percentage(self) -> str: self.total_flows = self.db.get_total_flows() flows_percentage = int( - (self.db.get_processed_flows_so_far() / self.total_flows) * 100 + (self.db.get_processed_flows_so_far() / self.total_flows) * 100 if self.total_flows != 0 else 0 ) return f"Analyzed Flows: {green(flows_percentage)}{green('%')}. " From c221fe75a1a8027f86a35e8080165d37dde8da97 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:03:53 +0000 Subject: [PATCH 094/455] Delete old comments --- modules/flowmldetection/flowmldetection.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 766178e127..6c3bfc1275 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -28,10 +28,6 @@ Method, ) -# Only for debbuging -# from matplotlib import pyplot as plt - - # This horrible hack is only to stop sklearn from printing those warnings def warn(*args, **kwargs): pass From 320e0fedf1ebed269a1c369e6716bb1440a94eca Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:13:22 +0000 Subject: [PATCH 095/455] Fix plots --- modules/flowmldetection/plot_train_score.py | 48 ++++++++++++++++----- 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py index 0b5b5b72ba..359df04eff 100644 --- a/modules/flowmldetection/plot_train_score.py +++ b/modules/flowmldetection/plot_train_score.py @@ -2,6 +2,8 @@ import matplotlib.pyplot as plt import re import sys +import argparse +import os def plot_log_data(file_path): # Read the log data from the file @@ -24,33 +26,59 @@ def plot_log_data(file_path): "Score": float }) + # Get the directory of the log file to store the plot in the same folder + dir_name = os.path.dirname(file_path) + plot_file = os.path.join(dir_name, 'log_data_plot_with_two_scales.png') + # Plotting the values fig, ax1 = plt.subplots(figsize=(10, 6)) - # Plotting Score on the left y-axis + # Plotting Score on the left y-axis (with proper scaling from 0 to 1) ax1.plot(df.index, df["Score"], label="Score", color='tab:blue') ax1.set_xlabel('Index') ax1.set_ylabel('Score', color='tab:blue') + ax1.set_ylim(0, 1) # Set y-axis for Score from 0 to 1 ax1.tick_params(axis='y', labelcolor='tab:blue') - # Create the second y-axis for the Total labels + # Create the second y-axis for the Background, Benign, Malicious, Total labels ax2 = ax1.twinx() + ax2.plot(df.index, df["Background"], label="Background", color='tab:green', linestyle='--') + ax2.plot(df.index, df["Benign"], label="Benign", color='tab:orange', linestyle='--') + ax2.plot(df.index, df["Malicious"], label="Malicious", color='tab:pink', linestyle='--') ax2.plot(df.index, df["Total labels"], label="Total labels", color='tab:red') - ax2.set_ylabel('Total labels', color='tab:red') + ax2.set_ylabel('Background, Benign, Malicious, Total labels', color='tab:red') + + # Set appropriate scale for right y-axis based on the data + ax2.set_ylim(0, df[["Background", "Benign", "Malicious", "Total labels"]].max().max()) ax2.tick_params(axis='y', labelcolor='tab:red') # Adding title and legend plt.title('Log Data Visualization') fig.tight_layout() - # Save plot to a PNG file - plt.savefig('log_data_plot_with_two_scales.png') + # Adding the legend with increased space for readability + ax1.legend(loc='upper left', bbox_to_anchor=(1, 1), fontsize='small') + ax2.legend(loc='upper left', bbox_to_anchor=(1, 0.7), fontsize='small') + + # Increase right margin for better readability of legend + plt.subplots_adjust(right=0.75) + + # Save plot to the same folder as the log file + plt.savefig(plot_file) # Display the plot plt.show() -# Make sure the file path is passed as an argument -if len(sys.argv) < 2: - print("Please provide the path to the log file as a parameter.") -else: - plot_log_data(sys.argv[1]) +def main(): + # Parse command-line arguments + parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.") + parser.add_argument('log_file', metavar='log_file', type=str, help="Path to the log file") + + # Handle -h / --help + args = parser.parse_args() + + # Call the function to process the log file + plot_log_data(args.log_file) + +if __name__ == "__main__": + main() From 1adc33a6d6de83ef13cad648ea6ccfb9f6ceda02 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:14:58 +0000 Subject: [PATCH 096/455] Fix plot --- modules/flowmldetection/plot_train_score.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py index 359df04eff..c7f374a7fe 100644 --- a/modules/flowmldetection/plot_train_score.py +++ b/modules/flowmldetection/plot_train_score.py @@ -40,18 +40,21 @@ def plot_log_data(file_path): ax1.set_ylim(0, 1) # Set y-axis for Score from 0 to 1 ax1.tick_params(axis='y', labelcolor='tab:blue') - # Create the second y-axis for the Background, Benign, Malicious, Total labels + # Create the second y-axis for the Background, Benign, Malicious ax2 = ax1.twinx() ax2.plot(df.index, df["Background"], label="Background", color='tab:green', linestyle='--') ax2.plot(df.index, df["Benign"], label="Benign", color='tab:orange', linestyle='--') ax2.plot(df.index, df["Malicious"], label="Malicious", color='tab:pink', linestyle='--') - ax2.plot(df.index, df["Total labels"], label="Total labels", color='tab:red') - ax2.set_ylabel('Background, Benign, Malicious, Total labels', color='tab:red') + ax2.set_ylabel('Background, Benign, Malicious', color='tab:red') # Set appropriate scale for right y-axis based on the data - ax2.set_ylim(0, df[["Background", "Benign", "Malicious", "Total labels"]].max().max()) + ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max()) ax2.tick_params(axis='y', labelcolor='tab:red') + # Annotating Total labels as text on the plot + for i, value in enumerate(df["Total labels"]): + ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom') + # Adding title and legend plt.title('Log Data Visualization') fig.tight_layout() From 010fbcda3c6183a3a309726519519ffcd0b61927 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:16:23 +0000 Subject: [PATCH 097/455] Fix plot --- modules/flowmldetection/plot_train_score.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py index c7f374a7fe..4099c47c1e 100644 --- a/modules/flowmldetection/plot_train_score.py +++ b/modules/flowmldetection/plot_train_score.py @@ -42,10 +42,10 @@ def plot_log_data(file_path): # Create the second y-axis for the Background, Benign, Malicious ax2 = ax1.twinx() - ax2.plot(df.index, df["Background"], label="Background", color='tab:green', linestyle='--') - ax2.plot(df.index, df["Benign"], label="Benign", color='tab:orange', linestyle='--') - ax2.plot(df.index, df["Malicious"], label="Malicious", color='tab:pink', linestyle='--') - ax2.set_ylabel('Background, Benign, Malicious', color='tab:red') + ax2.plot(df.index, df["Background"], label="Background Labels", color='tab:green', linestyle='--') + ax2.plot(df.index, df["Benign"], label="Benign Labels", color='tab:orange', linestyle='--') + ax2.plot(df.index, df["Malicious"], label="Malicious Labels", color='tab:pink', linestyle='--') + ax2.set_ylabel('Background, Benign, Malicious Labels', color='tab:red') # Set appropriate scale for right y-axis based on the data ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max()) @@ -56,7 +56,7 @@ def plot_log_data(file_path): ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom') # Adding title and legend - plt.title('Log Data Visualization') + plt.title('Training performance') fig.tight_layout() # Adding the legend with increased space for readability From 978eaa02e2d48e6d27ab1c579a90b8a21b666b41 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:24:43 +0000 Subject: [PATCH 098/455] Fix plot --- modules/flowmldetection/plot_train_score.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py index 4099c47c1e..8437e968ac 100644 --- a/modules/flowmldetection/plot_train_score.py +++ b/modules/flowmldetection/plot_train_score.py @@ -59,12 +59,12 @@ def plot_log_data(file_path): plt.title('Training performance') fig.tight_layout() - # Adding the legend with increased space for readability - ax1.legend(loc='upper left', bbox_to_anchor=(1, 1), fontsize='small') - ax2.legend(loc='upper left', bbox_to_anchor=(1, 0.7), fontsize='small') + # Move both legends further to the right + ax1.legend(loc='upper right', bbox_to_anchor=(1.26, 1), fontsize='small', ncol=1) + ax2.legend(loc='upper right', bbox_to_anchor=(1.4, 0.95), fontsize='small', ncol=1) # Increase right margin for better readability of legend - plt.subplots_adjust(right=0.75) + plt.subplots_adjust(right=0.7) # Save plot to the same folder as the log file plt.savefig(plot_file) From 3571750a84fc29ee775f42eb9b90851818defa56 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:02:34 +0000 Subject: [PATCH 099/455] Plot testing performance from a log --- .../plot_testing_performance.py | 89 +++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 modules/flowmldetection/plot_testing_performance.py diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py new file mode 100644 index 0000000000..a38c7f0598 --- /dev/null +++ b/modules/flowmldetection/plot_testing_performance.py @@ -0,0 +1,89 @@ +import matplotlib.pyplot as plt +import sys +import numpy as np + +def process_file(file_path): + # Initialize the counters for the values + FPR_values = [] + FNR_values = [] + TNR_values = [] + TPR_values = [] + F1_values = [] + accuracy_values = [] + precision_values = [] + MCC_values = [] + recall_values = [] + + # Read the file and extract the data + with open(file_path, 'r') as file: + for line in file: + if "TP:" in line: + # Extract the values from the line + parts = line.split(',') + TP = int(parts[0].split(':')[1].strip()) + TN = int(parts[1].split(':')[1].strip()) + FP = int(parts[2].split(':')[1].strip()) + FN = int(parts[3].split(':')[1].strip()) + + # Calculate metrics + FPR = FP / (FP + TN) if (FP + TN) != 0 else 0 + FNR = FN / (FN + TP) if (FN + TP) != 0 else 0 + TNR = TN / (TN + FP) if (TN + FP) != 0 else 0 + TPR = TP / (TP + FN) if (TP + FN) != 0 else 0 + Precision = TP / (TP + FP) if (TP + FP) != 0 else 0 + Recall = TPR # Recall is the same as TPR + F1 = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) != 0 else 0 + Accuracy = (TP + TN) / (TP + TN + FP + FN) + MCC = ((TP * TN) - (FP * FN)) / np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) if ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) != 0 else 0 + + # Append the values to the respective lists + FPR_values.append(FPR) + FNR_values.append(FNR) + TNR_values.append(TNR) + TPR_values.append(TPR) + F1_values.append(F1) + accuracy_values.append(Accuracy) + precision_values.append(Precision) + MCC_values.append(MCC) + recall_values.append(Recall) + + return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values + +def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values): + # Create the plot + plt.figure(figsize=(12, 8)) + + # Plot each metric + plt.plot(FPR_values, label='False Positive Rate (FPR)', marker='o') + plt.plot(FNR_values, label='False Negative Rate (FNR)', marker='o') + plt.plot(TNR_values, label='True Negative Rate (TNR)', marker='o') + plt.plot(TPR_values, label='True Positive Rate (TPR)', marker='o') + plt.plot(F1_values, label='F1 Score', marker='o') + plt.plot(accuracy_values, label='Accuracy', marker='o') + plt.plot(precision_values, label='Precision', marker='o') + plt.plot(MCC_values, label='Matthews Correlation Coefficient (MCC)', marker='o') + plt.plot(recall_values, label='Recall (TPR)', marker='o') + + # Add labels and title + plt.xlabel('Index') + plt.ylabel('Metric Value') + plt.title('Evaluation Metrics Over Time') + + # Add a legend + plt.legend() + + # Save the plot as a PNG file + plt.savefig('metrics_plot.png') + plt.close() + +def main(): + if len(sys.argv) != 2: + print("Usage: python script.py ") + sys.exit(1) + + file_path = sys.argv[1] + FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path) + plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values) + +if __name__ == "__main__": + main() From 1c0ea51fad5afbd9753a1d52c5369baca086a7d3 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:04:32 +0000 Subject: [PATCH 100/455] Fix the plot --- modules/flowmldetection/plot_testing_performance.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index a38c7f0598..fac0acd64a 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -64,16 +64,19 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu plt.plot(MCC_values, label='Matthews Correlation Coefficient (MCC)', marker='o') plt.plot(recall_values, label='Recall (TPR)', marker='o') + # Set logarithmic scale on the y-axis + plt.yscale('log') + # Add labels and title plt.xlabel('Index') - plt.ylabel('Metric Value') - plt.title('Evaluation Metrics Over Time') + plt.ylabel('Metric Value (Log Scale)') + plt.title('Evaluation Metrics Over Time (Log Scale)') # Add a legend plt.legend() # Save the plot as a PNG file - plt.savefig('metrics_plot.png') + plt.savefig('metrics_plot_log_scale.png') plt.close() def main(): From 1bcca14a5068fbb68c8a38962f7b995314cc65d7 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:12:40 +0000 Subject: [PATCH 101/455] Fix the plots --- .../plot_testing_performance.py | 76 ++++++++++++++----- 1 file changed, 55 insertions(+), 21 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index fac0acd64a..5581c72cd4 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -50,33 +50,66 @@ def process_file(file_path): return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values): - # Create the plot - plt.figure(figsize=(12, 8)) + # Separate the values into two groups based on their proximity to 0 or 1 + close_to_0 = { + 'FPR': [], 'FNR': [] + } + close_to_1 = { + 'TNR': [], 'TPR': [], 'F1': [], 'accuracy': [], 'precision': [], 'MCC': [], 'recall': [] + } - # Plot each metric - plt.plot(FPR_values, label='False Positive Rate (FPR)', marker='o') - plt.plot(FNR_values, label='False Negative Rate (FNR)', marker='o') - plt.plot(TNR_values, label='True Negative Rate (TNR)', marker='o') - plt.plot(TPR_values, label='True Positive Rate (TPR)', marker='o') - plt.plot(F1_values, label='F1 Score', marker='o') - plt.plot(accuracy_values, label='Accuracy', marker='o') - plt.plot(precision_values, label='Precision', marker='o') - plt.plot(MCC_values, label='Matthews Correlation Coefficient (MCC)', marker='o') - plt.plot(recall_values, label='Recall (TPR)', marker='o') + # Categorize the metrics into two groups + for i in range(len(FPR_values)): + close_to_0['FPR'].append(FPR_values[i]) + close_to_0['FNR'].append(FNR_values[i]) + + close_to_1['TNR'].append(TNR_values[i]) + close_to_1['TPR'].append(TPR_values[i]) + close_to_1['F1'].append(F1_values[i]) + close_to_1['accuracy'].append(accuracy_values[i]) + close_to_1['precision'].append(precision_values[i]) + close_to_1['MCC'].append(MCC_values[i]) + close_to_1['recall'].append(recall_values[i]) + + # Plot metrics for values close to 0 + plot_single_group(close_to_0, 'metrics_plot_close_to_0.png') - # Set logarithmic scale on the y-axis - plt.yscale('log') + # Plot metrics for values close to 1 + plot_single_group(close_to_1, 'metrics_plot_close_to_1.png') + +def plot_single_group(metrics_dict, output_filename): + plt.figure(figsize=(12, 8)) - # Add labels and title + # Only plot the metrics that exist in the dictionary + if 'FPR' in metrics_dict: + plt.plot(metrics_dict['FPR'], label='False Positive Rate (FPR)', marker='o') + if 'FNR' in metrics_dict: + plt.plot(metrics_dict['FNR'], label='False Negative Rate (FNR)', marker='o') + if 'TNR' in metrics_dict: + plt.plot(metrics_dict['TNR'], label='True Negative Rate (TNR)', marker='o') + if 'TPR' in metrics_dict: + plt.plot(metrics_dict['TPR'], label='True Positive Rate (TPR)', marker='o') + if 'F1' in metrics_dict: + plt.plot(metrics_dict['F1'], label='F1 Score', marker='o') + if 'accuracy' in metrics_dict: + plt.plot(metrics_dict['accuracy'], label='Accuracy', marker='o') + if 'precision' in metrics_dict: + plt.plot(metrics_dict['precision'], label='Precision', marker='o') + if 'MCC' in metrics_dict: + plt.plot(metrics_dict['MCC'], label='Matthews Correlation Coefficient (MCC)', marker='o') + if 'recall' in metrics_dict: + plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o') + + # Apply log scale by default + plt.yscale('log') + plt.xlabel('Index') - plt.ylabel('Metric Value (Log Scale)') - plt.title('Evaluation Metrics Over Time (Log Scale)') - - # Add a legend + plt.ylabel('Metric Value') + plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})') plt.legend() - # Save the plot as a PNG file - plt.savefig('metrics_plot_log_scale.png') + # Save the plot + plt.savefig(output_filename) plt.close() def main(): @@ -85,6 +118,7 @@ def main(): sys.exit(1) file_path = sys.argv[1] + FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path) plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values) From ab4bcd82169f802615ea28755e6735a0c611e2e7 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:16:50 +0000 Subject: [PATCH 102/455] Fix plot --- .../plot_testing_performance.py | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index 5581c72cd4..8f9e12cd86 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -72,12 +72,24 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu close_to_1['recall'].append(recall_values[i]) # Plot metrics for values close to 0 - plot_single_group(close_to_0, 'metrics_plot_close_to_0.png') + plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True) # Plot metrics for values close to 1 plot_single_group(close_to_1, 'metrics_plot_close_to_1.png') -def plot_single_group(metrics_dict, output_filename): + # Print the final values + print("\nFinal Metric Values:") + print(f"Final FPR: {FPR_values[-1]:.4f}") + print(f"Final FNR: {FNR_values[-1]:.4f}") + print(f"Final TNR: {TNR_values[-1]:.4f}") + print(f"Final TPR: {TPR_values[-1]:.4f}") + print(f"Final F1 Score: {F1_values[-1]:.4f}") + print(f"Final Accuracy: {accuracy_values[-1]:.4f}") + print(f"Final Precision: {precision_values[-1]:.4f}") + print(f"Final MCC: {MCC_values[-1]:.4f}") + print(f"Final Recall: {recall_values[-1]:.4f}") + +def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): plt.figure(figsize=(12, 8)) # Only plot the metrics that exist in the dictionary @@ -103,6 +115,12 @@ def plot_single_group(metrics_dict, output_filename): # Apply log scale by default plt.yscale('log') + # If the plot is close to 0, set custom ticks + if is_close_to_0: + # Manually set more Y-ticks for better visibility + plt.ylim(0.0001, 1) # Set Y-axis limits between 0.0001 and 1 + plt.yticks([0.0001, 0.001, 0.01, 0.1, 1], ['0.0001', '0.001', '0.01', '0.1', '1']) # Adjust Y-ticks + plt.xlabel('Index') plt.ylabel('Metric Value') plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})') From b7e0c6f6b4cecc6a446dc322e320183999092fb6 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:20:22 +0000 Subject: [PATCH 103/455] Fix plots --- modules/flowmldetection/flowmldetection.py | 709 +++++---------------- 1 file changed, 143 insertions(+), 566 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 6c3bfc1275..37f0761109 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -1,566 +1,143 @@ -# SPDX-FileCopyrightText: 2021 Sebastian Garcia -from typing import Optional - -# SPDX-License-Identifier: GPL-2.0-only -import numpy -from sklearn.linear_model import SGDClassifier -from sklearn.preprocessing import StandardScaler -import pickle -import pandas as pd -import json -import traceback -import warnings -import os - -from slips_files.common.parsers.config_parser import ConfigParser -from slips_files.common.slips_utils import utils -from slips_files.common.abstracts.module import IModule -from slips_files.core.structures.evidence import ( - Evidence, - ProfileID, - TimeWindow, - Attacker, - ThreatLevel, - EvidenceType, - IoCType, - Direction, - Victim, - Method, -) - -# This horrible hack is only to stop sklearn from printing those warnings -def warn(*args, **kwargs): - pass - - -warnings.warn = warn - - -class FlowMLDetection(IModule): - # Name: short name of the module. Do not use spaces - name = "Flow ML Detection" - description = ( - "Train or test a Machine Learning model to detect malicious flows" - ) - authors = ["Sebastian Garcia"] - - def init(self): - # Subscribe to the channel - self.c1 = self.db.subscribe("new_flow") - self.channels = {"new_flow": self.c1} - self.fieldseparator = self.db.get_field_separator() - # Set the output queue of our database instance - # Read the configuration - self.read_configuration() - # Minum amount of new labels needed to start the train - self.minimum_labels_to_start_train = 50 - # Minum amount of new labels needed to retrain - self.minimum_labels_to_retrain = 50 - # The number of flows when last trained. Used internally only to know - # when to retrain - self.last_number_of_flows_when_trained = 0 - # The scaler trained during training and to use during testing - self.scaler = StandardScaler() - self.model_path = "./modules/flowmldetection/model.bin" - self.scaler_path = "./modules/flowmldetection/scaler.bin" - - # Initialize the training log file - self.training_log_path = "./modules/flowmldetection/training.log" - with open(self.training_log_path, "w") as log_file: - log_file.write("Training Log Initialized\n") - - def read_configuration(self): - conf = ConfigParser() - self.mode = conf.get_ml_mode() - # This is the global label in the configuration, - # in case the flows do not have a label themselves - self.label = conf.label() - - def write_to_training_log(self, message: str): - """ - Write a message to the training log file. - """ - try: - with open(self.training_log_path, "a") as log_file: - log_file.write(message + "\n") - except Exception as e: - self.print(f"Error writing to training log: {e}", 0, 1) - - def train(self, sum_labeled_flows): - """ - Train a model based on the flows we receive and the labels - """ - try: - # Create X_flow with the current flows minus the label - X_flow = self.flows.drop("ground_truth_label", axis=1) - # Drop the detailed labels - X_flow = X_flow.drop("detailed_ground_truth_label", axis=1) - # Drop the module_labels - X_flow = X_flow.drop("module_labels", axis=1) - # Create y_flow with the label - y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label) - - # Normalize this batch of data so far. This can get progressivle slow - X_flow = self.scaler.fit_transform(X_flow) - - # Count the number of labels of each type in this epoc - epoch_label_counts = { - "Background": (y_flow == "Background").sum(), - "Malicious": (y_flow == "Malicious").sum(), - "Benign": (y_flow == "Benign").sum(), - } - - # Train - try: - # Online incremental learning - self.clf.partial_fit( - X_flow, y_flow, classes=["Background", "Malicious", "Benign"] - ) - except Exception: - self.print("Error while calling clf.train()") - self.print(traceback.format_exc(), 0, 1) - - # See score so far in training - score = self.clf.score(X_flow, y_flow) - - #self.print(f" Training Score: {score}", 1, 0) - #self.print(f' Model Parameters: {self.clf.coef_}', 1, 0) - - # Store the models on disk - self.store_model() - - # Log training information - self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}") - #self.write_to_training_log(f"Model parameters: {self.clf.coef_}") - except Exception: - self.print("Error in train().", 0, 1) - self.print(traceback.format_exc(), 0, 1) - self.write_to_training_log("Error occurred during training.") - - def process_features(self, dataset): - """ - Discards some features of the dataset and can create new. - Clean the dataset - """ - try: - # Discard some type of flows that dont have ports - to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""] - for proto in to_discard: - dataset = dataset[dataset.proto != proto] - - # If te proto is in the list to delete and there is only one flow, then the dataset will be empty - if dataset.empty: - # DataFrame is empty now, so return empty - return dataset - - # For now, discard these - to_drop = [ - "appproto", - "daddr", - "saddr", - "starttime", - "type_", - "smac", - "dmac", - "history", - "uid", - "dir_", - "endtime", - "flow_source", - ] - for field in to_drop: - try: - dataset = dataset.drop(field, axis=1) - except (ValueError, KeyError): - pass - - # When flows are read from Slips sqlite, - # the state is not transformed to 'Established' or - # 'Not Established', it is still 'S0' and others - # So transform here - dataset["state"] = dataset.apply( - lambda row: self.db.get_final_state_from_flags( - row["state"], (row["spkts"] + row["dpkts"]) - ), - axis=1, - ) - - # Convert state to categorical - dataset.state = dataset.state.str.replace( - r"(^.*Not Established.*$)", "0", regex=True - ) - dataset.state = dataset.state.str.replace( - r"(^.*Established.*$)", "1", regex=True - ) - - # Convert categories to floats - dataset.state = dataset.state.astype("float64") - - # Convert proto to categorical. For now we only have few states, so we can hardcode... - # We dont use the data to create categories because in testing mode - # we dont see all the protocols - # Also we dont store the Categorizer because the user can retrain - # with its own data. - dataset.proto = dataset.proto.str.lower() - dataset.proto = dataset.proto.str.replace( - r"(^.*tcp.*$)", "0", regex=True - ) - dataset.proto = dataset.proto.str.replace( - r"(^.*udp.*$)", "1", regex=True - ) - dataset.proto = dataset.proto.str.replace( - r"(^.*icmp.*$)", "2", regex=True - ) - dataset.proto = dataset.proto.str.replace( - r"(^.*icmp-ipv6.*$)", "3", regex=True - ) - dataset.proto = dataset.proto.str.replace( - r"(^.*arp.*$)", "4", regex=True - ) - - dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"] - dataset["pkts"] = dataset["spkts"] + dataset["dpkts"] - - fields_to_convert_to_float = [ - dataset.proto, - dataset.dport, - dataset.sport, - dataset.dur, - dataset.pkts, - dataset.spkts, - dataset.allbytes, - dataset.sbytes, - dataset.state, - ] - for field in fields_to_convert_to_float: - try: - field = field.astype("float64") - except (ValueError, AttributeError): - pass - - return dataset - except Exception: - # Stop the timer - self.print("Error in process_features()") - self.print(traceback.format_exc(), 0, 1) - - def process_training_flows(self, last_number_of_flows_when_trained): - """ - Process only the new flows in the DB since the last training. - Store the pandas df in self.flows - """ - try: - # Ensure the index is an integer - if last_number_of_flows_when_trained is None: - last_number_of_flows_when_trained = 0 - else: - last_number_of_flows_when_trained = int(last_number_of_flows_when_trained) - - # We get all the flows so far - flows = self.db.get_all_flows() - # Only process new flows since last training - new_flows = flows[last_number_of_flows_when_trained:] - - # Check how many **different** labels are in the DB - labels = self.db.get_labels() - if len(labels) == 1: - # Insert fake flows for both classes if needed - new_flows.append( - { - "starttime": 1594417039.029793, - "dur": "1.9424750804901123", - "saddr": "10.7.10.101", - "sport": "49733", - "daddr": "40.70.224.145", - "dport": "443", - "proto": "tcp", - "state": "SF", - "spkts": 17, - "dpkts": 27, - "sbytes": 25517, - "dbytes": 17247, - "appproto": "ssl", - "ground_truth_label": "Malicious", - "module_labels": { - "flowalerts-long-connection": "Malicious" - }, - } - ) - new_flows.append( - { - "starttime": 1382355032.706468, - "dur": "10.896695", - "saddr": "147.32.83.52", - "sport": "47956", - "daddr": "80.242.138.72", - "dport": "80", - "proto": "tcp", - "state": "SF", - "spkts": 1, - "dpkts": 0, - "sbytes": 100, - "dbytes": 67596, - "appproto": "http", - "ground_truth_label": "Benign", - "module_labels": { - "flowalerts-long-connection": "Benign" - }, - } - ) - - # Convert to pandas df - df_flows = pd.DataFrame(new_flows) - - # Process features - df_flows = self.process_features(df_flows) - - # Update the flow to the processed version - self.flows = df_flows - except Exception: - self.print("Error in process_flows()") - self.print(traceback.format_exc(), 0, 1) - - def process_flow(self, flow_to_process: dict): - """ - Process one flow. Only used during detection in testing - returns the pandas df with the processed flow - """ - try: - # Convert the flow to a pandas dataframe - raw_flow = pd.DataFrame(flow_to_process, index=[0]) - dflow = self.process_features(raw_flow) - if dflow.empty: - return None - # Update the flow to the processed version - return dflow - except Exception: - # Stop the timer - self.print("Error in process_flow()") - self.print(traceback.format_exc(), 0, 1) - - def detect(self, x_flow) -> Optional[numpy.ndarray]: - """ - Detects the given flow with the current model stored - and returns the predection array - """ - try: - # clean the flow - fields_to_drop = [ - "label", - "module_labels", - "uid", - "history", - "dir_", - "endtime", - "flow_source", - "ground_truth_label", - "detailed_ground_truth_label", - ] - # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. - # Error - ''' [Flow ML Detection] Error in detect() while processing - dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes - 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 - The feature names should match those that were passed during fit. - Feature names unseen at fit time: - - bytes - ''' - - # IF we delete here the filed bytes the error is - # [Flow ML Detection] Error in detect() while processing - # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes - # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 - # The feature names should match those that were passed during fit. - # Feature names must be in the same order as they were in fit. - - for field in fields_to_drop: - try: - x_flow = x_flow.drop(field, axis=1) - except (KeyError, ValueError): - pass - # Scale the flow - x_flow: numpy.ndarray = self.scaler.transform(x_flow) - pred: numpy.ndarray = self.clf.predict(x_flow) - return pred - except Exception as e: - self.print( - f"Error in detect() while processing " f"\n{x_flow}\n{e}" - ) - self.print(traceback.format_exc(), 0, 1) - - def store_model(self): - """ - Store the trained model on disk - """ - self.print("Storing the trained model and scaler on disk.", 0, 2) - with open(self.model_path, "wb") as f: - data = pickle.dumps(self.clf) - f.write(data) - with open(self.scaler_path, "wb") as g: - data = pickle.dumps(self.scaler) - g.write(data) - - def read_model(self): - """ - Read the trained model from disk - """ - try: - self.print("Reading the trained model from disk.", 0, 2) - with open(self.model_path, "rb") as f: - self.clf = pickle.load(f) - self.print("Reading the trained scaler from disk.", 0, 2) - with open(self.scaler_path, "rb") as g: - self.scaler = pickle.load(g) - except FileNotFoundError: - # If there is no model, create one empty - self.print( - "There was no model. " "Creating a new empty model.", 0, 2 - ) - self.clf = SGDClassifier( - warm_start=True, loss="hinge", penalty="l1" - ) - except EOFError: - self.print( - "Error reading model from disk. " - "Creating a new empty model.", - 0, - 2, - ) - self.clf = SGDClassifier( - warm_start=True, loss="hinge", penalty="l1" - ) - - def set_evidence_malicious_flow(self, flow: dict, twid: str): - confidence: float = 0.1 - description = ( - f"Flow with malicious characteristics by ML. Src IP" - f" {flow['saddr']}:{flow['sport']} to " - f"{flow['daddr']}:{flow['dport']}" - ) - twid_number = int(twid.replace("timewindow", "")) - evidence: Evidence = Evidence( - evidence_type=EvidenceType.MALICIOUS_FLOW, - attacker=Attacker( - direction=Direction.SRC, - ioc_type=IoCType.IP, - value=flow["saddr"], - ), - victim=Victim( - direction=Direction.DST, - ioc_type=IoCType.IP, - value=flow["daddr"], - ), - threat_level=ThreatLevel.LOW, - confidence=confidence, - description=description, - profile=ProfileID(ip=flow["saddr"]), - timewindow=TimeWindow(twid_number), - uid=[flow["uid"]], - timestamp=flow["starttime"], - method=Method.AI, - src_port=flow["sport"], - dst_port=flow["dport"], - ) - - self.db.set_evidence(evidence) - - def shutdown_gracefully(self): - # Confirm that the module is done processing - if self.mode == "train": - self.store_model() - - def pre_main(self): - utils.drop_root_privs() - # Load the model - self.read_model() - - def main(self): - if msg := self.get_msg("new_flow"): - # When a new flow arrives - msg = json.loads(msg["data"]) - self.twid = msg["twid"] - self.profileid = msg["profileid"] - self.flow = msg["flow"] - # These following extra fields are expected in testing. update the original - # flow dict to have them - self.flow.update( - { - "state": msg["interpreted_state"], - "label": msg["label"], - "module_labels": msg["module_labels"], - } - ) - - if self.mode == "train": - # We are training - - # Is the amount in the DB of labels enough to retrain? - # Use labeled flows - labels = self.db.get_labels() - sum_labeled_flows = sum(i[1] for i in labels) - - # The min labels to retrain is the min number of flows - # we should have seen so far in this capture to start training - # This is so we dont _start_ training with only 1 flow - - # Once we are over the start minimum, the second condition is - # to force to retrain every a minimum_labels_to_retrain number - # of flows. So we dont retrain every 1 flow. - if ( - sum_labeled_flows >= self.minimum_labels_to_start_train - ): - if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain): - # So for example we retrain every 50 labels and only when - # we have at least 50 labels - self.print( - f"Training the model with the last group of " - f"flows and labels. Total flows: {sum_labeled_flows}." - ) - # Process all flows in the DB and make them ready - # for pandas - self.process_training_flows(self.last_number_of_flows_when_trained) - # Train an algorithm - self.train(sum_labeled_flows) - self.last_number_of_flows_when_trained = sum_labeled_flows - - elif self.mode == "test": - # We are testing, which means using the model to detect - processed_flow = self.process_flow(self.flow) - - # After processing the flow, it may happen that we - # delete icmp/arp/etc so the dataframe can be empty - if processed_flow is not None and not processed_flow.empty: - # Predict - pred: numpy.ndarray = self.detect(processed_flow) - if not pred: - # an error occurred - return - - label = self.flow["label"] - if label and label != "unknown" and label != pred[0]: - # If the user specified a label in test mode, - # and the label is diff from the prediction, - # print in debug mode - self.print( - f"Predicted {pred[0]} for ground-truth label" - f' {label}. Flow {self.flow["saddr"]}:' - f'{self.flow["sport"]} ->' - f' {self.flow["daddr"]}:' - f'{self.flow["dport"]}/' - f'{self.flow["proto"]}', - 0, - 3, - ) - if pred[0] == "Malicious": - # Generate an alert - self.set_evidence_malicious_flow(self.flow, self.twid) - self.print( - f"Prediction {pred[0]} for label {label}" - f' flow {self.flow["saddr"]}:' - f'{self.flow["sport"]} -> ' - f'{self.flow["daddr"]}:' - f'{self.flow["dport"]}/' - f'{self.flow["proto"]}', - 0, - 2, - ) +import matplotlib.pyplot as plt +import sys +import numpy as np + +def process_file(file_path): + # Initialize the counters for the values + FPR_values = [] + FNR_values = [] + TNR_values = [] + TPR_values = [] + F1_values = [] + accuracy_values = [] + precision_values = [] + MCC_values = [] + recall_values = [] + + # Read the file and extract the data + with open(file_path, 'r') as file: + for line in file: + if "TP:" in line: + # Extract the values from the line + parts = line.split(',') + TP = int(parts[0].split(':')[1].strip()) + TN = int(parts[1].split(':')[1].strip()) + FP = int(parts[2].split(':')[1].strip()) + FN = int(parts[3].split(':')[1].strip()) + + # Calculate metrics + FPR = FP / (FP + TN) if (FP + TN) != 0 else 0 + FNR = FN / (FN + TP) if (FN + TP) != 0 else 0 + TNR = TN / (TN + FP) if (TN + FP) != 0 else 0 + TPR = TP / (TP + FN) if (TP + FN) != 0 else 0 + Precision = TP / (TP + FP) if (TP + FP) != 0 else 0 + Recall = TPR # Recall is the same as TPR + F1 = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) != 0 else 0 + Accuracy = (TP + TN) / (TP + TN + FP + FN) + MCC = ((TP * TN) - (FP * FN)) / np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) if ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) != 0 else 0 + + # Append the values to the respective lists + FPR_values.append(FPR) + FNR_values.append(FNR) + TNR_values.append(TNR) + TPR_values.append(TPR) + F1_values.append(F1) + accuracy_values.append(Accuracy) + precision_values.append(Precision) + MCC_values.append(MCC) + recall_values.append(Recall) + + return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values + +def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values): + # Separate the values into two groups based on their proximity to 0 or 1 + close_to_0 = { + 'FPR': [], 'FNR': [] + } + close_to_1 = { + 'TNR': [], 'TPR': [], 'F1': [], 'accuracy': [], 'precision': [], 'MCC': [], 'recall': [] + } + + # Categorize the metrics into two groups + for i in range(len(FPR_values)): + close_to_0['FPR'].append(FPR_values[i]) + close_to_0['FNR'].append(FNR_values[i]) + + close_to_1['TNR'].append(TNR_values[i]) + close_to_1['TPR'].append(TPR_values[i]) + close_to_1['F1'].append(F1_values[i]) + close_to_1['accuracy'].append(accuracy_values[i]) + close_to_1['precision'].append(precision_values[i]) + close_to_1['MCC'].append(MCC_values[i]) + close_to_1['recall'].append(recall_values[i]) + + # Plot metrics for values close to 0 + plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True) + + # Plot metrics for values close to 1 + plot_single_group(close_to_1, 'metrics_plot_close_to_1.png') + + # Print the final values + print("\nFinal Metric Values:") + print(f"Final FPR: {FPR_values[-1]:.4f}") + print(f"Final FNR: {FNR_values[-1]:.4f}") + print(f"Final TNR: {TNR_values[-1]:.4f}") + print(f"Final TPR: {TPR_values[-1]:.4f}") + print(f"Final F1 Score: {F1_values[-1]:.4f}") + print(f"Final Accuracy: {accuracy_values[-1]:.4f}") + print(f"Final Precision: {precision_values[-1]:.4f}") + print(f"Final MCC: {MCC_values[-1]:.4f}") + print(f"Final Recall: {recall_values[-1]:.4f}") + +def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): + plt.figure(figsize=(12, 8)) + + # Only plot the metrics that exist in the dictionary + if 'FPR' in metrics_dict: + plt.plot(metrics_dict['FPR'], label='False Positive Rate (FPR)', marker='o') + if 'FNR' in metrics_dict: + plt.plot(metrics_dict['FNR'], label='False Negative Rate (FNR)', marker='o') + if 'TNR' in metrics_dict: + plt.plot(metrics_dict['TNR'], label='True Negative Rate (TNR)', marker='o') + if 'TPR' in metrics_dict: + plt.plot(metrics_dict['TPR'], label='True Positive Rate (TPR)', marker='o') + if 'F1' in metrics_dict: + plt.plot(metrics_dict['F1'], label='F1 Score', marker='o') + if 'accuracy' in metrics_dict: + plt.plot(metrics_dict['accuracy'], label='Accuracy', marker='o') + if 'precision' in metrics_dict: + plt.plot(metrics_dict['precision'], label='Precision', marker='o') + if 'MCC' in metrics_dict: + plt.plot(metrics_dict['MCC'], label='Matthews Correlation Coefficient (MCC)', marker='o') + if 'recall' in metrics_dict: + plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o') + + # Apply log scale by default + plt.yscale('log') + + # If the plot is close to 0, set custom ticks + if is_close_to_0: + # Add more ticks between 0 and 1 (using a logarithmic scale) + plt.yticks([0.01, 0.1, 1, 10, 100], ['0.01', '0.1', '1', '10', '100']) + + plt.xlabel('Index') + plt.ylabel('Metric Value') + plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})') + plt.legend() + + # Save the plot + plt.savefig(output_filename) + plt.close() + +def main(): + if len(sys.argv) != 2: + print("Usage: python script.py ") + sys.exit(1) + + file_path = sys.argv[1] + + FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path) + plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values) + +if __name__ == "__main__": + main() From 511291517c0ef8a3b791ba1accc72b83363e0425 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:20:52 +0000 Subject: [PATCH 104/455] Fix plots --- .../plot_testing_performance.py | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index 8f9e12cd86..69b8c96a8c 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -71,11 +71,11 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu close_to_1['MCC'].append(MCC_values[i]) close_to_1['recall'].append(recall_values[i]) - # Plot metrics for values close to 0 + # Plot metrics for values close to 0 (linear scale) plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True) - # Plot metrics for values close to 1 - plot_single_group(close_to_1, 'metrics_plot_close_to_1.png') + # Plot metrics for values close to 1 (log scale) + plot_single_group(close_to_1, 'metrics_plot_close_to_1.png', is_close_to_0=False) # Print the final values print("\nFinal Metric Values:") @@ -112,14 +112,21 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): if 'recall' in metrics_dict: plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o') - # Apply log scale by default - plt.yscale('log') + # If the plot is close to 1, apply log scale + if not is_close_to_0: + plt.yscale('log') - # If the plot is close to 0, set custom ticks + # If the plot is close to 0, set dynamic Y-ticks based on the min/max values of the series if is_close_to_0: - # Manually set more Y-ticks for better visibility - plt.ylim(0.0001, 1) # Set Y-axis limits between 0.0001 and 1 - plt.yticks([0.0001, 0.001, 0.01, 0.1, 1], ['0.0001', '0.001', '0.01', '0.1', '1']) # Adjust Y-ticks + min_val = min(min(metrics_dict['FPR']), min(metrics_dict['FNR'])) + max_val = max(max(metrics_dict['FPR']), max(metrics_dict['FNR'])) + + # Avoid log(0), so set the minimum limit a little higher than zero + if min_val == 0: + min_val = 1e-4 # Avoid zero values on the logarithmic scale + + plt.ylim(min_val, max_val) # Set Y-axis limits based on the data range + plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6)) # Set ticks logarithmically plt.xlabel('Index') plt.ylabel('Metric Value') From 17a9c9a356bc8cf489c80dcc736124a3dc22b7b9 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:23:02 +0000 Subject: [PATCH 105/455] Fix plots --- modules/flowmldetection/plot_testing_performance.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index 69b8c96a8c..de4ada38b3 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -123,10 +123,10 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): # Avoid log(0), so set the minimum limit a little higher than zero if min_val == 0: - min_val = 1e-4 # Avoid zero values on the logarithmic scale + min_val = 1e-8 # Avoid zero values on the logarithmic scale plt.ylim(min_val, max_val) # Set Y-axis limits based on the data range - plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6)) # Set ticks logarithmically + plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=60)) # Set ticks logarithmically plt.xlabel('Index') plt.ylabel('Metric Value') From 8561011f8b5d0d3a50932d6f1ff16d90b9986a18 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:25:58 +0000 Subject: [PATCH 106/455] Change plot names --- modules/flowmldetection/plot_testing_performance.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index de4ada38b3..1b4152c6eb 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -72,10 +72,10 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu close_to_1['recall'].append(recall_values[i]) # Plot metrics for values close to 0 (linear scale) - plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True) + plot_single_group(close_to_0, 'performance_metrics_testing_close_to_0.png', is_close_to_0=True) # Plot metrics for values close to 1 (log scale) - plot_single_group(close_to_1, 'metrics_plot_close_to_1.png', is_close_to_0=False) + plot_single_group(close_to_1, 'performnace_metrics_teting_close_to_1.png', is_close_to_0=False) # Print the final values print("\nFinal Metric Values:") From 75db21d8225a7e8ad9ae41e33b1f64f6e1ccf598 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:26:09 +0000 Subject: [PATCH 107/455] Rename file --- .../{plot_train_score.py => plot_train_performance.py} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename modules/flowmldetection/{plot_train_score.py => plot_train_performance.py} (97%) diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_performance.py similarity index 97% rename from modules/flowmldetection/plot_train_score.py rename to modules/flowmldetection/plot_train_performance.py index 8437e968ac..80e13e9515 100644 --- a/modules/flowmldetection/plot_train_score.py +++ b/modules/flowmldetection/plot_train_performance.py @@ -28,7 +28,7 @@ def plot_log_data(file_path): # Get the directory of the log file to store the plot in the same folder dir_name = os.path.dirname(file_path) - plot_file = os.path.join(dir_name, 'log_data_plot_with_two_scales.png') + plot_file = os.path.join(dir_name, 'performance_metrics_training.png') # Plotting the values fig, ax1 = plt.subplots(figsize=(10, 6)) From 4a16fd6ebe7893df77dd14898c3270a989193e21 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:31:32 +0000 Subject: [PATCH 108/455] Recover good flowmldetection deleted by mistake --- modules/flowmldetection/flowmldetection.py | 709 ++++++++++++++++----- 1 file changed, 566 insertions(+), 143 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 37f0761109..5e4e9aa462 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -1,143 +1,566 @@ -import matplotlib.pyplot as plt -import sys -import numpy as np - -def process_file(file_path): - # Initialize the counters for the values - FPR_values = [] - FNR_values = [] - TNR_values = [] - TPR_values = [] - F1_values = [] - accuracy_values = [] - precision_values = [] - MCC_values = [] - recall_values = [] - - # Read the file and extract the data - with open(file_path, 'r') as file: - for line in file: - if "TP:" in line: - # Extract the values from the line - parts = line.split(',') - TP = int(parts[0].split(':')[1].strip()) - TN = int(parts[1].split(':')[1].strip()) - FP = int(parts[2].split(':')[1].strip()) - FN = int(parts[3].split(':')[1].strip()) - - # Calculate metrics - FPR = FP / (FP + TN) if (FP + TN) != 0 else 0 - FNR = FN / (FN + TP) if (FN + TP) != 0 else 0 - TNR = TN / (TN + FP) if (TN + FP) != 0 else 0 - TPR = TP / (TP + FN) if (TP + FN) != 0 else 0 - Precision = TP / (TP + FP) if (TP + FP) != 0 else 0 - Recall = TPR # Recall is the same as TPR - F1 = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) != 0 else 0 - Accuracy = (TP + TN) / (TP + TN + FP + FN) - MCC = ((TP * TN) - (FP * FN)) / np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) if ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) != 0 else 0 - - # Append the values to the respective lists - FPR_values.append(FPR) - FNR_values.append(FNR) - TNR_values.append(TNR) - TPR_values.append(TPR) - F1_values.append(F1) - accuracy_values.append(Accuracy) - precision_values.append(Precision) - MCC_values.append(MCC) - recall_values.append(Recall) - - return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values - -def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values): - # Separate the values into two groups based on their proximity to 0 or 1 - close_to_0 = { - 'FPR': [], 'FNR': [] - } - close_to_1 = { - 'TNR': [], 'TPR': [], 'F1': [], 'accuracy': [], 'precision': [], 'MCC': [], 'recall': [] - } - - # Categorize the metrics into two groups - for i in range(len(FPR_values)): - close_to_0['FPR'].append(FPR_values[i]) - close_to_0['FNR'].append(FNR_values[i]) - - close_to_1['TNR'].append(TNR_values[i]) - close_to_1['TPR'].append(TPR_values[i]) - close_to_1['F1'].append(F1_values[i]) - close_to_1['accuracy'].append(accuracy_values[i]) - close_to_1['precision'].append(precision_values[i]) - close_to_1['MCC'].append(MCC_values[i]) - close_to_1['recall'].append(recall_values[i]) - - # Plot metrics for values close to 0 - plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True) - - # Plot metrics for values close to 1 - plot_single_group(close_to_1, 'metrics_plot_close_to_1.png') - - # Print the final values - print("\nFinal Metric Values:") - print(f"Final FPR: {FPR_values[-1]:.4f}") - print(f"Final FNR: {FNR_values[-1]:.4f}") - print(f"Final TNR: {TNR_values[-1]:.4f}") - print(f"Final TPR: {TPR_values[-1]:.4f}") - print(f"Final F1 Score: {F1_values[-1]:.4f}") - print(f"Final Accuracy: {accuracy_values[-1]:.4f}") - print(f"Final Precision: {precision_values[-1]:.4f}") - print(f"Final MCC: {MCC_values[-1]:.4f}") - print(f"Final Recall: {recall_values[-1]:.4f}") - -def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): - plt.figure(figsize=(12, 8)) - - # Only plot the metrics that exist in the dictionary - if 'FPR' in metrics_dict: - plt.plot(metrics_dict['FPR'], label='False Positive Rate (FPR)', marker='o') - if 'FNR' in metrics_dict: - plt.plot(metrics_dict['FNR'], label='False Negative Rate (FNR)', marker='o') - if 'TNR' in metrics_dict: - plt.plot(metrics_dict['TNR'], label='True Negative Rate (TNR)', marker='o') - if 'TPR' in metrics_dict: - plt.plot(metrics_dict['TPR'], label='True Positive Rate (TPR)', marker='o') - if 'F1' in metrics_dict: - plt.plot(metrics_dict['F1'], label='F1 Score', marker='o') - if 'accuracy' in metrics_dict: - plt.plot(metrics_dict['accuracy'], label='Accuracy', marker='o') - if 'precision' in metrics_dict: - plt.plot(metrics_dict['precision'], label='Precision', marker='o') - if 'MCC' in metrics_dict: - plt.plot(metrics_dict['MCC'], label='Matthews Correlation Coefficient (MCC)', marker='o') - if 'recall' in metrics_dict: - plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o') - - # Apply log scale by default - plt.yscale('log') - - # If the plot is close to 0, set custom ticks - if is_close_to_0: - # Add more ticks between 0 and 1 (using a logarithmic scale) - plt.yticks([0.01, 0.1, 1, 10, 100], ['0.01', '0.1', '1', '10', '100']) - - plt.xlabel('Index') - plt.ylabel('Metric Value') - plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})') - plt.legend() - - # Save the plot - plt.savefig(output_filename) - plt.close() - -def main(): - if len(sys.argv) != 2: - print("Usage: python script.py ") - sys.exit(1) - - file_path = sys.argv[1] - - FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path) - plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values) - -if __name__ == "__main__": - main() +# SPDX-FileCopyrightText: 2021 Sebastian Garcia +from typing import Optional + +# SPDX-License-Identifier: GPL-2.0-only +import numpy +from sklearn.linear_model import SGDClassifier +from sklearn.preprocessing import StandardScaler +import pickle +import pandas as pd +import json +import traceback +import warnings +import os + +from slips_files.common.parsers.config_parser import ConfigParser +from slips_files.common.slips_utils import utils +from slips_files.common.abstracts.module import IModule +from slips_files.core.structures.evidence import ( + Evidence, + ProfileID, + TimeWindow, + Attacker, + ThreatLevel, + EvidenceType, + IoCType, + Direction, + Victim, + Method, +) + +# This horrible hack is only to stop sklearn from printing those warnings +def warn(*args, **kwargs): + pass + + +warnings.warn = warn + + +class FlowMLDetection(IModule): + # Name: short name of the module. Do not use spaces + name = "Flow ML Detection" + description = ( + "Train or test a Machine Learning model to detect malicious flows" + ) + authors = ["Sebastian Garcia"] + + def init(self): + # Subscribe to the channel + self.c1 = self.db.subscribe("new_flow") + self.channels = {"new_flow": self.c1} + self.fieldseparator = self.db.get_field_separator() + # Set the output queue of our database instance + # Read the configuration + self.read_configuration() + # Minum amount of new labels needed to start the train + self.minimum_labels_to_start_train = 50 + # Minum amount of new labels needed to retrain + self.minimum_labels_to_retrain = 50 + # The number of flows when last trained. Used internally only to know + # when to retrain + self.last_number_of_flows_when_trained = 0 + # The scaler trained during training and to use during testing + self.scaler = StandardScaler() + self.model_path = "./modules/flowmldetection/model.bin" + self.scaler_path = "./modules/flowmldetection/scaler.bin" + + # Initialize the training log file + self.training_log_path = "./modules/flowmldetection/training.log" + with open(self.training_log_path, "w") as log_file: + log_file.write("Training Log Initialized\n") + + def read_configuration(self): + conf = ConfigParser() + self.mode = conf.get_ml_mode() + # This is the global label in the configuration, + # in case the flows do not have a label themselves + self.label = conf.label() + + def write_to_training_log(self, message: str): + """ + Write a message to the training log file. + """ + try: + with open(self.training_log_path, "a") as log_file: + log_file.write(message + "\n") + except Exception as e: + self.print(f"Error writing to training log: {e}", 0, 1) + + def train(self, sum_labeled_flows): + """ + Train a model based on the flows we receive and the labels + """ + try: + # Create X_flow with the current flows minus the label + X_flow = self.flows.drop("ground_truth_label", axis=1) + # Drop the detailed labels + X_flow = X_flow.drop("detailed_ground_truth_label", axis=1) + # Drop the module_labels + X_flow = X_flow.drop("module_labels", axis=1) + # Create y_flow with the label + y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label) + + # Normalize this batch of data so far. This can get progressivle slow + X_flow = self.scaler.fit_transform(X_flow) + + # Count the number of labels of each type in this epoc + epoch_label_counts = { + "Background": (y_flow == "Background").sum(), + "Malicious": (y_flow == "Malicious").sum(), + "Benign": (y_flow == "Benign").sum(), + } + + # Train + try: + # Online incremental learning + self.clf.partial_fit( + X_flow, y_flow, classes=["Background", "Malicious", "Benign"] + ) + except Exception: + self.print("Error while calling clf.train()") + self.print(traceback.format_exc(), 0, 1) + + # See score so far in training + score = self.clf.score(X_flow, y_flow) + + #self.print(f" Training Score: {score}", 1, 0) + #self.print(f' Model Parameters: {self.clf.coef_}', 1, 0) + + # Store the models on disk + self.store_model() + + # Log training information + self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}") + #self.write_to_training_log(f"Model parameters: {self.clf.coef_}") + except Exception: + self.print("Error in train().", 0, 1) + self.print(traceback.format_exc(), 0, 1) + self.write_to_training_log("Error occurred during training.") + + def process_features(self, dataset): + """ + Discards some features of the dataset and can create new. + Clean the dataset + """ + try: + # Discard some type of flows that dont have ports + to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""] + for proto in to_discard: + dataset = dataset[dataset.proto != proto] + + # If te proto is in the list to delete and there is only one flow, then the dataset will be empty + if dataset.empty: + # DataFrame is empty now, so return empty + return dataset + + # For now, discard these + to_drop = [ + "appproto", + "daddr", + "saddr", + "starttime", + "type_", + "smac", + "dmac", + "history", + "uid", + "dir_", + "endtime", + "flow_source", + ] + for field in to_drop: + try: + dataset = dataset.drop(field, axis=1) + except (ValueError, KeyError): + pass + + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others + # So transform here + dataset["state"] = dataset.apply( + lambda row: self.db.get_final_state_from_flags( + row["state"], (row["spkts"] + row["dpkts"]) + ), + axis=1, + ) + + # Convert state to categorical + dataset.state = dataset.state.str.replace( + r"(^.*Not Established.*$)", "0", regex=True + ) + dataset.state = dataset.state.str.replace( + r"(^.*Established.*$)", "1", regex=True + ) + + # Convert categories to floats + dataset.state = dataset.state.astype("float64") + + # Convert proto to categorical. For now we only have few states, so we can hardcode... + # We dont use the data to create categories because in testing mode + # we dont see all the protocols + # Also we dont store the Categorizer because the user can retrain + # with its own data. + dataset.proto = dataset.proto.str.lower() + dataset.proto = dataset.proto.str.replace( + r"(^.*tcp.*$)", "0", regex=True + ) + dataset.proto = dataset.proto.str.replace( + r"(^.*udp.*$)", "1", regex=True + ) + dataset.proto = dataset.proto.str.replace( + r"(^.*icmp.*$)", "2", regex=True + ) + dataset.proto = dataset.proto.str.replace( + r"(^.*icmp-ipv6.*$)", "3", regex=True + ) + dataset.proto = dataset.proto.str.replace( + r"(^.*arp.*$)", "4", regex=True + ) + + dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"] + dataset["pkts"] = dataset["spkts"] + dataset["dpkts"] + + fields_to_convert_to_float = [ + dataset.proto, + dataset.dport, + dataset.sport, + dataset.dur, + dataset.pkts, + dataset.spkts, + dataset.allbytes, + dataset.sbytes, + dataset.state, + ] + for field in fields_to_convert_to_float: + try: + field = field.astype("float64") + except (ValueError, AttributeError): + pass + + return dataset + except Exception: + # Stop the timer + self.print("Error in process_features()") + self.print(traceback.format_exc(), 0, 1) + + def process_training_flows(self, last_number_of_flows_when_trained): + """ + Process only the new flows in the DB since the last training. + Store the pandas df in self.flows + """ + try: + # Ensure the index is an integer + if last_number_of_flows_when_trained is None: + last_number_of_flows_when_trained = 0 + else: + last_number_of_flows_when_trained = int(last_number_of_flows_when_trained) + + # We get all the flows so far + flows = self.db.get_all_flows() + # Only process new flows since last training + new_flows = flows[last_number_of_flows_when_trained:] + + # Check how many **different** labels are in the DB + labels = self.db.get_labels() + if len(labels) == 1: + # Insert fake flows for both classes if needed + new_flows.append( + { + "starttime": 1594417039.029793, + "dur": "1.9424750804901123", + "saddr": "10.7.10.101", + "sport": "49733", + "daddr": "40.70.224.145", + "dport": "443", + "proto": "tcp", + "state": "SF", + "spkts": 17, + "dpkts": 27, + "sbytes": 25517, + "dbytes": 17247, + "appproto": "ssl", + "ground_truth_label": "Malicious", + "module_labels": { + "flowalerts-long-connection": "Malicious" + }, + } + ) + new_flows.append( + { + "starttime": 1382355032.706468, + "dur": "10.896695", + "saddr": "147.32.83.52", + "sport": "47956", + "daddr": "80.242.138.72", + "dport": "80", + "proto": "tcp", + "state": "SF", + "spkts": 1, + "dpkts": 0, + "sbytes": 100, + "dbytes": 67596, + "appproto": "http", + "ground_truth_label": "Benign", + "module_labels": { + "flowalerts-long-connection": "Benign" + }, + } + ) + + # Convert to pandas df + df_flows = pd.DataFrame(new_flows) + + # Process features + df_flows = self.process_features(df_flows) + + # Update the flow to the processed version + self.flows = df_flows + except Exception: + self.print("Error in process_flows()") + self.print(traceback.format_exc(), 0, 1) + + def process_flow(self, flow_to_process: dict): + """ + Process one flow. Only used during detection in testing + returns the pandas df with the processed flow + """ + try: + # Convert the flow to a pandas dataframe + raw_flow = pd.DataFrame(flow_to_process, index=[0]) + dflow = self.process_features(raw_flow) + if dflow.empty: + return None + # Update the flow to the processed version + return dflow + except Exception: + # Stop the timer + self.print("Error in process_flow()") + self.print(traceback.format_exc(), 0, 1) + + def detect(self, x_flow) -> Optional[numpy.ndarray]: + """ + Detects the given flow with the current model stored + and returns the predection array + """ + try: + # clean the flow + fields_to_drop = [ + "label", + "module_labels", + "uid", + "history", + "dir_", + "endtime", + "flow_source", + "ground_truth_label", + "detailed_ground_truth_label", + ] + # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. + # Error + ''' [Flow ML Detection] Error in detect() while processing + dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes + 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 + The feature names should match those that were passed during fit. + Feature names unseen at fit time: + - bytes + ''' + + # IF we delete here the filed bytes the error is + # [Flow ML Detection] Error in detect() while processing + # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes + # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 + # The feature names should match those that were passed during fit. + # Feature names must be in the same order as they were in fit. + + for field in fields_to_drop: + try: + x_flow = x_flow.drop(field, axis=1) + except (KeyError, ValueError): + pass + # Scale the flow + x_flow: numpy.ndarray = self.scaler.transform(x_flow) + pred: numpy.ndarray = self.clf.predict(x_flow) + return pred + except Exception as e: + self.print( + f"Error in detect() while processing " f"\n{x_flow}\n{e}" + ) + self.print(traceback.format_exc(), 0, 1) + + def store_model(self): + """ + Store the trained model on disk + """ + self.print("Storing the trained model and scaler on disk.", 0, 2) + with open(self.model_path, "wb") as f: + data = pickle.dumps(self.clf) + f.write(data) + with open(self.scaler_path, "wb") as g: + data = pickle.dumps(self.scaler) + g.write(data) + + def read_model(self): + """ + Read the trained model from disk + """ + try: + self.print("Reading the trained model from disk.", 0, 2) + with open(self.model_path, "rb") as f: + self.clf = pickle.load(f) + self.print("Reading the trained scaler from disk.", 0, 2) + with open(self.scaler_path, "rb") as g: + self.scaler = pickle.load(g) + except FileNotFoundError: + # If there is no model, create one empty + self.print( + "There was no model. " "Creating a new empty model.", 0, 2 + ) + self.clf = SGDClassifier( + warm_start=True, loss="hinge", penalty="l1" + ) + except EOFError: + self.print( + "Error reading model from disk. " + "Creating a new empty model.", + 0, + 2, + ) + self.clf = SGDClassifier( + warm_start=True, loss="hinge", penalty="l1" + ) + + def set_evidence_malicious_flow(self, flow: dict, twid: str): + confidence: float = 0.1 + description = ( + f"Flow with malicious characteristics by ML. Src IP" + f" {flow['saddr']}:{flow['sport']} to " + f"{flow['daddr']}:{flow['dport']}" + ) + twid_number = int(twid.replace("timewindow", "")) + evidence: Evidence = Evidence( + evidence_type=EvidenceType.MALICIOUS_FLOW, + attacker=Attacker( + direction=Direction.SRC, + ioc_type=IoCType.IP, + value=flow["saddr"], + ), + victim=Victim( + direction=Direction.DST, + ioc_type=IoCType.IP, + value=flow["daddr"], + ), + threat_level=ThreatLevel.LOW, + confidence=confidence, + description=description, + profile=ProfileID(ip=flow["saddr"]), + timewindow=TimeWindow(twid_number), + uid=[flow["uid"]], + timestamp=flow["starttime"], + method=Method.AI, + src_port=flow["sport"], + dst_port=flow["dport"], + ) + + self.db.set_evidence(evidence) + + def shutdown_gracefully(self): + # Confirm that the module is done processing + if self.mode == "train": + self.store_model() + + def pre_main(self): + utils.drop_root_privs() + # Load the model + self.read_model() + + def main(self): + if msg := self.get_msg("new_flow"): + # When a new flow arrives + msg = json.loads(msg["data"]) + self.twid = msg["twid"] + self.profileid = msg["profileid"] + self.flow = msg["flow"] + # These following extra fields are expected in testing. update the original + # flow dict to have them + self.flow.update( + { + "state": msg["interpreted_state"], + "label": msg["label"], + "module_labels": msg["module_labels"], + } + ) + + if self.mode == "train": + # We are training + + # Is the amount in the DB of labels enough to retrain? + # Use labeled flows + labels = self.db.get_labels() + sum_labeled_flows = sum(i[1] for i in labels) + + # The min labels to retrain is the min number of flows + # we should have seen so far in this capture to start training + # This is so we dont _start_ training with only 1 flow + + # Once we are over the start minimum, the second condition is + # to force to retrain every a minimum_labels_to_retrain number + # of flows. So we dont retrain every 1 flow. + if ( + sum_labeled_flows >= self.minimum_labels_to_start_train + ): + if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain): + # So for example we retrain every 50 labels and only when + # we have at least 50 labels + self.print( + f"Training the model with the last group of " + f"flows and labels. Total flows: {sum_labeled_flows}." + ) + # Process all flows in the DB and make them ready + # for pandas + self.process_training_flows(self.last_number_of_flows_when_trained) + # Train an algorithm + self.train(sum_labeled_flows) + self.last_number_of_flows_when_trained = sum_labeled_flows + + elif self.mode == "test": + # We are testing, which means using the model to detect + processed_flow = self.process_flow(self.flow) + + # After processing the flow, it may happen that we + # delete icmp/arp/etc so the dataframe can be empty + if processed_flow is not None and not processed_flow.empty: + # Predict + pred: numpy.ndarray = self.detect(processed_flow) + if not pred: + # an error occurred + return + + label = self.flow["label"] + if label and label != "unknown" and label != pred[0]: + # If the user specified a label in test mode, + # and the label is diff from the prediction, + # print in debug mode + self.print( + f"Predicted {pred[0]} for ground-truth label" + f' {label}. Flow {self.flow["saddr"]}:' + f'{self.flow["sport"]} ->' + f' {self.flow["daddr"]}:' + f'{self.flow["dport"]}/' + f'{self.flow["proto"]}', + 0, + 3, + ) + if pred[0] == "Malicious": + # Generate an alert + self.set_evidence_malicious_flow(self.flow, self.twid) + self.print( + f"Prediction {pred[0]} for label {label}" + f' flow {self.flow["saddr"]}:' + f'{self.flow["sport"]} -> ' + f'{self.flow["daddr"]}:' + f'{self.flow["dport"]}/' + f'{self.flow["proto"]}', + 0, + 2, + ) \ No newline at end of file From 1fcb086b1a756442a338e39e63634b1c95402d21 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:31:43 +0000 Subject: [PATCH 109/455] Fix plot test --- modules/flowmldetection/plot_testing_performance.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index 1b4152c6eb..977a68b2d5 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -75,7 +75,7 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu plot_single_group(close_to_0, 'performance_metrics_testing_close_to_0.png', is_close_to_0=True) # Plot metrics for values close to 1 (log scale) - plot_single_group(close_to_1, 'performnace_metrics_teting_close_to_1.png', is_close_to_0=False) + plot_single_group(close_to_1, 'performance_metrics_testing_close_to_1.png', is_close_to_0=False) # Print the final values print("\nFinal Metric Values:") @@ -123,10 +123,10 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): # Avoid log(0), so set the minimum limit a little higher than zero if min_val == 0: - min_val = 1e-8 # Avoid zero values on the logarithmic scale + min_val = 1e-4 # Avoid zero values on the logarithmic scale plt.ylim(min_val, max_val) # Set Y-axis limits based on the data range - plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=60)) # Set ticks logarithmically + plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6)) # Set ticks logarithmically plt.xlabel('Index') plt.ylabel('Metric Value') From 7b18a530e0525f810109cc4ea78138707a588d24 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:50:33 +0000 Subject: [PATCH 110/455] Add testing code to evaluate performance. It is optional with a varible --- modules/flowmldetection/flowmldetection.py | 60 +++++++++++++++------- 1 file changed, 42 insertions(+), 18 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 5e4e9aa462..b17a1baaf0 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -526,36 +526,21 @@ def main(self): elif self.mode == "test": # We are testing, which means using the model to detect processed_flow = self.process_flow(self.flow) - # After processing the flow, it may happen that we # delete icmp/arp/etc so the dataframe can be empty if processed_flow is not None and not processed_flow.empty: + original_label = processed_flow["ground_truth_label"].iloc[0] # Predict pred: numpy.ndarray = self.detect(processed_flow) if not pred: # an error occurred return - label = self.flow["label"] - if label and label != "unknown" and label != pred[0]: - # If the user specified a label in test mode, - # and the label is diff from the prediction, - # print in debug mode - self.print( - f"Predicted {pred[0]} for ground-truth label" - f' {label}. Flow {self.flow["saddr"]}:' - f'{self.flow["sport"]} ->' - f' {self.flow["daddr"]}:' - f'{self.flow["dport"]}/' - f'{self.flow["proto"]}', - 0, - 3, - ) if pred[0] == "Malicious": # Generate an alert self.set_evidence_malicious_flow(self.flow, self.twid) self.print( - f"Prediction {pred[0]} for label {label}" + f"Prediction {pred[0]} for label {original_label}" f' flow {self.flow["saddr"]}:' f'{self.flow["sport"]} -> ' f'{self.flow["daddr"]}:' @@ -563,4 +548,43 @@ def main(self): f'{self.flow["proto"]}', 0, 2, - ) \ No newline at end of file + ) + + # So you can disable this code easily. Since it is used only for evaluating a testing + log_testing_data = True + if log_testing_data: + # Initialize counters if not already done + if not hasattr(self, 'tp'): + self.tp = 0 + if not hasattr(self, 'tn'): + self.tn = 0 + if not hasattr(self, 'fp'): + self.fp = 0 + if not hasattr(self, 'fn'): + self.fn = 0 + + + # Update counters based on predictions and labels + if pred[0] == "Malicious" and original_label == "Malicious": + self.tp += 1 + elif pred[0] == "Benign" and original_label == "Benign": + self.tn += 1 + elif pred[0] == "Malicious" and original_label == "Benign": + self.fp += 1 + elif pred[0] == "Benign" and original_label == "Malicious": + self.fn += 1 + + testing_log_path = "./modules/flowmldetection/testing_performance.log" + try: + with open(testing_log_path, "a") as log_file: + log_file.write("Testing Performance Log Initialized\n") + # Log the testing performance metrics + log_file.write(f"TP: {self.tp}, TN: {self.tn}, FP: {self.fp}, FN: {self.fn}\n") + + # Log the original flow for false positives and false negatives + if pred[0] == "Malicious" and original_label == "Benign": + log_file.write(f"False Positive Flow: {self.flow}\n") + elif pred[0] == "Benign" and original_label == "Malicious": + log_file.write(f"False Negative Flow: {self.flow}\n") + except Exception as e: + self.print(f"Error initializing testing performance log: {e}", 0, 1) \ No newline at end of file From 4e8cbda03b1b8c7357c818f241b53b67afc86567 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 19:04:00 +0000 Subject: [PATCH 111/455] Fix plots --- .../plot_testing_performance.py | 30 +++++++++++-------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index 977a68b2d5..6865415cdf 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -1,6 +1,7 @@ import matplotlib.pyplot as plt import sys import numpy as np +import argparse def process_file(file_path): # Initialize the counters for the values @@ -49,7 +50,7 @@ def process_file(file_path): return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values -def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values): +def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, experiment_number): # Separate the values into two groups based on their proximity to 0 or 1 close_to_0 = { 'FPR': [], 'FNR': [] @@ -72,13 +73,13 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu close_to_1['recall'].append(recall_values[i]) # Plot metrics for values close to 0 (linear scale) - plot_single_group(close_to_0, 'performance_metrics_testing_close_to_0.png', is_close_to_0=True) + plot_single_group(close_to_0, f'performance_metrics_testing_close_to_0_experiment_{experiment_number}.png', experiment_number, is_close_to_0=True) # Plot metrics for values close to 1 (log scale) - plot_single_group(close_to_1, 'performance_metrics_testing_close_to_1.png', is_close_to_0=False) + plot_single_group(close_to_1, f'performance_metrics_testing_close_to_1_experiment_{experiment_number}.png', experiment_number, is_close_to_0=False) # Print the final values - print("\nFinal Metric Values:") + print("\nFinal Metric Values for Experiment", experiment_number) print(f"Final FPR: {FPR_values[-1]:.4f}") print(f"Final FNR: {FNR_values[-1]:.4f}") print(f"Final TNR: {TNR_values[-1]:.4f}") @@ -89,7 +90,7 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu print(f"Final MCC: {MCC_values[-1]:.4f}") print(f"Final Recall: {recall_values[-1]:.4f}") -def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): +def plot_single_group(metrics_dict, output_filename, experiment_number, is_close_to_0=False): plt.figure(figsize=(12, 8)) # Only plot the metrics that exist in the dictionary @@ -126,11 +127,12 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): min_val = 1e-4 # Avoid zero values on the logarithmic scale plt.ylim(min_val, max_val) # Set Y-axis limits based on the data range - plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6)) # Set ticks logarithmically + plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=60)) # Set ticks logarithmically + # Add the experiment number to the plot title plt.xlabel('Index') plt.ylabel('Metric Value') - plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})') + plt.title(f'Experiment {experiment_number} - Evaluation Metrics Over Time') plt.legend() # Save the plot @@ -138,14 +140,18 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): plt.close() def main(): - if len(sys.argv) != 2: - print("Usage: python script.py ") - sys.exit(1) + # Set up argument parsing + parser = argparse.ArgumentParser(description='Plot testing performance metrics.') + parser.add_argument('-f', '--file', type=str, required=True, help='Path to the testing performance log file') + parser.add_argument('-e', '--experiment', type=str, required=True, help='Experiment number') + + args = parser.parse_args() - file_path = sys.argv[1] + file_path = args.file + experiment_number = args.experiment FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path) - plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values) + plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, experiment_number) if __name__ == "__main__": main() From d4cc5625cb18e8207c7aa6e1a42a5a88e3d57134 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 19:14:51 +0000 Subject: [PATCH 112/455] Fix train plot --- .../flowmldetection/plot_train_performance.py | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/modules/flowmldetection/plot_train_performance.py b/modules/flowmldetection/plot_train_performance.py index 80e13e9515..244df13d28 100644 --- a/modules/flowmldetection/plot_train_performance.py +++ b/modules/flowmldetection/plot_train_performance.py @@ -5,7 +5,7 @@ import argparse import os -def plot_log_data(file_path): +def plot_log_data(file_path, experiment_number): # Read the log data from the file with open(file_path, 'r') as file: log_data = file.read() @@ -28,7 +28,8 @@ def plot_log_data(file_path): # Get the directory of the log file to store the plot in the same folder dir_name = os.path.dirname(file_path) - plot_file = os.path.join(dir_name, 'performance_metrics_training.png') + # Append experiment number to the filename + plot_file = os.path.join(dir_name, f'performance_metrics_training_{experiment_number}.png') # Plotting the values fig, ax1 = plt.subplots(figsize=(10, 6)) @@ -55,18 +56,18 @@ def plot_log_data(file_path): for i, value in enumerate(df["Total labels"]): ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom') - # Adding title and legend - plt.title('Training performance') + # Adding title and legend with experiment number in title + plt.title(f'Training performance - Experiment {experiment_number}') fig.tight_layout() # Move both legends further to the right - ax1.legend(loc='upper right', bbox_to_anchor=(1.26, 1), fontsize='small', ncol=1) - ax2.legend(loc='upper right', bbox_to_anchor=(1.4, 0.95), fontsize='small', ncol=1) + ax1.legend(loc='upper right', bbox_to_anchor=(1.3, 1), fontsize='small', ncol=1) + ax2.legend(loc='upper right', bbox_to_anchor=(1.3, 0.85), fontsize='small', ncol=1) # Increase right margin for better readability of legend - plt.subplots_adjust(right=0.7) + plt.subplots_adjust(right=0.75) - # Save plot to the same folder as the log file + # Save plot to the same folder as the log file with experiment number in filename plt.savefig(plot_file) # Display the plot @@ -75,13 +76,14 @@ def plot_log_data(file_path): def main(): # Parse command-line arguments parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.") - parser.add_argument('log_file', metavar='log_file', type=str, help="Path to the log file") + parser.add_argument('-f', '--file', metavar='log_file', type=str, required=True, help="Path to the log file") + parser.add_argument('-e', '--experiment', metavar='experiment_number', type=str, required=True, help="Experiment number to add to the filename") # Handle -h / --help args = parser.parse_args() # Call the function to process the log file - plot_log_data(args.log_file) + plot_log_data(args.file, args.experiment) if __name__ == "__main__": main() From d3b0190e39beb89cffaf8ad51a2cec0d787f7920 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 21:14:48 +0000 Subject: [PATCH 113/455] Fix plots --- .../flowmldetection/plot_train_performance.py | 122 ++++++++++-------- 1 file changed, 71 insertions(+), 51 deletions(-) diff --git a/modules/flowmldetection/plot_train_performance.py b/modules/flowmldetection/plot_train_performance.py index 244df13d28..5212dfeeaf 100644 --- a/modules/flowmldetection/plot_train_performance.py +++ b/modules/flowmldetection/plot_train_performance.py @@ -4,85 +4,105 @@ import sys import argparse import os +import matplotlib.ticker as ticker def plot_log_data(file_path, experiment_number): # Read the log data from the file with open(file_path, 'r') as file: log_data = file.read() - # Define regex pattern to extract relevant data from each line - pattern = r"Background: (\d+). Benign: (\d+). Malicious: (\d+). Total labels: (\d+\.\d+). Score: (\d+\.\d+)" + # Regex pattern for the new log format + pattern = ( + r"Total labels: ([\d\.]+), Background: (\d+). Benign: (\d+). Malicious: (\d+). Metrics: " + r"FPR=([\d\.]+), TNR=([\d\.]+), TPR=([\d\.]+), FNR=([\d\.]+), " + r"F1=([\d\.]+), Precision=([\d\.]+), Accuracy=([\d\.]+), MCC=([\d\.]+), Recall=([\d\.]+)\." + ) # Parse the log file data = re.findall(pattern, log_data) # Convert data to a DataFrame - df = pd.DataFrame(data, columns=["Background", "Benign", "Malicious", "Total labels", "Score"]) + columns = [ + "Total labels", "Background", "Benign", "Malicious", + "FPR", "TNR", "TPR", "FNR", "F1", "Precision", "Accuracy", "MCC", "Recall" + ] + df = pd.DataFrame(data, columns=columns) df = df.astype({ + "Total labels": float, "Background": int, "Benign": int, "Malicious": int, - "Total labels": float, - "Score": float + "FPR": float, + "TNR": float, + "TPR": float, + "FNR": float, + "F1": float, + "Precision": float, + "Accuracy": float, + "MCC": float, + "Recall": float, }) - # Get the directory of the log file to store the plot in the same folder dir_name = os.path.dirname(file_path) - # Append experiment number to the filename - plot_file = os.path.join(dir_name, f'performance_metrics_training_{experiment_number}.png') - - # Plotting the values - fig, ax1 = plt.subplots(figsize=(10, 6)) - # Plotting Score on the left y-axis (with proper scaling from 0 to 1) - ax1.plot(df.index, df["Score"], label="Score", color='tab:blue') + # --- Plot 1: Number of labels (linear scale, no total labels) --- + fig1, ax1 = plt.subplots(figsize=(10, 6)) + ax1.plot(df.index, df["Background"], label="Background", color='black') + ax1.plot(df.index, df["Benign"], label="Benign", color='cyan') + ax1.plot(df.index, df["Malicious"], label="Malicious", color='magenta') ax1.set_xlabel('Index') - ax1.set_ylabel('Score', color='tab:blue') - ax1.set_ylim(0, 1) # Set y-axis for Score from 0 to 1 - ax1.tick_params(axis='y', labelcolor='tab:blue') - - # Create the second y-axis for the Background, Benign, Malicious - ax2 = ax1.twinx() - ax2.plot(df.index, df["Background"], label="Background Labels", color='tab:green', linestyle='--') - ax2.plot(df.index, df["Benign"], label="Benign Labels", color='tab:orange', linestyle='--') - ax2.plot(df.index, df["Malicious"], label="Malicious Labels", color='tab:pink', linestyle='--') - ax2.set_ylabel('Background, Benign, Malicious Labels', color='tab:red') - - # Set appropriate scale for right y-axis based on the data - ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max()) - ax2.tick_params(axis='y', labelcolor='tab:red') - - # Annotating Total labels as text on the plot - for i, value in enumerate(df["Total labels"]): - ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom') - - # Adding title and legend with experiment number in title - plt.title(f'Training performance - Experiment {experiment_number}') - fig.tight_layout() + ax1.set_ylabel('Label Counts') + # No log scale here + ax1.set_title(f'Label Counts - Experiment {experiment_number}') + ax1.legend() + ax1.yaxis.set_major_locator(ticker.MaxNLocator(70)) + plt.tight_layout() + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_labels.png')) + + # --- Plot 2: FNR and FPR (log scale) --- + fig2, ax2 = plt.subplots(figsize=(10, 6)) + ax2.plot(df.index, df["FNR"], label="FNR", color='red') + ax2.plot(df.index, df["FPR"], label="FPR", color='blue') + ax2.set_xlabel('Index') + ax2.set_ylabel('Rate') + ax2.set_yscale('log') + ax2.set_title(f'FNR and FPR - Experiment {experiment_number}') + ax2.legend() + ax2.yaxis.set_major_locator(ticker.MaxNLocator(100)) + plt.tight_layout() + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_fnr_fpr.png')) + + # --- Plot 3: Other metrics (log scale) --- + fig3, ax3 = plt.subplots(figsize=(12, 7)) + metrics_rest = ["TNR", "TPR", "F1", "Precision", "Accuracy", "MCC", "Recall"] + colors_rest = [ + 'tab:blue', 'tab:green', 'tab:purple', 'tab:brown', + 'tab:gray', 'tab:pink', 'tab:olive' + ] + for metric, color in zip(metrics_rest, colors_rest): + ax3.plot(df.index, df[metric], label=metric, color=color) + ax3.set_xlabel('Index') + ax3.set_ylabel('Metric Value') + ax3.set_yscale('log') + ax3.set_title(f'Performance Metrics (except FNR/FPR) - Experiment {experiment_number}') + ax3.legend() + ax3.yaxis.set_major_locator(ticker.MaxNLocator(50)) + plt.tight_layout() + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_other_metrics.png')) - # Move both legends further to the right - ax1.legend(loc='upper right', bbox_to_anchor=(1.3, 1), fontsize='small', ncol=1) - ax2.legend(loc='upper right', bbox_to_anchor=(1.3, 0.85), fontsize='small', ncol=1) - - # Increase right margin for better readability of legend - plt.subplots_adjust(right=0.75) - - # Save plot to the same folder as the log file with experiment number in filename - plt.savefig(plot_file) - - # Display the plot plt.show() + # --- Print final values in terminal --- + print("\nFinal values at last training step:") + for col in ["Total labels", "Background", "Benign", "Malicious", + "FPR", "TNR", "TPR", "FNR", "F1", "Precision", "Accuracy", "MCC", "Recall"]: + print(f"{col}: {df[col].iloc[-1]}") + def main(): - # Parse command-line arguments parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.") parser.add_argument('-f', '--file', metavar='log_file', type=str, required=True, help="Path to the log file") parser.add_argument('-e', '--experiment', metavar='experiment_number', type=str, required=True, help="Experiment number to add to the filename") - - # Handle -h / --help args = parser.parse_args() - - # Call the function to process the log file plot_log_data(args.file, args.experiment) if __name__ == "__main__": From aa8331fa4417cf3912a623528607f7480edcb796 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 21:16:01 +0000 Subject: [PATCH 114/455] Add performance metrics to the training evaluation --- modules/flowmldetection/flowmldetection.py | 58 +++++++++++++++++----- 1 file changed, 46 insertions(+), 12 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index b17a1baaf0..2c60cd4034 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -10,7 +10,16 @@ import json import traceback import warnings -import os +from sklearn.metrics import classification_report, confusion_matrix +from sklearn.metrics import ( + confusion_matrix, + f1_score, + precision_score, + accuracy_score, + matthews_corrcoef, + recall_score, +) + from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils @@ -86,21 +95,21 @@ def write_to_training_log(self, message: str): except Exception as e: self.print(f"Error writing to training log: {e}", 0, 1) - def train(self, sum_labeled_flows): + def train(self, sum_labeled_flows, last_number_of_flows_when_trained): """ Train a model based on the flows we receive and the labels """ try: + # Create y_flow with the label + y_flow = numpy.full(self.flows.shape[0], self.flows.ground_truth_label) # Create X_flow with the current flows minus the label X_flow = self.flows.drop("ground_truth_label", axis=1) # Drop the detailed labels X_flow = X_flow.drop("detailed_ground_truth_label", axis=1) # Drop the module_labels X_flow = X_flow.drop("module_labels", axis=1) - # Create y_flow with the label - y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label) - # Normalize this batch of data so far. This can get progressivle slow + # Normalize this batch of data so far. This can get progressively slow X_flow = self.scaler.fit_transform(X_flow) # Count the number of labels of each type in this epoc @@ -120,18 +129,43 @@ def train(self, sum_labeled_flows): self.print("Error while calling clf.train()") self.print(traceback.format_exc(), 0, 1) - # See score so far in training - score = self.clf.score(X_flow, y_flow) + # Predict on the training data + y_pred = self.clf.predict(X_flow) - #self.print(f" Training Score: {score}", 1, 0) - #self.print(f' Model Parameters: {self.clf.coef_}', 1, 0) + # For metrics, let's focus on Malicious vs Benign (ignore Background) + mask = (y_flow == "Malicious") | (y_flow == "Benign") + y_true_bin = y_flow[mask] + y_pred_bin = y_pred[mask] + + # Map to binary: Malicious=1, Benign=0 + y_true_bin = numpy.where(y_true_bin == "Malicious", 1, 0) + y_pred_bin = numpy.where(y_pred_bin == "Malicious", 1, 0) + + # Compute confusion matrix: tn, fp, fn, tp + tn, fp, fn, tp = confusion_matrix(y_true_bin, y_pred_bin, labels=[0,1]).ravel() if len(set(y_true_bin)) > 1 else (0,0,0,0) + + # Compute metrics + FPR = fp / (fp + tn) if (fp + tn) > 0 else 0 + TNR = tn / (tn + fp) if (tn + fp) > 0 else 0 + TPR = tp / (tp + fn) if (tp + fn) > 0 else 0 + FNR = fn / (fn + tp) if (fn + tp) > 0 else 0 + F1 = f1_score(y_true_bin, y_pred_bin, zero_division=0) + PREC = precision_score(y_true_bin, y_pred_bin, zero_division=0) + ACCU = accuracy_score(y_true_bin, y_pred_bin) + MCC = matthews_corrcoef(y_true_bin, y_pred_bin) if len(set(y_true_bin)) > 1 else 0 + RECALL = recall_score(y_true_bin, y_pred_bin, zero_division=0) # Store the models on disk self.store_model() # Log training information - self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}") - #self.write_to_training_log(f"Model parameters: {self.clf.coef_}") + self.write_to_training_log( + f"Total labels: {sum_labeled_flows}, " + f"Background: {epoch_label_counts['Background']}. " + f"Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. " + f"Metrics: FPR={FPR:.4f}, TNR={TNR:.4f}, TPR={TPR:.4f}, FNR={FNR:.4f}, " + f"F1={F1:.4f}, Precision={PREC:.4f}, Accuracy={ACCU:.4f}, MCC={MCC:.4f}, Recall={RECALL:.4f}." + ) except Exception: self.print("Error in train().", 0, 1) self.print(traceback.format_exc(), 0, 1) @@ -520,7 +554,7 @@ def main(self): # for pandas self.process_training_flows(self.last_number_of_flows_when_trained) # Train an algorithm - self.train(sum_labeled_flows) + self.train(sum_labeled_flows, self.last_number_of_flows_when_trained) self.last_number_of_flows_when_trained = sum_labeled_flows elif self.mode == "test": From bbd6e0a0e40db29a29481ac4839b4efa42252b34 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sun, 4 May 2025 12:50:46 +0000 Subject: [PATCH 115/455] Fix experiment names --- modules/flowmldetection/plot_train_performance.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/modules/flowmldetection/plot_train_performance.py b/modules/flowmldetection/plot_train_performance.py index 5212dfeeaf..304f0f4ead 100644 --- a/modules/flowmldetection/plot_train_performance.py +++ b/modules/flowmldetection/plot_train_performance.py @@ -52,12 +52,12 @@ def plot_log_data(file_path, experiment_number): ax1.plot(df.index, df["Malicious"], label="Malicious", color='magenta') ax1.set_xlabel('Index') ax1.set_ylabel('Label Counts') - # No log scale here ax1.set_title(f'Label Counts - Experiment {experiment_number}') ax1.legend() ax1.yaxis.set_major_locator(ticker.MaxNLocator(70)) + ax1.xaxis.set_major_locator(ticker.MaxNLocator(50)) plt.tight_layout() - plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_labels.png')) + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_labels.png')) # --- Plot 2: FNR and FPR (log scale) --- fig2, ax2 = plt.subplots(figsize=(10, 6)) @@ -69,8 +69,9 @@ def plot_log_data(file_path, experiment_number): ax2.set_title(f'FNR and FPR - Experiment {experiment_number}') ax2.legend() ax2.yaxis.set_major_locator(ticker.MaxNLocator(100)) + ax2.xaxis.set_major_locator(ticker.MaxNLocator(50)) plt.tight_layout() - plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_fnr_fpr.png')) + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_fnr_fpr.png')) # --- Plot 3: Other metrics (log scale) --- fig3, ax3 = plt.subplots(figsize=(12, 7)) @@ -87,8 +88,9 @@ def plot_log_data(file_path, experiment_number): ax3.set_title(f'Performance Metrics (except FNR/FPR) - Experiment {experiment_number}') ax3.legend() ax3.yaxis.set_major_locator(ticker.MaxNLocator(50)) + ax3.xaxis.set_major_locator(ticker.MaxNLocator(50)) plt.tight_layout() - plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_other_metrics.png')) + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_other_metrics.png')) plt.show() From 416bc48fd70e9f92b8a4cf4a192ae9f05a2ce4fc Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 116/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 154 ++++++++++++++++++++- 1 file changed, 150 insertions(+), 4 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index e44ac83f4d..16b67e9038 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -10,6 +10,7 @@ import json import traceback import warnings +import sys from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils @@ -120,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -132,7 +268,7 @@ def process_features(self, dataset): for proto in to_discard: dataset = dataset[dataset.proto != proto] - # For now, discard the ports + # For now, discard these to_drop = [ "appproto", "daddr", @@ -155,15 +291,25 @@ def process_features(self, dataset): except (ValueError, KeyError): pass + # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # So transform here + #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) + dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) + + #dataset.state = new_state_column + # Convert state to categorical dataset.state = dataset.state.str.replace( - r"(^.*NotEstablished.*$)", "0", regex=True + r"(^.*Not Established.*$)", "0", regex=True ) dataset.state = dataset.state.str.replace( r"(^.*Established.*$)", "1", regex=True ) - # Convert proto to categorical. For now we only have few states, - # so we can hardcode... + + # Convert categories to floats + dataset.state = dataset.state.astype("float64") + + # Convert proto to categorical. For now we only have few states, so we can hardcode... # We dont use the data to create categories because in testing mode # we dont see all the protocols # Also we dont store the Categorizer because the user can retrain From 82ff65455c8cea8514ef0285aaf98846a34eb8e8 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:35:21 +0300 Subject: [PATCH 117/455] state_handler: split get_final_state_from_flags() into smaller functions --- slips_files/common/state_handler.py | 179 ++++++++++++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 slips_files/common/state_handler.py diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py new file mode 100644 index 0000000000..b671a09a28 --- /dev/null +++ b/slips_files/common/state_handler.py @@ -0,0 +1,179 @@ +from typing import Optional +import sys +import traceback + + +def check_suricata_states(state) -> Optional[str]: + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for + UDP. For TCP, + these are: New, Established and Closed,for UDP only new and + established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + +def check_zeek_states(state) -> Optional[str]: + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + +def check_argus_states(state) -> Optional[str]: + pre = state.split("_")[0] + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + + +def check_tcp_states(state, pkts) -> Optional[str]: + pre = state.split("_")[0] + if "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. + # Most connections are reseted when finished and therefore are + # established + # It can happen that is reseted being not established, but we + # can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is + # not established because the OS retries 3 times. + return "Not Established" if int(pkts) <= 3 else "Established" + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. + # Most connections are finished with FIN when finished and + # therefore are established + # It can happen that is finished being not established, but we + # can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is + # not established because the OS retries 3 times. + return "Not Established" if int(pkts) <= 3 else "Established" + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + + +def check_udp_states(state) -> Optional[str]: + pre = state.split("_")[0] + if "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also + # NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + + +def check_icmp_states(state) -> Optional[str]: + pre = state.split("_")[0] + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + + +def get_final_state_from_flags(self, state, pkts) -> str: + """ + Analyze the flags given and return a summary of the state. + Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + if state := check_suricata_states(state): + return state + if state := check_zeek_states(state): + return state + if state := check_argus_states(state): + return state + except IndexError: + # suf does not exist, which means that this is some ICMP or + # no response was sent for UDP or TCP + if state := check_icmp_states(state): + return state + if state := check_udp_states(state): + return state + if state := check_tcp_states(state, pkts): + return state + + return "Not Established" + + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() " f"line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) From 3e4bf3fbb9df71feb63e125ddae50e54b6a375f1 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:48:16 +0300 Subject: [PATCH 118/455] state_handler: refactor get_final_state_from_flags() --- slips_files/common/state_handler.py | 67 +++++++++++++---------------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index b671a09a28..d0a05115bd 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,9 +1,7 @@ from typing import Optional -import sys -import traceback -def check_suricata_states(state) -> Optional[str]: +def interpret_suricata_states(state) -> Optional[str]: """ There are different states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]: return "Not Established" -def check_zeek_states(state) -> Optional[str]: +def interpret_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. # For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]: return "Established" -def check_argus_states(state) -> Optional[str]: +def interpret_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - suf = state.split("_")[1] + try: + suf = state.split("_")[1] + except IndexError: + return + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]: return "Not Established" -def check_tcp_states(state, pkts) -> Optional[str]: +def interpret_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def check_udp_states(state) -> Optional[str]: +def interpret_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]: return "Not Established" -def check_icmp_states(state) -> Optional[str]: +def interpret_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(self, state, pkts) -> str: +def get_final_state_from_flags(state, pkts) -> str: """ - Analyze the flags given and return a summary of the state. - Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections + Converts the original flags from the flow, to a state that slips + understands + Works with Argus, suricata, and Bro flags + We receive the packets to distinguish some Reset connections """ - try: - if state := check_suricata_states(state): - return state - if state := check_zeek_states(state): - return state - if state := check_argus_states(state): - return state - except IndexError: - # suf does not exist, which means that this is some ICMP or - # no response was sent for UDP or TCP - if state := check_icmp_states(state): - return state - if state := check_udp_states(state): - return state - if state := check_tcp_states(state, pkts): - return state - return "Not Established" + for interpreter in ( + interpret_suricata_states, + interpret_zeek_states, + interpret_argus_states, + interpret_icmp_states, + interpret_udp_states, + ): + if interpreted_state := interpreter(state): + return interpreted_state + + if interpreted_state := interpret_tcp_states(state, pkts): + return interpreted_state - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() " f"line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) + return "Not Established" From 45710b72db50551053c09ed71059fa5d1bfcf712 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 119/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 150 ++---------------- slips_files/core/database/database_manager.py | 3 - 2 files changed, 10 insertions(+), 143 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 16b67e9038..3379f5077f 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -291,12 +156,17 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others # So transform here - #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) - dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) - - #dataset.state = new_state_column + dataset["state"] = dataset.apply( + lambda row: get_final_state_from_flags( + row["state"], row["pkts"] + ), + axis=1, + ) + # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index e8ca3aaf62..b4b2128d3d 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -613,9 +613,6 @@ def add_out_dns(self, *args, **kwargs): def add_port(self, *args, **kwargs): return self.rdb.add_port(*args, **kwargs) - def get_final_state_from_flags(self, *args, **kwargs): - return self.rdb.get_final_state_from_flags(*args, **kwargs) - def add_ips(self, *args, **kwargs): return self.rdb.add_ips(*args, **kwargs) From 014ee473003b36f3680b7f40aa60dd9c7d4ae759 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 120/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 3379f5077f..f052931c89 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From d5f6330c3e6bdb0a8f81d0f1349f927bfda8636e Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 121/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index f052931c89..3379f5077f 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 2ad9ccb25fcd46b9da91c72f3400de5ae3ec364e Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 122/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 3379f5077f..f052931c89 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 88cb54b68e0da3088fc76cc8b351c5653aa93857 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 123/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index f052931c89..3379f5077f 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 13dfd28bb8915c4c61798efa0051daaa7ee9daa9 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 124/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 3379f5077f..f052931c89 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From d2a5935f05c2aee6c5cb4ae7285151258f836e13 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 125/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index f052931c89..3379f5077f 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 60f0b286cb0b1172b01416bd66a45a117fa55577 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 126/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 3379f5077f..f052931c89 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 6a05fa3efdb4254380541b7b7c32c4c02f829cf7 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 127/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index f052931c89..3379f5077f 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 77c9a10ce27447cf87d9d4720132554bc1cb9f5c Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 128/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 3379f5077f..f052931c89 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 6e4841ffcffad999d14dcbc1354dbccc8f2cc546 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 129/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index f052931c89..3379f5077f 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 0d8414e9937ce34aac0b6fbb7fe328c7d207ead6 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:23:29 +0100 Subject: [PATCH 130/455] Re add function that alya added --- slips_files/core/database/database_manager.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index b4b2128d3d..e8ca3aaf62 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -613,6 +613,9 @@ def add_out_dns(self, *args, **kwargs): def add_port(self, *args, **kwargs): return self.rdb.add_port(*args, **kwargs) + def get_final_state_from_flags(self, *args, **kwargs): + return self.rdb.get_final_state_from_flags(*args, **kwargs) + def add_ips(self, *args, **kwargs): return self.rdb.add_ips(*args, **kwargs) From 97f86a5709fad2144a97f643b8381cd48f86b148 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:25:03 +0100 Subject: [PATCH 131/455] delete sys --- modules/flowmldetection/flowmldetection.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 3379f5077f..c06755a599 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -10,7 +10,6 @@ import json import traceback import warnings -import sys from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils From c468366fca89d0fe5d6d00ec8c660f62ed616b46 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:27:23 +0100 Subject: [PATCH 132/455] Delete file that was deleted from develop --- slips_files/common/state_handler.py | 170 ---------------------------- 1 file changed, 170 deletions(-) delete mode 100644 slips_files/common/state_handler.py diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py deleted file mode 100644 index d0a05115bd..0000000000 --- a/slips_files/common/state_handler.py +++ /dev/null @@ -1,170 +0,0 @@ -from typing import Optional - - -def interpret_suricata_states(state) -> Optional[str]: - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for - UDP. For TCP, - these are: New, Established and Closed,for UDP only new and - established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - -def interpret_zeek_states(state) -> Optional[str]: - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - -def interpret_argus_states(state) -> Optional[str]: - pre = state.split("_")[0] - try: - suf = state.split("_")[1] - except IndexError: - return - - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - - -def interpret_tcp_states(state, pkts) -> Optional[str]: - pre = state.split("_")[0] - if "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. - # Most connections are reseted when finished and therefore are - # established - # It can happen that is reseted being not established, but we - # can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is - # not established because the OS retries 3 times. - return "Not Established" if int(pkts) <= 3 else "Established" - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. - # Most connections are finished with FIN when finished and - # therefore are established - # It can happen that is finished being not established, but we - # can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is - # not established because the OS retries 3 times. - return "Not Established" if int(pkts) <= 3 else "Established" - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - - -def interpret_udp_states(state) -> Optional[str]: - pre = state.split("_")[0] - if "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also - # NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - - -def interpret_icmp_states(state) -> Optional[str]: - pre = state.split("_")[0] - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - - -def get_final_state_from_flags(state, pkts) -> str: - """ - Converts the original flags from the flow, to a state that slips - understands - Works with Argus, suricata, and Bro flags - We receive the packets to distinguish some Reset connections - """ - - for interpreter in ( - interpret_suricata_states, - interpret_zeek_states, - interpret_argus_states, - interpret_icmp_states, - interpret_udp_states, - ): - if interpreted_state := interpreter(state): - return interpreted_state - - if interpreted_state := interpret_tcp_states(state, pkts): - return interpreted_state - - return "Not Established" From 952c3b29c7c6637bd78b66c1c2fc9a333f72a5d0 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:32:01 +0100 Subject: [PATCH 133/455] Flowmldetection. Fix missing db reference --- modules/flowmldetection/flowmldetection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index c06755a599..87e07c7592 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -160,7 +160,7 @@ def process_features(self, dataset): # 'Not Established', it is still 'S0' and others # So transform here dataset["state"] = dataset.apply( - lambda row: get_final_state_from_flags( + lambda row: self.db.get_final_state_from_flags( row["state"], row["pkts"] ), axis=1, From e7af6dc61e06a5759e5c5ddfcfe3ffadcdf67fb6 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Tue, 18 Mar 2025 12:08:08 +0100 Subject: [PATCH 134/455] Fix the training of flows with ML in new version --- modules/flowmldetection/flowmldetection.py | 144 +++++++++++---------- 1 file changed, 77 insertions(+), 67 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 87e07c7592..e91495d649 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -55,8 +55,12 @@ def init(self): # Set the output queue of our database instance # Read the configuration self.read_configuration() - # Minum amount of new lables needed to trigger the train - self.minimum_lables_to_retrain = 50 + # Minum amount of new labels needed to start the train + self.minimum_labels_to_start_train = 50 + # Minum amount of new labels needed to retrain + self.minimum_labels_to_retrain = 50 + # The number of flows when last trained + self.last_number_of_flows_when_trained = 0 # To plot the scores of training # self.scores = [] # The scaler trained during training and to use during testing @@ -67,26 +71,25 @@ def init(self): def read_configuration(self): conf = ConfigParser() self.mode = conf.get_ml_mode() + self.label = conf.label() def train(self): """ Train a model based on the flows we receive and the labels """ try: - # Process the labels to have only Normal and Malware - self.flows.label = self.flows.label.str.replace( - r"(^.*ormal.*$)", "Normal", regex=True - ) - self.flows.label = self.flows.label.str.replace( - r"(^.*alware.*$)", "Malware", regex=True - ) - self.flows.label = self.flows.label.str.replace( - r"(^.*alicious.*$)", "Malware", regex=True - ) + # Get the flows from the DB + # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid) + # Convert to pandas df + # self.flows = pd.DataFrame(self.flows) + # Process the features + # X_flow = self.process_features(self.flows) - # Separate - y_flow = self.flows["label"] + # Create X_flow with the current flows minus the label X_flow = self.flows.drop("label", axis=1) + # Create y_flow with the label + y_flow = numpy.full(X_flow.shape[0], self.label) + # Drop the module_labels X_flow = X_flow.drop("module_labels", axis=1) # Normalize this batch of data so far. This can get progressivle slow @@ -95,7 +98,7 @@ def train(self): # Train try: self.clf.partial_fit( - X_flow, y_flow, classes=["Malware", "Normal"] + X_flow, y_flow, classes=["Malicious", "Benign"] ) except Exception: self.print("Error while calling clf.train()") @@ -118,7 +121,7 @@ def train(self): self.store_model() except Exception: - self.print("Error in train()", 0, 1) + self.print("Error in train().", 0, 1) self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): @@ -144,9 +147,7 @@ def process_features(self, dataset): "history", "uid", "dir_", - "dbytes", "endtime", - "bytes", "flow_source", ] for field in to_drop: @@ -161,11 +162,10 @@ def process_features(self, dataset): # So transform here dataset["state"] = dataset.apply( lambda row: self.db.get_final_state_from_flags( - row["state"], row["pkts"] + row["state"], (row["spkts"] + row["dpkts"]) ), axis=1, ) - # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -199,7 +199,11 @@ def process_features(self, dataset): dataset.proto = dataset.proto.str.replace( r"(^.*arp.*$)", "4", regex=True ) - fields_to_convert_to_flow = [ + + dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"] + dataset["pkts"] = dataset["spkts"] + dataset["dpkts"] + + fields_to_convert_to_float = [ dataset.proto, dataset.dport, dataset.sport, @@ -210,10 +214,10 @@ def process_features(self, dataset): dataset.sbytes, dataset.state, ] - for field in fields_to_convert_to_flow: + for field in fields_to_convert_to_float: try: field = field.astype("float64") - except ValueError: + except (ValueError, AttributeError): pass return dataset @@ -222,9 +226,9 @@ def process_features(self, dataset): self.print("Error in process_features()") self.print(traceback.format_exc(), 0, 1) - def process_flows(self): + def process_training_flows(self): """ - Process all the flwos in the DB + Process all the flows in the DB Store the pandas df in self.flows """ try: @@ -240,44 +244,48 @@ def process_flows(self): # that are fake but representative of a normal and malware flow # they are only for the training process # At least 1 flow of each label is required - # self.print(f'Amount of labeled flows: {labels}', 0, 1) + + # These flows should be in the same format as the ones in the DB. + # Which means the satate is still SF, S0, etc. flows.append( { - "ts": 1594417039.029793, + "starttime": 1594417039.029793, "dur": "1.9424750804901123", "saddr": "10.7.10.101", "sport": "49733", "daddr": "40.70.224.145", "dport": "443", "proto": "tcp", - "state": "Established", - "allbytes": 42764, - "spkts": 37, + "state": "SF", + "spkts": 17, + "dpkts": 27, "sbytes": 25517, + "dbytes": 17247, "appproto": "ssl", - "label": "Malware", + "label": "Malicious", "module_labels": { - "flowalerts-long-connection": "Malware" + "flowalerts-long-connection": "Malicious" }, } ) flows.append( { - "ts": 1382355032.706468, + "starttime": 1382355032.706468, "dur": "10.896695", "saddr": "147.32.83.52", "sport": "47956", "daddr": "80.242.138.72", "dport": "80", "proto": "tcp", - "state": "Established", - "allbytes": 67696, + "state": "SF", "spkts": 1, + "dpkts": 0, "sbytes": 100, + "dbytes": 67596, "appproto": "http", - "label": "Normal", + "label": "Benign", "module_labels": { - "flowalerts-long-connection": "Normal" + "flowalerts-long-connection": "Benign" }, } ) @@ -318,7 +326,6 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: and returns the predection array """ try: - given_x_flow = x_flow # clean the flow fields_to_drop = [ "label", @@ -326,10 +333,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "uid", "history", "dir_", - "dbytes", - "dpkts", "endtime", - "bytes", "flow_source", "ground_truth_label", # todo now we can use them "detailed_ground_truth_label", @@ -345,7 +349,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: return pred except Exception as e: self.print( - f"Error in detect() while processing " f"\n{given_x_flow}\n{e}" + f"Error in detect() while processing " f"\n{x_flow}\n{e}" ) self.print(traceback.format_exc(), 0, 1) @@ -437,18 +441,16 @@ def pre_main(self): def main(self): if msg := self.get_msg("new_flow"): + # When a new flow arrives msg = json.loads(msg["data"]) - twid = msg["twid"] + self.twid = msg["twid"] + self.profileid = msg["profileid"] self.flow = msg["flow"] - # these fields are expected in testing. update the original + # These following extra fields are expected in testing. update the original # flow dict to have them self.flow.update( { - "allbytes": (self.flow["sbytes"] + self.flow["dbytes"]), - # the flow["state"] is the origstate, we dont need that here - # we need the interpreted state "state": msg["interpreted_state"], - "pkts": self.flow["spkts"] + self.flow["dpkts"], "label": msg["label"], "module_labels": msg["module_labels"], } @@ -461,23 +463,31 @@ def main(self): # Use labeled flows labels = self.db.get_labels() sum_labeled_flows = sum(i[1] for i in labels) + + # The min labels to retrain is the min number of flows + # we should have seen so far in this capture to start training + # This is so we dont _start_ training with only 1 flow + + # Once we are over the start minimum, the second condition is + # to force to retrain every a minimum_labels_to_retrain number + # of flows. So we dont retrain every 1 flow. if ( - sum_labeled_flows >= self.minimum_lables_to_retrain - and sum_labeled_flows % self.minimum_lables_to_retrain == 1 + sum_labeled_flows >= self.minimum_labels_to_start_train ): - # We get here every 'self.minimum_lables_to_retrain' - # amount of labels - # So for example we retrain every 100 labels and only when - # we have at least 100 labels - self.print( - f"Training the model with the last group of " - f"flows and labels. Total flows: {sum_labeled_flows}." - ) - # Process all flows in the DB and make them ready - # for pandas - self.process_flows() - # Train an algorithm - self.train() + if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain): + # So for example we retrain every 50 labels and only when + # we have at least 50 labels + self.print( + f"Training the model with the last group of " + f"flows and labels. Total flows: {sum_labeled_flows}." + ) + # Process all flows in the DB and make them ready + # for pandas + self.process_training_flows() + # Train an algorithm + self.train() + self.last_number_of_flows_when_trained = sum_labeled_flows + elif self.mode == "test": # We are testing, which means using the model to detect processed_flow = self.process_flow(self.flow) @@ -497,8 +507,8 @@ def main(self): # and the label is diff from the prediction, # print in debug mode self.print( - f"Report Prediction {pred[0]} for label" - f' {label} flow {self.flow["saddr"]}:' + f"Predicted {pred[0]} for ground-truth label" + f' {label}. Flow {self.flow["saddr"]}:' f'{self.flow["sport"]} ->' f' {self.flow["daddr"]}:' f'{self.flow["dport"]}/' @@ -506,9 +516,9 @@ def main(self): 0, 3, ) - if pred[0] == "Malware": + if pred[0] == "Malicious": # Generate an alert - self.set_evidence_malicious_flow(self.flow, twid) + self.set_evidence_malicious_flow(self.flow, self.twid) self.print( f"Prediction {pred[0]} for label {label}" f' flow {self.flow["saddr"]}:' From 1dbde99abda8734e06222b1149806e1b626d2602 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Tue, 18 Mar 2025 12:08:29 +0100 Subject: [PATCH 135/455] Fix the profiler handler for cases of nan in state --- .../core/database/redis_db/profile_handler.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/slips_files/core/database/redis_db/profile_handler.py b/slips_files/core/database/redis_db/profile_handler.py index 0489372cdc..1ea7644648 100644 --- a/slips_files/core/database/redis_db/profile_handler.py +++ b/slips_files/core/database/redis_db/profile_handler.py @@ -379,7 +379,12 @@ def get_final_state_from_flags(self, state, pkts): We receive the pakets to distinguish some Reset connections """ try: - pre = state.split("_")[0] + # In some flows the state is a nan + try: + pre = state.split("_")[0] + except AttributeError: + pre = '' + try: # Try suricata states """ @@ -401,7 +406,11 @@ def get_final_state_from_flags(self, state, pkts): return "Established" # For Argus - suf = state.split("_")[1] + # In some flows the state is a nan + try: + suf = state.split("_")[1] + except AttributeError: + suf = '' if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -502,7 +511,7 @@ def get_final_state_from_flags(self, state, pkts): except Exception: exception_line = sys.exc_info()[2].tb_lineno self.print( - f"Error in getFinalStateFromFlags() in database.py line {exception_line}", + f"Error in get_final_state_from_flags() in profile_handler.py line {exception_line}", 0, 1, ) From 2c3a9eb2363c14e89d08bda2d8f7698c41f148a3 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:20:42 +0100 Subject: [PATCH 136/455] slips.yaml. Update to have correct labels. By default test. Defaul training lbel is benign --- config/slips.yaml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/config/slips.yaml b/config/slips.yaml index f7089b41af..8736eaf511 100644 --- a/config/slips.yaml +++ b/config/slips.yaml @@ -106,13 +106,12 @@ parameters: deletePrevdb: true # Set the label for all the flows that are being read. - # For now only normal and malware directly. No option for setting labels - # with a filter + # For now only Benign and Malicious (Capitalized) # The purpose is to be used in the training of ML models and to output # flows with labels for other tools. - # label: malicious - # label: unknown - label: normal + # label: Malicious + # label: Benign + label: Benign # If Zeek files are rotated or not to avoid running out of disk. # Zeek rotation is enabled by default when using an interface, # which means Slips will delete all Zeek log files after 1 day From 38bdc30b059f670dde6817a575504f7f308f9ad0 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:21:21 +0100 Subject: [PATCH 137/455] First ipython to tst ML flow related models --- modules/flowmldetection/flowmlanalysis.ipynb | 76 ++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 modules/flowmldetection/flowmlanalysis.ipynb diff --git a/modules/flowmldetection/flowmlanalysis.ipynb b/modules/flowmldetection/flowmlanalysis.ipynb new file mode 100644 index 0000000000..d726cd2805 --- /dev/null +++ b/modules/flowmldetection/flowmlanalysis.ipynb @@ -0,0 +1,76 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Analysis of Flows with Machine Learning for Slips" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Analysis of a fixed list of flows to try techniques and find parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy\n", + "from sklearn.linear_model import SGDClassifier\n", + "from sklearn.preprocessing import StandardScaler\n", + "import pickle\n", + "import pandas as pd\n", + "import json\n", + "import traceback\n", + "import warnings" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "slips-new", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 8b2e850f150389ad00d4c10d65abd7c94f5b58fb Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:22:38 +0100 Subject: [PATCH 138/455] flowml. If the dataset has one flow and that is deleted, then return empty fast. --- modules/flowmldetection/flowmldetection.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index e91495d649..58b4ce1e4c 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -135,6 +135,11 @@ def process_features(self, dataset): for proto in to_discard: dataset = dataset[dataset.proto != proto] + # If te proto is in the list to delete and there is only one flow, then the dataset will be empty + if dataset.empty: + # DataFrame is empty now, so return empty + return dataset + # For now, discard these to_drop = [ "appproto", From b179fac4ade82c0d1716ad13428a606e25f4fae9 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:23:05 +0100 Subject: [PATCH 139/455] flowml. If the datasert is empty. Return none --- modules/flowmldetection/flowmldetection.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 58b4ce1e4c..4a4d46e376 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -318,6 +318,8 @@ def process_flow(self, flow_to_process: dict): # Convert the flow to a pandas dataframe raw_flow = pd.DataFrame(flow_to_process, index=[0]) dflow = self.process_features(raw_flow) + if dflow.empty: + return None # Update the flow to the processed version return dflow except Exception: From dd98ff1307fc64e517d3eff4a80301e6be8dd1e3 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:26:42 +0100 Subject: [PATCH 140/455] profile_handler. Small bug in how we handled the profiles, we were using 'in' instead of == for established. Some not established MAY not have been correctly captured --- slips_files/core/database/redis_db/profile_handler.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/slips_files/core/database/redis_db/profile_handler.py b/slips_files/core/database/redis_db/profile_handler.py index 1ea7644648..85fdec5a63 100644 --- a/slips_files/core/database/redis_db/profile_handler.py +++ b/slips_files/core/database/redis_db/profile_handler.py @@ -393,9 +393,10 @@ def get_final_state_from_flags(self, state, pkts): these are: New, Established and Closed,for UDP only new and established. For each of these states Suricata can employ different timeouts. """ - if "new" in state or "established" in state: + # This is controversial, but if we dont have a good state, we consider it not established for now + if "new" in state or state.lower() == "established": return "Established" - elif "closed" in state: + elif "closed" in state or state.lower() == 'not established': return "Not Established" # We have varius type of states depending on the type of flow. @@ -406,7 +407,6 @@ def get_final_state_from_flags(self, state, pkts): return "Established" # For Argus - # In some flows the state is a nan try: suf = state.split("_")[1] except AttributeError: From 4de77d6fcb7a8acf5e2a1510950e28a285084344 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:27:16 +0100 Subject: [PATCH 141/455] First new version of the model and scaler. Not good yet, but working. --- modules/flowmldetection/model.bin | Bin 1073 -> 1090 bytes modules/flowmldetection/scaler.bin | Bin 666 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index aef4cba35b7b18287b2be11df2c45e9187d053e0..7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f 100644 GIT binary patch delta 411 zcmdnUafpMZfn}=sMwVTSLVS7gX{m`NrA4X5@tJw?Q@pt+?`4dsSMFg=DXA<-oice! z4^yb+l+KR!DM3>-yqS9#ZKnA7`FX2&vv_lqOiAi=_HbfgU`PexJRr^l;&dn*NWpQXRJ|Emr(~!@tY-9P0&;YE zn9?i|&dD&Ce2~eg9z!XkH+S2V4C_wkh?WyPJF_Dc9WKfLlU4d@b|Cws{>t}{)DLw1 z+&}AzmCAv&TP(zH?>}O{hvWaISvjr_FKvFL2{5=iNW7SLz3Rc`{Vb{$-4P-p4qe>8 zsxB_naJaLsc#$rfl>^wdT$y<#sYS_psRbqRQ!+4J0dz3X4IB|u^h2FOBOrSC<0o%q b_GNYGp25H{nVChIi`7uiOwVviX_6iQocV_5 delta 380 zcmX@av5|wNfn}=NMwVTSlP@wxCo1)@rj%3`q)wSUrH3ihd`f3W`;?$58s5x3j5bsJ z{QSISyji?CN~R=rI$QWMFfb$naSjldL)k?@Iu(fhfH)tBLHt}GhN)wiJeNsPO)*2o zo1t||h8o0hMsFq{N4tk9%>rRzhW_LyOh)w>N*TR5+ps7Gy3Ol95P%792Cxe~W+qIF za^V+n06G8#1cs%Z;j-Fjug&$3=gY2%`}>OzXw^50I4Ib2ygQ=8?eISD+{&-(1rNNN zW*@ diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin index 9292bda6a6f307edcbe83de0ccfac2437950fa6f..bfba4d107224e5e6e5a1e8c8f4d463b48131d111 100644 GIT binary patch literal 890 zcmZo*nVQec00uqM#o0NjiA8yO1x2X^Mfu68#l@L<>3Z=gi6x0sdia7%67y0Li&BD< z6LV6Frc9oq**K+kibfA-d1gsQd~RxD-jt3W4lt*|B)5DrlQdy7+(!%6#F{QHuBFo&v zSTn`X&(G^W5P%79`;sY1ov}cNGpCdm0WAP}wIm-%7Z>Cgl>q4!C|z8TT~Z9@gK4(n zq{@<1pa5G6l)+kDl30>DrH3OiCkHCT0@jlXw4sMDFCOY|h?l*&d-(I>ixYDTa)5zQ zoSF&@h78Ugwqjs#CW3-9L$rq}6UF5=Xf9^}n(NGw#R!S7DQ!Ui_ppKj9%vGG1|KL~ zG6axA1!!U#l8G7ms38F|uhTi=d{gv8|4E7tk+E%iFU@~!UvXb5a&PSo2flmT-L8E; z8bB?zwe6@e~Ltx1rmTHK2M45x=S>Y_k5fcAZRq4P``?^7;27=%1`Z_~Q$U{zGVUTcbp z!_f@A=ox0(4vo(kWe&dF=y3Qof57>#YaOosUa>d6=$d`i{)GMC=j?EBVaVAqgWVBg dy+AxPN5z+=78Pga=K-V3P|r-ya7t;C9st`jc!2-_ delta 525 zcmeyxHj9S2q>4Eng}#5gR6%pz9=<0Kd-o?s5H3*s5pan;tH91&ekaz{5?!*7E?Mq z+JPE0yqPnE{rvpA{sRG+@Mb8PlGN$!ajPlrTH5mhSAF{H=RwvmLf> z0f8s>T))|-e^~LtKJ)mxuX}VF9GJI$RyS%Ec8u45-?98hw!_K^*XJ>X{BW28G_HrG zEU^gaDyjMm8E*!l6XYOvGkSyEqXM*T;RVCs$??LDiHthAxexX@M(tU$UF%YfQ^>gH&y>a4SnPazubB5!cqc1YnCGT))m9-o7^A!Lv41W!_hYO<*VziI$XJV#W|*OtHau9f5TQzy=Sjs!`l%1;+FkJ z-k|Geeq3|-Iw{uqQ%0WS6}i^>);WhA{C@WaM)rg_g6$KCFV4d&dkpP QMw@}2k%^wsl+q+U0P2|NZU6uP From 03e408119146041e40ad6c29370f1694eb1e40a8 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Thu, 20 Mar 2025 13:16:06 +0100 Subject: [PATCH 142/455] model and scaler with 1 malicious and 1 benign --- modules/flowmldetection/model.bin | Bin 1090 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f..0fac693b39f8e2f0e826471e72a52010709a2a4a 100644 GIT binary patch delta 132 zcmX@a@q~k=fn{psMix!x$(NZ_BO`RCb>5jSYK$N!g2|oC+8oRs(l>gtTsw(FScJ$7xocP4|5 zn$$WZz^$}67u&zYKfDP4vYKo~KfmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r!tJEWTtnm`Ac zf^|*Gc0elJHxDoXoPgl=^Tj5WVAwXGOhP~3C<3D)c8d<%4*gy{6w?@|U z%s@r+gj&Fu5J2_kY{!Qn^*`?T5Tx7$dq73=gj&E@c0d))6KVlt!ao`IN`KC-h(JOu zU`*oR6_Z;6CRfo>BRz3ectB?@J_D&+&OhD+nN>A{w?PryCGc Date: Thu, 20 Mar 2025 13:16:27 +0100 Subject: [PATCH 143/455] cleaner jupyter --- modules/flowmldetection/flowmldetection.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 4a4d46e376..d8e9ada27c 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -345,6 +345,23 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "ground_truth_label", # todo now we can use them "detailed_ground_truth_label", ] + # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. + # Error + ''' [Flow ML Detection] Error in detect() while processing + dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes + 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 + The feature names should match those that were passed during fit. + Feature names unseen at fit time: + - bytes + ''' + + # IF we delete here the filed bytes the error is + # [Flow ML Detection] Error in detect() while processing + # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes + # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 + # The feature names should match those that were passed during fit. + # Feature names must be in the same order as they were in fit. + for field in fields_to_drop: try: x_flow = x_flow.drop(field, axis=1) From 17ccb096b61ea71a780e182a2bb0626985e4c755 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Thu, 20 Mar 2025 22:26:27 +0100 Subject: [PATCH 144/455] New models after 3rd ttrain --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 0fac693b39f8e2f0e826471e72a52010709a2a4a..5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8 100644 GIT binary patch delta 99 zcmaFD@q}YTFtfkevYaFSxkd-_3;rCozh!)2lYmpEbEdGvA?N`Qv**dmWkY#6EnbNU>V$pfHc5?(6z~90v14#>uEKZXcoY2J@~Q; zm@vT916v-#h9y9THMMtpw*Eia3Y9($*ABq#7l@Rev)(_(4lO-NtuB+I1CayN#ekDG F1TN2UERg^J diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin index 758909b289238ff282b2e056a9b3e83768b8472a..821344a0c69d116622b02e2a0daa1554cb5d308e 100644 GIT binary patch delta 43 zcmV+`0M!5b2KolDfdU!c4zU>a=p#W80?-ZtN@qVmt(3Za=p#W80?-ZtN@qVmt(3Z Date: Wed, 26 Mar 2025 00:08:50 +0100 Subject: [PATCH 145/455] Models after 4th train --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8..3ab5a240bb45f88d026d1d9d1959cfa384e2473b 100644 GIT binary patch delta 120 zcmV-;0EhqN2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv* zMA;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P as{<PghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9 ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}DxjML6cVlBO2|D9RL6T delta 290 zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRg4Lu^9H~BS8=X&<+7gXFor!l)9lJXF)RTPqWf%b3r!tJEWTtnm`Ac zf^|*Gc0elJHxDoXoty4FCWD From b58ca823684868b27c4357999e55d97d2b75ad4a Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 26 Mar 2025 08:28:59 +0100 Subject: [PATCH 146/455] Models of ml flow with the first good performance in small tests --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 3ab5a240bb45f88d026d1d9d1959cfa384e2473b..a6648cf72179520975b0e9ad1164f7d574e87140 100644 GIT binary patch delta 121 zcmV-<0EYkM2;>N`Qv+C~&P*9hb`!w*mV~|wLl?l5mFM4w$NRqlOAcXmpyfaAG1(); zYA3+ulRv$a;~zkIU#E>ocI-ba#L|>%Hv~ZN`Y?b6^limYW1McwvlQsk{8#y@u delta 121 zcmV-<0EYkM2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv* zMA;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P bs{<fmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r-wJEWTtnm`Ac zf^|*Gc0elJHxDoXoPghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9 ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}DxjML6cVlBO2|D9RL6T From cdfd04f0667647f4d3d4a47bb56d7f6d7edc00d6 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 147/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 317 +++++++++++++-------- 1 file changed, 206 insertions(+), 111 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index d8e9ada27c..8917fef6a5 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -10,6 +10,7 @@ import json import traceback import warnings +import sys from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils @@ -55,12 +56,8 @@ def init(self): # Set the output queue of our database instance # Read the configuration self.read_configuration() - # Minum amount of new labels needed to start the train - self.minimum_labels_to_start_train = 50 - # Minum amount of new labels needed to retrain - self.minimum_labels_to_retrain = 50 - # The number of flows when last trained - self.last_number_of_flows_when_trained = 0 + # Minum amount of new lables needed to trigger the train + self.minimum_lables_to_retrain = 50 # To plot the scores of training # self.scores = [] # The scaler trained during training and to use during testing @@ -71,25 +68,26 @@ def init(self): def read_configuration(self): conf = ConfigParser() self.mode = conf.get_ml_mode() - self.label = conf.label() def train(self): """ Train a model based on the flows we receive and the labels """ try: - # Get the flows from the DB - # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid) - # Convert to pandas df - # self.flows = pd.DataFrame(self.flows) - # Process the features - # X_flow = self.process_features(self.flows) + # Process the labels to have only Normal and Malware + self.flows.label = self.flows.label.str.replace( + r"(^.*ormal.*$)", "Normal", regex=True + ) + self.flows.label = self.flows.label.str.replace( + r"(^.*alware.*$)", "Malware", regex=True + ) + self.flows.label = self.flows.label.str.replace( + r"(^.*alicious.*$)", "Malware", regex=True + ) - # Create X_flow with the current flows minus the label + # Separate + y_flow = self.flows["label"] X_flow = self.flows.drop("label", axis=1) - # Create y_flow with the label - y_flow = numpy.full(X_flow.shape[0], self.label) - # Drop the module_labels X_flow = X_flow.drop("module_labels", axis=1) # Normalize this batch of data so far. This can get progressivle slow @@ -98,7 +96,7 @@ def train(self): # Train try: self.clf.partial_fit( - X_flow, y_flow, classes=["Malicious", "Benign"] + X_flow, y_flow, classes=["Malware", "Normal"] ) except Exception: self.print("Error while calling clf.train()") @@ -121,7 +119,142 @@ def train(self): self.store_model() except Exception: - self.print("Error in train().", 0, 1) + self.print("Error in train()", 0, 1) + self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): @@ -135,11 +268,6 @@ def process_features(self, dataset): for proto in to_discard: dataset = dataset[dataset.proto != proto] - # If te proto is in the list to delete and there is only one flow, then the dataset will be empty - if dataset.empty: - # DataFrame is empty now, so return empty - return dataset - # For now, discard these to_drop = [ "appproto", @@ -152,7 +280,9 @@ def process_features(self, dataset): "history", "uid", "dir_", + "dbytes", "endtime", + "bytes", "flow_source", ] for field in to_drop: @@ -161,16 +291,12 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, - # the state is not transformed to 'Established' or - # 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others # So transform here - dataset["state"] = dataset.apply( - lambda row: self.db.get_final_state_from_flags( - row["state"], (row["spkts"] + row["dpkts"]) - ), - axis=1, - ) + #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) + dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) + + #dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -204,11 +330,7 @@ def process_features(self, dataset): dataset.proto = dataset.proto.str.replace( r"(^.*arp.*$)", "4", regex=True ) - - dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"] - dataset["pkts"] = dataset["spkts"] + dataset["dpkts"] - - fields_to_convert_to_float = [ + fields_to_convert_to_flow = [ dataset.proto, dataset.dport, dataset.sport, @@ -219,10 +341,10 @@ def process_features(self, dataset): dataset.sbytes, dataset.state, ] - for field in fields_to_convert_to_float: + for field in fields_to_convert_to_flow: try: field = field.astype("float64") - except (ValueError, AttributeError): + except ValueError: pass return dataset @@ -231,9 +353,9 @@ def process_features(self, dataset): self.print("Error in process_features()") self.print(traceback.format_exc(), 0, 1) - def process_training_flows(self): + def process_flows(self): """ - Process all the flows in the DB + Process all the flwos in the DB Store the pandas df in self.flows """ try: @@ -249,48 +371,44 @@ def process_training_flows(self): # that are fake but representative of a normal and malware flow # they are only for the training process # At least 1 flow of each label is required - - # These flows should be in the same format as the ones in the DB. - # Which means the satate is still SF, S0, etc. + # self.print(f'Amount of labeled flows: {labels}', 0, 1) flows.append( { - "starttime": 1594417039.029793, + "ts": 1594417039.029793, "dur": "1.9424750804901123", "saddr": "10.7.10.101", "sport": "49733", "daddr": "40.70.224.145", "dport": "443", "proto": "tcp", - "state": "SF", - "spkts": 17, - "dpkts": 27, + "state": "Established", + "allbytes": 42764, + "spkts": 37, "sbytes": 25517, - "dbytes": 17247, "appproto": "ssl", - "label": "Malicious", + "label": "Malware", "module_labels": { - "flowalerts-long-connection": "Malicious" + "flowalerts-long-connection": "Malware" }, } ) flows.append( { - "starttime": 1382355032.706468, + "ts": 1382355032.706468, "dur": "10.896695", "saddr": "147.32.83.52", "sport": "47956", "daddr": "80.242.138.72", "dport": "80", "proto": "tcp", - "state": "SF", + "state": "Established", + "allbytes": 67696, "spkts": 1, - "dpkts": 0, "sbytes": 100, - "dbytes": 67596, "appproto": "http", - "label": "Benign", + "label": "Normal", "module_labels": { - "flowalerts-long-connection": "Benign" + "flowalerts-long-connection": "Normal" }, } ) @@ -318,8 +436,6 @@ def process_flow(self, flow_to_process: dict): # Convert the flow to a pandas dataframe raw_flow = pd.DataFrame(flow_to_process, index=[0]) dflow = self.process_features(raw_flow) - if dflow.empty: - return None # Update the flow to the processed version return dflow except Exception: @@ -333,6 +449,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: and returns the predection array """ try: + given_x_flow = x_flow # clean the flow fields_to_drop = [ "label", @@ -340,28 +457,12 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "uid", "history", "dir_", + "dbytes", + "dpkts", "endtime", + "bytes", "flow_source", - "ground_truth_label", # todo now we can use them - "detailed_ground_truth_label", ] - # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. - # Error - ''' [Flow ML Detection] Error in detect() while processing - dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes - 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 - The feature names should match those that were passed during fit. - Feature names unseen at fit time: - - bytes - ''' - - # IF we delete here the filed bytes the error is - # [Flow ML Detection] Error in detect() while processing - # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes - # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 - # The feature names should match those that were passed during fit. - # Feature names must be in the same order as they were in fit. - for field in fields_to_drop: try: x_flow = x_flow.drop(field, axis=1) @@ -373,7 +474,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: return pred except Exception as e: self.print( - f"Error in detect() while processing " f"\n{x_flow}\n{e}" + f"Error in detect() while processing " f"\n{given_x_flow}\n{e}" ) self.print(traceback.format_exc(), 0, 1) @@ -465,16 +566,18 @@ def pre_main(self): def main(self): if msg := self.get_msg("new_flow"): - # When a new flow arrives msg = json.loads(msg["data"]) - self.twid = msg["twid"] - self.profileid = msg["profileid"] + twid = msg["twid"] self.flow = msg["flow"] - # These following extra fields are expected in testing. update the original + # these fields are expected in testing. update the original # flow dict to have them self.flow.update( { + "allbytes": (self.flow["sbytes"] + self.flow["dbytes"]), + # the flow["state"] is the origstate, we dont need that here + # we need the interpreted state "state": msg["interpreted_state"], + "pkts": self.flow["spkts"] + self.flow["dpkts"], "label": msg["label"], "module_labels": msg["module_labels"], } @@ -487,31 +590,23 @@ def main(self): # Use labeled flows labels = self.db.get_labels() sum_labeled_flows = sum(i[1] for i in labels) - - # The min labels to retrain is the min number of flows - # we should have seen so far in this capture to start training - # This is so we dont _start_ training with only 1 flow - - # Once we are over the start minimum, the second condition is - # to force to retrain every a minimum_labels_to_retrain number - # of flows. So we dont retrain every 1 flow. if ( - sum_labeled_flows >= self.minimum_labels_to_start_train + sum_labeled_flows >= self.minimum_lables_to_retrain + and sum_labeled_flows % self.minimum_lables_to_retrain == 1 ): - if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain): - # So for example we retrain every 50 labels and only when - # we have at least 50 labels - self.print( - f"Training the model with the last group of " - f"flows and labels. Total flows: {sum_labeled_flows}." - ) - # Process all flows in the DB and make them ready - # for pandas - self.process_training_flows() - # Train an algorithm - self.train() - self.last_number_of_flows_when_trained = sum_labeled_flows - + # We get here every 'self.minimum_lables_to_retrain' + # amount of labels + # So for example we retrain every 100 labels and only when + # we have at least 100 labels + self.print( + f"Training the model with the last group of " + f"flows and labels. Total flows: {sum_labeled_flows}." + ) + # Process all flows in the DB and make them ready + # for pandas + self.process_flows() + # Train an algorithm + self.train() elif self.mode == "test": # We are testing, which means using the model to detect processed_flow = self.process_flow(self.flow) @@ -531,8 +626,8 @@ def main(self): # and the label is diff from the prediction, # print in debug mode self.print( - f"Predicted {pred[0]} for ground-truth label" - f' {label}. Flow {self.flow["saddr"]}:' + f"Report Prediction {pred[0]} for label" + f' {label} flow {self.flow["saddr"]}:' f'{self.flow["sport"]} ->' f' {self.flow["daddr"]}:' f'{self.flow["dport"]}/' @@ -540,9 +635,9 @@ def main(self): 0, 3, ) - if pred[0] == "Malicious": + if pred[0] == "Malware": # Generate an alert - self.set_evidence_malicious_flow(self.flow, self.twid) + self.set_evidence_malicious_flow(self.flow, twid) self.print( f"Prediction {pred[0]} for label {label}" f' flow {self.flow["saddr"]}:' From 6f548d14a61187b042083d8233a9d68f4dc9e525 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:35:21 +0300 Subject: [PATCH 148/455] state_handler: split get_final_state_from_flags() into smaller functions --- slips_files/common/state_handler.py | 179 ++++++++++++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 slips_files/common/state_handler.py diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py new file mode 100644 index 0000000000..b671a09a28 --- /dev/null +++ b/slips_files/common/state_handler.py @@ -0,0 +1,179 @@ +from typing import Optional +import sys +import traceback + + +def check_suricata_states(state) -> Optional[str]: + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for + UDP. For TCP, + these are: New, Established and Closed,for UDP only new and + established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + +def check_zeek_states(state) -> Optional[str]: + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + +def check_argus_states(state) -> Optional[str]: + pre = state.split("_")[0] + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + + +def check_tcp_states(state, pkts) -> Optional[str]: + pre = state.split("_")[0] + if "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. + # Most connections are reseted when finished and therefore are + # established + # It can happen that is reseted being not established, but we + # can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is + # not established because the OS retries 3 times. + return "Not Established" if int(pkts) <= 3 else "Established" + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. + # Most connections are finished with FIN when finished and + # therefore are established + # It can happen that is finished being not established, but we + # can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is + # not established because the OS retries 3 times. + return "Not Established" if int(pkts) <= 3 else "Established" + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + + +def check_udp_states(state) -> Optional[str]: + pre = state.split("_")[0] + if "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also + # NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + + +def check_icmp_states(state) -> Optional[str]: + pre = state.split("_")[0] + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + + +def get_final_state_from_flags(self, state, pkts) -> str: + """ + Analyze the flags given and return a summary of the state. + Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + if state := check_suricata_states(state): + return state + if state := check_zeek_states(state): + return state + if state := check_argus_states(state): + return state + except IndexError: + # suf does not exist, which means that this is some ICMP or + # no response was sent for UDP or TCP + if state := check_icmp_states(state): + return state + if state := check_udp_states(state): + return state + if state := check_tcp_states(state, pkts): + return state + + return "Not Established" + + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() " f"line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) From 63dc0bd420f3ad6a4390d17b5ee9ce34de8774f5 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:48:16 +0300 Subject: [PATCH 149/455] state_handler: refactor get_final_state_from_flags() --- slips_files/common/state_handler.py | 67 +++++++++++++---------------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index b671a09a28..d0a05115bd 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,9 +1,7 @@ from typing import Optional -import sys -import traceback -def check_suricata_states(state) -> Optional[str]: +def interpret_suricata_states(state) -> Optional[str]: """ There are different states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]: return "Not Established" -def check_zeek_states(state) -> Optional[str]: +def interpret_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. # For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]: return "Established" -def check_argus_states(state) -> Optional[str]: +def interpret_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - suf = state.split("_")[1] + try: + suf = state.split("_")[1] + except IndexError: + return + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]: return "Not Established" -def check_tcp_states(state, pkts) -> Optional[str]: +def interpret_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def check_udp_states(state) -> Optional[str]: +def interpret_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]: return "Not Established" -def check_icmp_states(state) -> Optional[str]: +def interpret_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(self, state, pkts) -> str: +def get_final_state_from_flags(state, pkts) -> str: """ - Analyze the flags given and return a summary of the state. - Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections + Converts the original flags from the flow, to a state that slips + understands + Works with Argus, suricata, and Bro flags + We receive the packets to distinguish some Reset connections """ - try: - if state := check_suricata_states(state): - return state - if state := check_zeek_states(state): - return state - if state := check_argus_states(state): - return state - except IndexError: - # suf does not exist, which means that this is some ICMP or - # no response was sent for UDP or TCP - if state := check_icmp_states(state): - return state - if state := check_udp_states(state): - return state - if state := check_tcp_states(state, pkts): - return state - return "Not Established" + for interpreter in ( + interpret_suricata_states, + interpret_zeek_states, + interpret_argus_states, + interpret_icmp_states, + interpret_udp_states, + ): + if interpreted_state := interpreter(state): + return interpreted_state + + if interpreted_state := interpret_tcp_states(state, pkts): + return interpreted_state - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() " f"line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) + return "Not Established" From 606fc6713ea8a9973d59696e813c708c2cdd64d6 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 150/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 150 ++---------------- slips_files/core/database/database_manager.py | 3 - 2 files changed, 10 insertions(+), 143 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 8917fef6a5..fb17b57f23 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -291,12 +156,17 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others # So transform here - #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) - dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) - - #dataset.state = new_state_column + dataset["state"] = dataset.apply( + lambda row: get_final_state_from_flags( + row["state"], row["pkts"] + ), + axis=1, + ) + # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index e8ca3aaf62..b4b2128d3d 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -613,9 +613,6 @@ def add_out_dns(self, *args, **kwargs): def add_port(self, *args, **kwargs): return self.rdb.add_port(*args, **kwargs) - def get_final_state_from_flags(self, *args, **kwargs): - return self.rdb.get_final_state_from_flags(*args, **kwargs) - def add_ips(self, *args, **kwargs): return self.rdb.add_ips(*args, **kwargs) From 3a13d07707eb85b773bcc61abd93d4d8294dc846 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 151/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index fb17b57f23..12c3589edc 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From b09101cf20d63c822cd82269e40dd9edb17ee624 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 152/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 12c3589edc..fb17b57f23 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 7c50d01107a6bac1ad4e22e67d9a56c9e75af2ca Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 153/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index fb17b57f23..12c3589edc 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From e7c5d824bac46fd7d95499f020a2183e981efdb1 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 154/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 12c3589edc..fb17b57f23 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From b350dcea25090b195f6befbf434f2b4506350b2e Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 155/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index fb17b57f23..12c3589edc 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 49ddfddfd34f0754927332be8e7b61cfa23553f3 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 156/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 12c3589edc..fb17b57f23 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 456bd7208ababe3b0081b46380466f1301f02c2f Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 157/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 163 +++++++++++++++++++-- 1 file changed, 149 insertions(+), 14 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index fb17b57f23..c8226368c7 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -8,6 +8,7 @@ import pickle import pandas as pd import json +import datetime import traceback import warnings import sys @@ -121,6 +122,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -156,17 +292,12 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, - # the state is not transformed to 'Established' or - # 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others # So transform here - dataset["state"] = dataset.apply( - lambda row: get_final_state_from_flags( - row["state"], row["pkts"] - ), - axis=1, - ) - # dataset.state = new_state_column + #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) + dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) + + #dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -393,21 +524,25 @@ def read_model(self): def set_evidence_malicious_flow(self, flow: dict, twid: str): confidence: float = 0.1 description = ( - f"Flow with malicious characteristics by ML. Src IP" + f"Malicious flow by ML. Src IP" f" {flow['saddr']}:{flow['sport']} to " f"{flow['daddr']}:{flow['dport']}" ) + + timestamp = utils.convert_format( + datetime.datetime.now(), utils.alerts_format + ) twid_number = int(twid.replace("timewindow", "")) evidence: Evidence = Evidence( evidence_type=EvidenceType.MALICIOUS_FLOW, attacker=Attacker( direction=Direction.SRC, - ioc_type=IoCType.IP, + attacker_type=IoCType.IP, value=flow["saddr"], ), victim=Victim( direction=Direction.DST, - ioc_type=IoCType.IP, + victim_type=IoCType.IP, value=flow["daddr"], ), threat_level=ThreatLevel.LOW, @@ -416,7 +551,7 @@ def set_evidence_malicious_flow(self, flow: dict, twid: str): profile=ProfileID(ip=flow["saddr"]), timewindow=TimeWindow(twid_number), uid=[flow["uid"]], - timestamp=flow["starttime"], + timestamp=timestamp, method=Method.AI, src_port=flow["sport"], dst_port=flow["dport"], From 592edafb650e53bd0d2bcbc5bf94e5488e2807f7 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:35:21 +0300 Subject: [PATCH 158/455] state_handler: split get_final_state_from_flags() into smaller functions --- slips_files/common/state_handler.py | 67 ++++++++++++++++------------- 1 file changed, 38 insertions(+), 29 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index d0a05115bd..b671a09a28 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,7 +1,9 @@ from typing import Optional +import sys +import traceback -def interpret_suricata_states(state) -> Optional[str]: +def check_suricata_states(state) -> Optional[str]: """ There are different states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -16,7 +18,7 @@ def interpret_suricata_states(state) -> Optional[str]: return "Not Established" -def interpret_zeek_states(state) -> Optional[str]: +def check_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. # For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -25,13 +27,9 @@ def interpret_zeek_states(state) -> Optional[str]: return "Established" -def interpret_argus_states(state) -> Optional[str]: +def check_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - try: - suf = state.split("_")[1] - except IndexError: - return - + suf = state.split("_")[1] if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -88,7 +86,7 @@ def interpret_argus_states(state) -> Optional[str]: return "Not Established" -def interpret_tcp_states(state, pkts) -> Optional[str]: +def check_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -124,7 +122,7 @@ def interpret_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def interpret_udp_states(state) -> Optional[str]: +def check_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -136,7 +134,7 @@ def interpret_udp_states(state) -> Optional[str]: return "Not Established" -def interpret_icmp_states(state) -> Optional[str]: +def check_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -146,25 +144,36 @@ def interpret_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(state, pkts) -> str: +def get_final_state_from_flags(self, state, pkts) -> str: """ - Converts the original flags from the flow, to a state that slips - understands - Works with Argus, suricata, and Bro flags - We receive the packets to distinguish some Reset connections + Analyze the flags given and return a summary of the state. + Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections """ + try: + if state := check_suricata_states(state): + return state + if state := check_zeek_states(state): + return state + if state := check_argus_states(state): + return state + except IndexError: + # suf does not exist, which means that this is some ICMP or + # no response was sent for UDP or TCP + if state := check_icmp_states(state): + return state + if state := check_udp_states(state): + return state + if state := check_tcp_states(state, pkts): + return state - for interpreter in ( - interpret_suricata_states, - interpret_zeek_states, - interpret_argus_states, - interpret_icmp_states, - interpret_udp_states, - ): - if interpreted_state := interpreter(state): - return interpreted_state - - if interpreted_state := interpret_tcp_states(state, pkts): - return interpreted_state + return "Not Established" - return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() " f"line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) From 759e8597228c569727eb85c9c40aa5130903602f Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:48:16 +0300 Subject: [PATCH 159/455] state_handler: refactor get_final_state_from_flags() --- slips_files/common/state_handler.py | 67 +++++++++++++---------------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index b671a09a28..d0a05115bd 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,9 +1,7 @@ from typing import Optional -import sys -import traceback -def check_suricata_states(state) -> Optional[str]: +def interpret_suricata_states(state) -> Optional[str]: """ There are different states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]: return "Not Established" -def check_zeek_states(state) -> Optional[str]: +def interpret_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. # For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]: return "Established" -def check_argus_states(state) -> Optional[str]: +def interpret_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - suf = state.split("_")[1] + try: + suf = state.split("_")[1] + except IndexError: + return + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]: return "Not Established" -def check_tcp_states(state, pkts) -> Optional[str]: +def interpret_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def check_udp_states(state) -> Optional[str]: +def interpret_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]: return "Not Established" -def check_icmp_states(state) -> Optional[str]: +def interpret_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(self, state, pkts) -> str: +def get_final_state_from_flags(state, pkts) -> str: """ - Analyze the flags given and return a summary of the state. - Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections + Converts the original flags from the flow, to a state that slips + understands + Works with Argus, suricata, and Bro flags + We receive the packets to distinguish some Reset connections """ - try: - if state := check_suricata_states(state): - return state - if state := check_zeek_states(state): - return state - if state := check_argus_states(state): - return state - except IndexError: - # suf does not exist, which means that this is some ICMP or - # no response was sent for UDP or TCP - if state := check_icmp_states(state): - return state - if state := check_udp_states(state): - return state - if state := check_tcp_states(state, pkts): - return state - return "Not Established" + for interpreter in ( + interpret_suricata_states, + interpret_zeek_states, + interpret_argus_states, + interpret_icmp_states, + interpret_udp_states, + ): + if interpreted_state := interpreter(state): + return interpreted_state + + if interpreted_state := interpret_tcp_states(state, pkts): + return interpreted_state - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() " f"line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) + return "Not Established" From 07542a4d60f8828af3adb6b11de50356cd760dee Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 160/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 150 ++------------------- 1 file changed, 10 insertions(+), 140 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index c8226368c7..9af514a709 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -122,141 +122,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -292,12 +157,17 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others # So transform here - #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) - dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) - - #dataset.state = new_state_column + dataset["state"] = dataset.apply( + lambda row: get_final_state_from_flags( + row["state"], row["pkts"] + ), + axis=1, + ) + # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( From 748f2d35ebab0b22a0e993f7165c7fb6140d2749 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 161/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 9af514a709..94eb27afdf 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -122,6 +122,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 689dc79ef2926c581b2f0b9d7a4fd75a186f12ba Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 162/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 94eb27afdf..9af514a709 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -122,141 +122,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 34ca9a52592e632e6ea5d28dd486b84c0175fee1 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 163/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 150 +++++++++++++++++++-- 1 file changed, 140 insertions(+), 10 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 9af514a709..c8226368c7 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -122,6 +122,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -157,17 +292,12 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, - # the state is not transformed to 'Established' or - # 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others # So transform here - dataset["state"] = dataset.apply( - lambda row: get_final_state_from_flags( - row["state"], row["pkts"] - ), - axis=1, - ) - # dataset.state = new_state_column + #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) + dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) + + #dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( From 4bd6701d5fbdc655bf1b08b34cbfd3089ea0b852 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:35:21 +0300 Subject: [PATCH 164/455] state_handler: split get_final_state_from_flags() into smaller functions --- slips_files/common/state_handler.py | 67 ++++++++++++++++------------- 1 file changed, 38 insertions(+), 29 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index d0a05115bd..b671a09a28 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,7 +1,9 @@ from typing import Optional +import sys +import traceback -def interpret_suricata_states(state) -> Optional[str]: +def check_suricata_states(state) -> Optional[str]: """ There are different states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -16,7 +18,7 @@ def interpret_suricata_states(state) -> Optional[str]: return "Not Established" -def interpret_zeek_states(state) -> Optional[str]: +def check_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. # For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -25,13 +27,9 @@ def interpret_zeek_states(state) -> Optional[str]: return "Established" -def interpret_argus_states(state) -> Optional[str]: +def check_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - try: - suf = state.split("_")[1] - except IndexError: - return - + suf = state.split("_")[1] if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -88,7 +86,7 @@ def interpret_argus_states(state) -> Optional[str]: return "Not Established" -def interpret_tcp_states(state, pkts) -> Optional[str]: +def check_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -124,7 +122,7 @@ def interpret_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def interpret_udp_states(state) -> Optional[str]: +def check_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -136,7 +134,7 @@ def interpret_udp_states(state) -> Optional[str]: return "Not Established" -def interpret_icmp_states(state) -> Optional[str]: +def check_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -146,25 +144,36 @@ def interpret_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(state, pkts) -> str: +def get_final_state_from_flags(self, state, pkts) -> str: """ - Converts the original flags from the flow, to a state that slips - understands - Works with Argus, suricata, and Bro flags - We receive the packets to distinguish some Reset connections + Analyze the flags given and return a summary of the state. + Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections """ + try: + if state := check_suricata_states(state): + return state + if state := check_zeek_states(state): + return state + if state := check_argus_states(state): + return state + except IndexError: + # suf does not exist, which means that this is some ICMP or + # no response was sent for UDP or TCP + if state := check_icmp_states(state): + return state + if state := check_udp_states(state): + return state + if state := check_tcp_states(state, pkts): + return state - for interpreter in ( - interpret_suricata_states, - interpret_zeek_states, - interpret_argus_states, - interpret_icmp_states, - interpret_udp_states, - ): - if interpreted_state := interpreter(state): - return interpreted_state - - if interpreted_state := interpret_tcp_states(state, pkts): - return interpreted_state + return "Not Established" - return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() " f"line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) From 0fa7bb66ea522aeaa6bc7ef6a128436cc38f61d9 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:48:16 +0300 Subject: [PATCH 165/455] state_handler: refactor get_final_state_from_flags() --- slips_files/common/state_handler.py | 67 +++++++++++++---------------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index b671a09a28..d0a05115bd 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,9 +1,7 @@ from typing import Optional -import sys -import traceback -def check_suricata_states(state) -> Optional[str]: +def interpret_suricata_states(state) -> Optional[str]: """ There are different states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]: return "Not Established" -def check_zeek_states(state) -> Optional[str]: +def interpret_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. # For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]: return "Established" -def check_argus_states(state) -> Optional[str]: +def interpret_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - suf = state.split("_")[1] + try: + suf = state.split("_")[1] + except IndexError: + return + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]: return "Not Established" -def check_tcp_states(state, pkts) -> Optional[str]: +def interpret_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def check_udp_states(state) -> Optional[str]: +def interpret_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]: return "Not Established" -def check_icmp_states(state) -> Optional[str]: +def interpret_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(self, state, pkts) -> str: +def get_final_state_from_flags(state, pkts) -> str: """ - Analyze the flags given and return a summary of the state. - Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections + Converts the original flags from the flow, to a state that slips + understands + Works with Argus, suricata, and Bro flags + We receive the packets to distinguish some Reset connections """ - try: - if state := check_suricata_states(state): - return state - if state := check_zeek_states(state): - return state - if state := check_argus_states(state): - return state - except IndexError: - # suf does not exist, which means that this is some ICMP or - # no response was sent for UDP or TCP - if state := check_icmp_states(state): - return state - if state := check_udp_states(state): - return state - if state := check_tcp_states(state, pkts): - return state - return "Not Established" + for interpreter in ( + interpret_suricata_states, + interpret_zeek_states, + interpret_argus_states, + interpret_icmp_states, + interpret_udp_states, + ): + if interpreted_state := interpreter(state): + return interpreted_state + + if interpreted_state := interpret_tcp_states(state, pkts): + return interpreted_state - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() " f"line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) + return "Not Established" From 26ef89d64d54e0b89815867791b76e31164fc076 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 166/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 150 ++------------------- 1 file changed, 10 insertions(+), 140 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index c8226368c7..9af514a709 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -122,141 +122,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -292,12 +157,17 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others # So transform here - #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) - dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) - - #dataset.state = new_state_column + dataset["state"] = dataset.apply( + lambda row: get_final_state_from_flags( + row["state"], row["pkts"] + ), + axis=1, + ) + # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( From e5902bd3d82d7454fabf81106c7df10f5ca2472f Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 167/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 438 +++++++++++++-------- 1 file changed, 278 insertions(+), 160 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 9af514a709..124ec61f91 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -1,8 +1,3 @@ -# SPDX-FileCopyrightText: 2021 Sebastian Garcia -from typing import Optional - -# SPDX-License-Identifier: GPL-2.0-only -import numpy from sklearn.linear_model import SGDClassifier from sklearn.preprocessing import StandardScaler import pickle @@ -10,13 +5,10 @@ import json import datetime import traceback -import warnings import sys -from slips_files.common.parsers.config_parser import ConfigParser -from slips_files.common.slips_utils import utils -from slips_files.common.abstracts.module import IModule -from slips_files.core.structures.evidence import ( +from slips_files.common.imports import * +from slips_files.core.evidence_structure.evidence import ( Evidence, ProfileID, TimeWindow, @@ -25,8 +17,7 @@ EvidenceType, IoCType, Direction, - Victim, - Method, + IDEACategory, ) # Only for debbuging @@ -38,6 +29,8 @@ def warn(*args, **kwargs): pass +import warnings + warnings.warn = warn @@ -63,8 +56,6 @@ def init(self): # self.scores = [] # The scaler trained during training and to use during testing self.scaler = StandardScaler() - self.model_path = "./modules/flowmldetection/model.bin" - self.scaler_path = "./modules/flowmldetection/scaler.bin" def read_configuration(self): conf = ConfigParser() @@ -122,6 +113,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -130,7 +256,7 @@ def process_features(self, dataset): """ try: # Discard some type of flows that dont have ports - to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""] + to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp"] for proto in to_discard: dataset = dataset[dataset.proto != proto] @@ -139,35 +265,28 @@ def process_features(self, dataset): "appproto", "daddr", "saddr", - "starttime", + "ts", + "origstate", "type_", - "smac", - "dmac", - "history", - "uid", "dir_", + "history", "dbytes", - "endtime", - "bytes", - "flow_source", + "dpkts", + "smac", + "dmac", ] for field in to_drop: try: dataset = dataset.drop(field, axis=1) - except (ValueError, KeyError): + except ValueError: pass - # When flows are read from Slips sqlite, - # the state is not transformed to 'Established' or - # 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others # So transform here - dataset["state"] = dataset.apply( - lambda row: get_final_state_from_flags( - row["state"], row["pkts"] - ), - axis=1, - ) - # dataset.state = new_state_column + #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) + dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) + + #dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -201,23 +320,42 @@ def process_features(self, dataset): dataset.proto = dataset.proto.str.replace( r"(^.*arp.*$)", "4", regex=True ) - fields_to_convert_to_flow = [ - dataset.proto, - dataset.dport, - dataset.sport, - dataset.dur, - dataset.pkts, - dataset.spkts, - dataset.allbytes, - dataset.sbytes, - dataset.state, - ] - for field in fields_to_convert_to_flow: - try: - field = field.astype("float64") - except ValueError: - pass - + dataset.proto = dataset.proto.astype("float64") + try: + # Convert dport to float + dataset.dport = dataset.dport.astype("float") + except ValueError: + pass + try: + # Convert sport to float + dataset.sport = dataset.sport.astype("float") + except ValueError: + pass + try: + # Convert Dur to float + dataset.dur = dataset.dur.astype("float") + except ValueError: + pass + try: + # Convert TotPkts to float + dataset.pkts = dataset.pkts.astype("float") + except ValueError: + pass + try: + # Convert SrcPkts to float + dataset.spkts = dataset.spkts.astype("float") + except ValueError: + pass + try: + # Convert TotBytes to float + dataset.allbytes = dataset.allbytes.astype("float") + except ValueError: + pass + try: + # Convert SrcBytes to float + dataset.sbytes = dataset.sbytes.astype("float") + except ValueError: + pass return dataset except Exception: # Stop the timer @@ -233,6 +371,7 @@ def process_flows(self): # We get all the flows so far # because this retraining happens in batches flows = self.db.get_all_flows() + # Check how many different labels are in the DB # We need both normal and malware labels = self.db.get_labels() @@ -252,7 +391,9 @@ def process_flows(self): "daddr": "40.70.224.145", "dport": "443", "proto": "tcp", + "origstate": "SRPA_SPA", "state": "Established", + "pkts": 84, "allbytes": 42764, "spkts": 37, "sbytes": 25517, @@ -272,7 +413,9 @@ def process_flows(self): "daddr": "80.242.138.72", "dport": "80", "proto": "tcp", + "origstate": "SRPA_SPA", "state": "Established", + "pkts": 67, "allbytes": 67696, "spkts": 1, "sbytes": 100, @@ -298,55 +441,42 @@ def process_flows(self): self.print("Error in process_flows()") self.print(traceback.format_exc(), 0, 1) - def process_flow(self, flow_to_process: dict): + def process_flow(self): """ Process one flow. Only used during detection in testing - returns the pandas df with the processed flow + Store the pandas df in self.flow """ try: # Convert the flow to a pandas dataframe - raw_flow = pd.DataFrame(flow_to_process, index=[0]) + raw_flow = pd.DataFrame(self.flow_dict, index=[0]) + # Process features dflow = self.process_features(raw_flow) # Update the flow to the processed version - return dflow + self.flow = dflow except Exception: # Stop the timer self.print("Error in process_flow()") self.print(traceback.format_exc(), 0, 1) - def detect(self, x_flow) -> Optional[numpy.ndarray]: + def detect(self): """ - Detects the given flow with the current model stored - and returns the predection array + Detect this flow with the current model stored """ try: - given_x_flow = x_flow - # clean the flow - fields_to_drop = [ - "label", - "module_labels", - "uid", - "history", - "dir_", - "dbytes", - "dpkts", - "endtime", - "bytes", - "flow_source", - ] - for field in fields_to_drop: - try: - x_flow = x_flow.drop(field, axis=1) - except (KeyError, ValueError): - pass + # Store the real label if there is one + y_flow = self.flow["label"] + # remove the real label column + self.flow = self.flow.drop("label", axis=1) + # remove the label predictions column of the other modules + X_flow = self.flow.drop("module_labels", axis=1) # Scale the flow - x_flow: numpy.ndarray = self.scaler.transform(x_flow) - pred: numpy.ndarray = self.clf.predict(x_flow) + X_flow = self.scaler.transform(X_flow) + pred = self.clf.predict(X_flow) return pred - except Exception as e: - self.print( - f"Error in detect() while processing " f"\n{given_x_flow}\n{e}" - ) + except Exception: + # Stop the timer + self.print("Error in detect() X_flow:") + self.print(X_flow) self.print(traceback.format_exc(), 0, 1) def store_model(self): @@ -354,10 +484,10 @@ def store_model(self): Store the trained model on disk """ self.print("Storing the trained model and scaler on disk.", 0, 2) - with open(self.model_path, "wb") as f: + with open("./modules/flowmldetection/model.bin", "wb") as f: data = pickle.dumps(self.clf) f.write(data) - with open(self.scaler_path, "wb") as g: + with open("./modules/flowmldetection/scaler.bin", "wb") as g: data = pickle.dumps(self.scaler) g.write(data) @@ -367,23 +497,20 @@ def read_model(self): """ try: self.print("Reading the trained model from disk.", 0, 2) - with open(self.model_path, "rb") as f: + with open("./modules/flowmldetection/model.bin", "rb") as f: self.clf = pickle.load(f) self.print("Reading the trained scaler from disk.", 0, 2) - with open(self.scaler_path, "rb") as g: + with open("./modules/flowmldetection/scaler.bin", "rb") as g: self.scaler = pickle.load(g) except FileNotFoundError: # If there is no model, create one empty - self.print( - "There was no model. " "Creating a new empty model.", 0, 2 - ) + self.print("There was no model. Creating a new empty model.", 0, 2) self.clf = SGDClassifier( warm_start=True, loss="hinge", penalty="l1" ) except EOFError: self.print( - "Error reading model from disk. " - "Creating a new empty model.", + "Error reading model from disk. Creating a new empty model.", 0, 2, ) @@ -391,40 +518,39 @@ def read_model(self): warm_start=True, loss="hinge", penalty="l1" ) - def set_evidence_malicious_flow(self, flow: dict, twid: str): + def set_evidence_malicious_flow( + self, + saddr: str, + sport: str, + daddr: str, + dport: str, + twid: str, + uid: str, + ): confidence: float = 0.1 + ip_identification = self.db.get_ip_identification(daddr) description = ( - f"Malicious flow by ML. Src IP" - f" {flow['saddr']}:{flow['sport']} to " - f"{flow['daddr']}:{flow['dport']}" + f"Malicious flow by ML. Src IP {saddr}:{sport} to " + f"{daddr}:{dport} {ip_identification}" ) timestamp = utils.convert_format( datetime.datetime.now(), utils.alerts_format ) - twid_number = int(twid.replace("timewindow", "")) + evidence: Evidence = Evidence( evidence_type=EvidenceType.MALICIOUS_FLOW, attacker=Attacker( - direction=Direction.SRC, - attacker_type=IoCType.IP, - value=flow["saddr"], - ), - victim=Victim( - direction=Direction.DST, - victim_type=IoCType.IP, - value=flow["daddr"], + direction=Direction.SRC, attacker_type=IoCType.IP, value=saddr ), threat_level=ThreatLevel.LOW, confidence=confidence, description=description, - profile=ProfileID(ip=flow["saddr"]), - timewindow=TimeWindow(twid_number), - uid=[flow["uid"]], + profile=ProfileID(ip=saddr), + timewindow=TimeWindow(number=int(twid.replace("timewindow", ""))), + uid=[uid], timestamp=timestamp, - method=Method.AI, - src_port=flow["sport"], - dst_port=flow["dport"], + category=IDEACategory.ANOMALY_TRAFFIC, ) self.db.set_evidence(evidence) @@ -441,22 +567,20 @@ def pre_main(self): def main(self): if msg := self.get_msg("new_flow"): - msg = json.loads(msg["data"]) - twid = msg["twid"] - self.flow = msg["flow"] - # these fields are expected in testing. update the original - # flow dict to have them - self.flow.update( - { - "allbytes": (self.flow["sbytes"] + self.flow["dbytes"]), - # the flow["state"] is the origstate, we dont need that here - # we need the interpreted state - "state": msg["interpreted_state"], - "pkts": self.flow["spkts"] + self.flow["dpkts"], - "label": msg["label"], - "module_labels": msg["module_labels"], - } - ) + data = msg["data"] + # Convert from json to dict + data = json.loads(data) + profileid = data["profileid"] + twid = data["twid"] + # Get flow that is now in json format + flow = data["flow"] + # Convert flow to a dict + flow = json.loads(flow) + # Convert the common fields to something that can + # be interpreted + # Get the uid which is the key + uid = next(iter(flow)) + self.flow_dict = json.loads(flow[uid]) if self.mode == "train": # We are training @@ -469,57 +593,51 @@ def main(self): sum_labeled_flows >= self.minimum_lables_to_retrain and sum_labeled_flows % self.minimum_lables_to_retrain == 1 ): - # We get here every 'self.minimum_lables_to_retrain' - # amount of labels - # So for example we retrain every 100 labels and only when - # we have at least 100 labels + # We get here every 'self.minimum_lables_to_retrain' amount of labels + # So for example we retrain every 100 labels and only when we have at least 100 labels self.print( - f"Training the model with the last group of " - f"flows and labels. Total flows: {sum_labeled_flows}." + f"Training the model with the last group of flows and labels. Total flows: {sum_labeled_flows}." ) - # Process all flows in the DB and make them ready - # for pandas + # Process all flows in the DB and make them ready for pandas self.process_flows() # Train an algorithm self.train() elif self.mode == "test": # We are testing, which means using the model to detect - processed_flow = self.process_flow(self.flow) + self.process_flow() - # After processing the flow, it may happen that we - # delete icmp/arp/etc so the dataframe can be empty - if processed_flow is not None and not processed_flow.empty: + # After processing the flow, it may happen that we delete icmp/arp/etc + # so the dataframe can be empty + if self.flow is not None and not self.flow.empty: # Predict - pred: numpy.ndarray = self.detect(processed_flow) - if not pred: - # an error occurred - return + pred = self.detect() + label = self.flow_dict["label"] - label = self.flow["label"] + # Report if label and label != "unknown" and label != pred[0]: - # If the user specified a label in test mode, - # and the label is diff from the prediction, - # print in debug mode + # If the user specified a label in test mode, and the label + # is diff from the prediction, print in debug mode self.print( - f"Report Prediction {pred[0]} for label" - f' {label} flow {self.flow["saddr"]}:' - f'{self.flow["sport"]} ->' - f' {self.flow["daddr"]}:' - f'{self.flow["dport"]}/' - f'{self.flow["proto"]}', + f'Report Prediction {pred[0]} for label {label} flow {self.flow_dict["saddr"]}:' + f'{self.flow_dict["sport"]} -> {self.flow_dict["daddr"]}:' + f'{self.flow_dict["dport"]}/{self.flow_dict["proto"]}', 0, 3, ) if pred[0] == "Malware": # Generate an alert - self.set_evidence_malicious_flow(self.flow, twid) + self.set_evidence_malicious_flow( + self.flow_dict["saddr"], + self.flow_dict["sport"], + self.flow_dict["daddr"], + self.flow_dict["dport"], + twid, + uid, + ) self.print( - f"Prediction {pred[0]} for label {label}" - f' flow {self.flow["saddr"]}:' - f'{self.flow["sport"]} -> ' - f'{self.flow["daddr"]}:' - f'{self.flow["dport"]}/' - f'{self.flow["proto"]}', + f'Prediction {pred[0]} for label {label} flow {self.flow_dict["saddr"]}:' + f'{self.flow_dict["sport"]} -> {self.flow_dict["daddr"]}:' + f'{self.flow_dict["dport"]}/{self.flow_dict["proto"]}', 0, 2, ) From e5ee4b746411b114c9a96fc98aa97d130a75faee Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:35:21 +0300 Subject: [PATCH 168/455] state_handler: split get_final_state_from_flags() into smaller functions --- slips_files/common/state_handler.py | 67 ++++++++++++++++------------- 1 file changed, 38 insertions(+), 29 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index d0a05115bd..b671a09a28 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,7 +1,9 @@ from typing import Optional +import sys +import traceback -def interpret_suricata_states(state) -> Optional[str]: +def check_suricata_states(state) -> Optional[str]: """ There are different states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -16,7 +18,7 @@ def interpret_suricata_states(state) -> Optional[str]: return "Not Established" -def interpret_zeek_states(state) -> Optional[str]: +def check_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. # For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -25,13 +27,9 @@ def interpret_zeek_states(state) -> Optional[str]: return "Established" -def interpret_argus_states(state) -> Optional[str]: +def check_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - try: - suf = state.split("_")[1] - except IndexError: - return - + suf = state.split("_")[1] if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -88,7 +86,7 @@ def interpret_argus_states(state) -> Optional[str]: return "Not Established" -def interpret_tcp_states(state, pkts) -> Optional[str]: +def check_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -124,7 +122,7 @@ def interpret_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def interpret_udp_states(state) -> Optional[str]: +def check_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -136,7 +134,7 @@ def interpret_udp_states(state) -> Optional[str]: return "Not Established" -def interpret_icmp_states(state) -> Optional[str]: +def check_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -146,25 +144,36 @@ def interpret_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(state, pkts) -> str: +def get_final_state_from_flags(self, state, pkts) -> str: """ - Converts the original flags from the flow, to a state that slips - understands - Works with Argus, suricata, and Bro flags - We receive the packets to distinguish some Reset connections + Analyze the flags given and return a summary of the state. + Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections """ + try: + if state := check_suricata_states(state): + return state + if state := check_zeek_states(state): + return state + if state := check_argus_states(state): + return state + except IndexError: + # suf does not exist, which means that this is some ICMP or + # no response was sent for UDP or TCP + if state := check_icmp_states(state): + return state + if state := check_udp_states(state): + return state + if state := check_tcp_states(state, pkts): + return state - for interpreter in ( - interpret_suricata_states, - interpret_zeek_states, - interpret_argus_states, - interpret_icmp_states, - interpret_udp_states, - ): - if interpreted_state := interpreter(state): - return interpreted_state - - if interpreted_state := interpret_tcp_states(state, pkts): - return interpreted_state + return "Not Established" - return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() " f"line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) From 840822e5c232b4b3fefa206b99b331759ff2877d Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:48:16 +0300 Subject: [PATCH 169/455] state_handler: refactor get_final_state_from_flags() --- slips_files/common/state_handler.py | 67 +++++++++++++---------------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index b671a09a28..d0a05115bd 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,9 +1,7 @@ from typing import Optional -import sys -import traceback -def check_suricata_states(state) -> Optional[str]: +def interpret_suricata_states(state) -> Optional[str]: """ There are different states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]: return "Not Established" -def check_zeek_states(state) -> Optional[str]: +def interpret_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. # For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]: return "Established" -def check_argus_states(state) -> Optional[str]: +def interpret_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - suf = state.split("_")[1] + try: + suf = state.split("_")[1] + except IndexError: + return + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]: return "Not Established" -def check_tcp_states(state, pkts) -> Optional[str]: +def interpret_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def check_udp_states(state) -> Optional[str]: +def interpret_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]: return "Not Established" -def check_icmp_states(state) -> Optional[str]: +def interpret_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(self, state, pkts) -> str: +def get_final_state_from_flags(state, pkts) -> str: """ - Analyze the flags given and return a summary of the state. - Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections + Converts the original flags from the flow, to a state that slips + understands + Works with Argus, suricata, and Bro flags + We receive the packets to distinguish some Reset connections """ - try: - if state := check_suricata_states(state): - return state - if state := check_zeek_states(state): - return state - if state := check_argus_states(state): - return state - except IndexError: - # suf does not exist, which means that this is some ICMP or - # no response was sent for UDP or TCP - if state := check_icmp_states(state): - return state - if state := check_udp_states(state): - return state - if state := check_tcp_states(state, pkts): - return state - return "Not Established" + for interpreter in ( + interpret_suricata_states, + interpret_zeek_states, + interpret_argus_states, + interpret_icmp_states, + interpret_udp_states, + ): + if interpreted_state := interpreter(state): + return interpreted_state + + if interpreted_state := interpret_tcp_states(state, pkts): + return interpreted_state - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() " f"line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) + return "Not Established" From a30b45c3016d25e45c6038b66e25eb155c6a72c3 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 170/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 169 +++------------------ 1 file changed, 19 insertions(+), 150 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 124ec61f91..c57a7a3581 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -5,9 +5,13 @@ import json import datetime import traceback -import sys +import warnings + -from slips_files.common.imports import * +from slips_files.common.state_handler import get_final_state_from_flags +from slips_files.common.parsers.config_parser import ConfigParser +from slips_files.common.slips_utils import utils +from slips_files.common.abstracts.module import IModule from slips_files.core.evidence_structure.evidence import ( Evidence, ProfileID, @@ -29,8 +33,6 @@ def warn(*args, **kwargs): pass -import warnings - warnings.warn = warn @@ -113,141 +115,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -281,12 +148,17 @@ def process_features(self, dataset): except ValueError: pass - # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others # So transform here - #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) - dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) - - #dataset.state = new_state_column + dataset["state"] = dataset.apply( + lambda row: get_final_state_from_flags( + row["state"], row["pkts"] + ), + axis=1, + ) + # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -370,7 +242,7 @@ def process_flows(self): try: # We get all the flows so far # because this retraining happens in batches - flows = self.db.get_all_flows() + flows: list = self.db.get_all_flows() # Check how many different labels are in the DB # We need both normal and malware @@ -464,7 +336,7 @@ def detect(self): """ try: # Store the real label if there is one - y_flow = self.flow["label"] + # y_flow = self.flow["label"] # remove the real label column self.flow = self.flow.drop("label", axis=1) # remove the label predictions column of the other modules @@ -568,13 +440,10 @@ def pre_main(self): def main(self): if msg := self.get_msg("new_flow"): data = msg["data"] - # Convert from json to dict data = json.loads(data) - profileid = data["profileid"] + # profileid = data["profileid"] twid = data["twid"] - # Get flow that is now in json format flow = data["flow"] - # Convert flow to a dict flow = json.loads(flow) # Convert the common fields to something that can # be interpreted From bf4c8cf95ca6cfa2d28ca270560e9001fd6f127c Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:36:55 +0200 Subject: [PATCH 171/455] mlflow. Ignore UID column --- modules/flowmldetection/flowmldetection.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index c57a7a3581..e2aa1e0ee3 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -141,6 +141,7 @@ def process_features(self, dataset): "dpkts", "smac", "dmac", + "uid", ] for field in to_drop: try: From 59a109713f00126acf7633e9435156c49b5ec580 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:23:29 +0100 Subject: [PATCH 172/455] Re add function that alya added --- slips_files/core/database/database_manager.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index b4b2128d3d..e8ca3aaf62 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -613,6 +613,9 @@ def add_out_dns(self, *args, **kwargs): def add_port(self, *args, **kwargs): return self.rdb.add_port(*args, **kwargs) + def get_final_state_from_flags(self, *args, **kwargs): + return self.rdb.get_final_state_from_flags(*args, **kwargs) + def add_ips(self, *args, **kwargs): return self.rdb.add_ips(*args, **kwargs) From bb582a55c2a8460bcf408204dc175207b2499682 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:27:23 +0100 Subject: [PATCH 173/455] Delete file that was deleted from develop --- slips_files/common/state_handler.py | 170 ---------------------------- 1 file changed, 170 deletions(-) delete mode 100644 slips_files/common/state_handler.py diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py deleted file mode 100644 index d0a05115bd..0000000000 --- a/slips_files/common/state_handler.py +++ /dev/null @@ -1,170 +0,0 @@ -from typing import Optional - - -def interpret_suricata_states(state) -> Optional[str]: - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for - UDP. For TCP, - these are: New, Established and Closed,for UDP only new and - established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - -def interpret_zeek_states(state) -> Optional[str]: - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - -def interpret_argus_states(state) -> Optional[str]: - pre = state.split("_")[0] - try: - suf = state.split("_")[1] - except IndexError: - return - - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - - -def interpret_tcp_states(state, pkts) -> Optional[str]: - pre = state.split("_")[0] - if "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. - # Most connections are reseted when finished and therefore are - # established - # It can happen that is reseted being not established, but we - # can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is - # not established because the OS retries 3 times. - return "Not Established" if int(pkts) <= 3 else "Established" - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. - # Most connections are finished with FIN when finished and - # therefore are established - # It can happen that is finished being not established, but we - # can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is - # not established because the OS retries 3 times. - return "Not Established" if int(pkts) <= 3 else "Established" - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - - -def interpret_udp_states(state) -> Optional[str]: - pre = state.split("_")[0] - if "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also - # NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - - -def interpret_icmp_states(state) -> Optional[str]: - pre = state.split("_")[0] - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - - -def get_final_state_from_flags(state, pkts) -> str: - """ - Converts the original flags from the flow, to a state that slips - understands - Works with Argus, suricata, and Bro flags - We receive the packets to distinguish some Reset connections - """ - - for interpreter in ( - interpret_suricata_states, - interpret_zeek_states, - interpret_argus_states, - interpret_icmp_states, - interpret_udp_states, - ): - if interpreted_state := interpreter(state): - return interpreted_state - - if interpreted_state := interpret_tcp_states(state, pkts): - return interpreted_state - - return "Not Established" From b586ac78776b01465a9476771ccec69b3df635c3 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:32:01 +0100 Subject: [PATCH 174/455] Flowmldetection. Fix missing db reference --- modules/flowmldetection/flowmldetection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index e2aa1e0ee3..9269b67012 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -154,7 +154,7 @@ def process_features(self, dataset): # 'Not Established', it is still 'S0' and others # So transform here dataset["state"] = dataset.apply( - lambda row: get_final_state_from_flags( + lambda row: self.db.get_final_state_from_flags( row["state"], row["pkts"] ), axis=1, From 5ccc0dd3da3eb9f31c5b4a2ab5dbdf89e9b32898 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Tue, 18 Mar 2025 12:08:08 +0100 Subject: [PATCH 175/455] Fix the training of flows with ML in new version --- modules/flowmldetection/flowmldetection.py | 378 +++++++++++---------- 1 file changed, 197 insertions(+), 181 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 9269b67012..e6ea0b5171 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -1,18 +1,20 @@ +# SPDX-FileCopyrightText: 2021 Sebastian Garcia +from typing import Optional + +# SPDX-License-Identifier: GPL-2.0-only +import numpy from sklearn.linear_model import SGDClassifier from sklearn.preprocessing import StandardScaler import pickle import pandas as pd import json -import datetime import traceback import warnings - -from slips_files.common.state_handler import get_final_state_from_flags from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils from slips_files.common.abstracts.module import IModule -from slips_files.core.evidence_structure.evidence import ( +from slips_files.core.structures.evidence import ( Evidence, ProfileID, TimeWindow, @@ -21,7 +23,8 @@ EvidenceType, IoCType, Direction, - IDEACategory, + Victim, + Method, ) # Only for debbuging @@ -52,36 +55,41 @@ def init(self): # Set the output queue of our database instance # Read the configuration self.read_configuration() - # Minum amount of new lables needed to trigger the train - self.minimum_lables_to_retrain = 50 + # Minum amount of new labels needed to start the train + self.minimum_labels_to_start_train = 50 + # Minum amount of new labels needed to retrain + self.minimum_labels_to_retrain = 50 + # The number of flows when last trained + self.last_number_of_flows_when_trained = 0 # To plot the scores of training # self.scores = [] # The scaler trained during training and to use during testing self.scaler = StandardScaler() + self.model_path = "./modules/flowmldetection/model.bin" + self.scaler_path = "./modules/flowmldetection/scaler.bin" def read_configuration(self): conf = ConfigParser() self.mode = conf.get_ml_mode() + self.label = conf.label() def train(self): """ Train a model based on the flows we receive and the labels """ try: - # Process the labels to have only Normal and Malware - self.flows.label = self.flows.label.str.replace( - r"(^.*ormal.*$)", "Normal", regex=True - ) - self.flows.label = self.flows.label.str.replace( - r"(^.*alware.*$)", "Malware", regex=True - ) - self.flows.label = self.flows.label.str.replace( - r"(^.*alicious.*$)", "Malware", regex=True - ) + # Get the flows from the DB + # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid) + # Convert to pandas df + # self.flows = pd.DataFrame(self.flows) + # Process the features + # X_flow = self.process_features(self.flows) - # Separate - y_flow = self.flows["label"] + # Create X_flow with the current flows minus the label X_flow = self.flows.drop("label", axis=1) + # Create y_flow with the label + y_flow = numpy.full(X_flow.shape[0], self.label) + # Drop the module_labels X_flow = X_flow.drop("module_labels", axis=1) # Normalize this batch of data so far. This can get progressivle slow @@ -90,7 +98,7 @@ def train(self): # Train try: self.clf.partial_fit( - X_flow, y_flow, classes=["Malware", "Normal"] + X_flow, y_flow, classes=["Malicious", "Benign"] ) except Exception: self.print("Error while calling clf.train()") @@ -113,7 +121,7 @@ def train(self): self.store_model() except Exception: - self.print("Error in train()", 0, 1) + self.print("Error in train().", 0, 1) self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): @@ -123,7 +131,7 @@ def process_features(self, dataset): """ try: # Discard some type of flows that dont have ports - to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp"] + to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""] for proto in to_discard: dataset = dataset[dataset.proto != proto] @@ -132,21 +140,20 @@ def process_features(self, dataset): "appproto", "daddr", "saddr", - "ts", - "origstate", + "starttime", "type_", - "dir_", - "history", - "dbytes", - "dpkts", "smac", "dmac", + "history", "uid", + "dir_", + "endtime", + "flow_source", ] for field in to_drop: try: dataset = dataset.drop(field, axis=1) - except ValueError: + except (ValueError, KeyError): pass # When flows are read from Slips sqlite, @@ -155,11 +162,10 @@ def process_features(self, dataset): # So transform here dataset["state"] = dataset.apply( lambda row: self.db.get_final_state_from_flags( - row["state"], row["pkts"] + row["state"], (row["spkts"] + row["dpkts"]) ), axis=1, ) - # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -193,58 +199,42 @@ def process_features(self, dataset): dataset.proto = dataset.proto.str.replace( r"(^.*arp.*$)", "4", regex=True ) - dataset.proto = dataset.proto.astype("float64") - try: - # Convert dport to float - dataset.dport = dataset.dport.astype("float") - except ValueError: - pass - try: - # Convert sport to float - dataset.sport = dataset.sport.astype("float") - except ValueError: - pass - try: - # Convert Dur to float - dataset.dur = dataset.dur.astype("float") - except ValueError: - pass - try: - # Convert TotPkts to float - dataset.pkts = dataset.pkts.astype("float") - except ValueError: - pass - try: - # Convert SrcPkts to float - dataset.spkts = dataset.spkts.astype("float") - except ValueError: - pass - try: - # Convert TotBytes to float - dataset.allbytes = dataset.allbytes.astype("float") - except ValueError: - pass - try: - # Convert SrcBytes to float - dataset.sbytes = dataset.sbytes.astype("float") - except ValueError: - pass + + dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"] + dataset["pkts"] = dataset["spkts"] + dataset["dpkts"] + + fields_to_convert_to_float = [ + dataset.proto, + dataset.dport, + dataset.sport, + dataset.dur, + dataset.pkts, + dataset.spkts, + dataset.allbytes, + dataset.sbytes, + dataset.state, + ] + for field in fields_to_convert_to_float: + try: + field = field.astype("float64") + except (ValueError, AttributeError): + pass + return dataset except Exception: # Stop the timer self.print("Error in process_features()") self.print(traceback.format_exc(), 0, 1) - def process_flows(self): + def process_training_flows(self): """ - Process all the flwos in the DB + Process all the flows in the DB Store the pandas df in self.flows """ try: # We get all the flows so far # because this retraining happens in batches - flows: list = self.db.get_all_flows() - + flows = self.db.get_all_flows() # Check how many different labels are in the DB # We need both normal and malware labels = self.db.get_labels() @@ -254,48 +244,48 @@ def process_flows(self): # that are fake but representative of a normal and malware flow # they are only for the training process # At least 1 flow of each label is required - # self.print(f'Amount of labeled flows: {labels}', 0, 1) + + # These flows should be in the same format as the ones in the DB. + # Which means the satate is still SF, S0, etc. flows.append( { - "ts": 1594417039.029793, + "starttime": 1594417039.029793, "dur": "1.9424750804901123", "saddr": "10.7.10.101", "sport": "49733", "daddr": "40.70.224.145", "dport": "443", "proto": "tcp", - "origstate": "SRPA_SPA", - "state": "Established", - "pkts": 84, - "allbytes": 42764, - "spkts": 37, + "state": "SF", + "spkts": 17, + "dpkts": 27, "sbytes": 25517, + "dbytes": 17247, "appproto": "ssl", - "label": "Malware", + "label": "Malicious", "module_labels": { - "flowalerts-long-connection": "Malware" + "flowalerts-long-connection": "Malicious" }, } ) flows.append( { - "ts": 1382355032.706468, + "starttime": 1382355032.706468, "dur": "10.896695", "saddr": "147.32.83.52", "sport": "47956", "daddr": "80.242.138.72", "dport": "80", "proto": "tcp", - "origstate": "SRPA_SPA", - "state": "Established", - "pkts": 67, - "allbytes": 67696, + "state": "SF", "spkts": 1, + "dpkts": 0, "sbytes": 100, + "dbytes": 67596, "appproto": "http", - "label": "Normal", + "label": "Benign", "module_labels": { - "flowalerts-long-connection": "Normal" + "flowalerts-long-connection": "Benign" }, } ) @@ -314,42 +304,51 @@ def process_flows(self): self.print("Error in process_flows()") self.print(traceback.format_exc(), 0, 1) - def process_flow(self): + def process_flow(self, flow_to_process: dict): """ Process one flow. Only used during detection in testing - Store the pandas df in self.flow + returns the pandas df with the processed flow """ try: # Convert the flow to a pandas dataframe - raw_flow = pd.DataFrame(self.flow_dict, index=[0]) - # Process features + raw_flow = pd.DataFrame(flow_to_process, index=[0]) dflow = self.process_features(raw_flow) # Update the flow to the processed version - self.flow = dflow + return dflow except Exception: # Stop the timer self.print("Error in process_flow()") self.print(traceback.format_exc(), 0, 1) - def detect(self): + def detect(self, x_flow) -> Optional[numpy.ndarray]: """ - Detect this flow with the current model stored + Detects the given flow with the current model stored + and returns the predection array """ try: - # Store the real label if there is one - # y_flow = self.flow["label"] - # remove the real label column - self.flow = self.flow.drop("label", axis=1) - # remove the label predictions column of the other modules - X_flow = self.flow.drop("module_labels", axis=1) + # clean the flow + fields_to_drop = [ + "label", + "module_labels", + "uid", + "history", + "dir_", + "endtime", + "flow_source", + ] + for field in fields_to_drop: + try: + x_flow = x_flow.drop(field, axis=1) + except (KeyError, ValueError): + pass # Scale the flow - X_flow = self.scaler.transform(X_flow) - pred = self.clf.predict(X_flow) + x_flow: numpy.ndarray = self.scaler.transform(x_flow) + pred: numpy.ndarray = self.clf.predict(x_flow) return pred - except Exception: - # Stop the timer - self.print("Error in detect() X_flow:") - self.print(X_flow) + except Exception as e: + self.print( + f"Error in detect() while processing " f"\n{x_flow}\n{e}" + ) self.print(traceback.format_exc(), 0, 1) def store_model(self): @@ -357,10 +356,10 @@ def store_model(self): Store the trained model on disk """ self.print("Storing the trained model and scaler on disk.", 0, 2) - with open("./modules/flowmldetection/model.bin", "wb") as f: + with open(self.model_path, "wb") as f: data = pickle.dumps(self.clf) f.write(data) - with open("./modules/flowmldetection/scaler.bin", "wb") as g: + with open(self.scaler_path, "wb") as g: data = pickle.dumps(self.scaler) g.write(data) @@ -370,20 +369,23 @@ def read_model(self): """ try: self.print("Reading the trained model from disk.", 0, 2) - with open("./modules/flowmldetection/model.bin", "rb") as f: + with open(self.model_path, "rb") as f: self.clf = pickle.load(f) self.print("Reading the trained scaler from disk.", 0, 2) - with open("./modules/flowmldetection/scaler.bin", "rb") as g: + with open(self.scaler_path, "rb") as g: self.scaler = pickle.load(g) except FileNotFoundError: # If there is no model, create one empty - self.print("There was no model. Creating a new empty model.", 0, 2) + self.print( + "There was no model. " "Creating a new empty model.", 0, 2 + ) self.clf = SGDClassifier( warm_start=True, loss="hinge", penalty="l1" ) except EOFError: self.print( - "Error reading model from disk. Creating a new empty model.", + "Error reading model from disk. " + "Creating a new empty model.", 0, 2, ) @@ -391,39 +393,36 @@ def read_model(self): warm_start=True, loss="hinge", penalty="l1" ) - def set_evidence_malicious_flow( - self, - saddr: str, - sport: str, - daddr: str, - dport: str, - twid: str, - uid: str, - ): + def set_evidence_malicious_flow(self, flow: dict, twid: str): confidence: float = 0.1 - ip_identification = self.db.get_ip_identification(daddr) description = ( - f"Malicious flow by ML. Src IP {saddr}:{sport} to " - f"{daddr}:{dport} {ip_identification}" - ) - - timestamp = utils.convert_format( - datetime.datetime.now(), utils.alerts_format + f"Flow with malicious characteristics by ML. Src IP" + f" {flow['saddr']}:{flow['sport']} to " + f"{flow['daddr']}:{flow['dport']}" ) - + twid_number = int(twid.replace("timewindow", "")) evidence: Evidence = Evidence( evidence_type=EvidenceType.MALICIOUS_FLOW, attacker=Attacker( - direction=Direction.SRC, attacker_type=IoCType.IP, value=saddr + direction=Direction.SRC, + ioc_type=IoCType.IP, + value=flow["saddr"], + ), + victim=Victim( + direction=Direction.DST, + ioc_type=IoCType.IP, + value=flow["daddr"], ), threat_level=ThreatLevel.LOW, confidence=confidence, description=description, - profile=ProfileID(ip=saddr), - timewindow=TimeWindow(number=int(twid.replace("timewindow", ""))), - uid=[uid], - timestamp=timestamp, - category=IDEACategory.ANOMALY_TRAFFIC, + profile=ProfileID(ip=flow["saddr"]), + timewindow=TimeWindow(twid_number), + uid=[flow["uid"]], + timestamp=flow["starttime"], + method=Method.AI, + src_port=flow["sport"], + dst_port=flow["dport"], ) self.db.set_evidence(evidence) @@ -440,17 +439,20 @@ def pre_main(self): def main(self): if msg := self.get_msg("new_flow"): - data = msg["data"] - data = json.loads(data) - # profileid = data["profileid"] - twid = data["twid"] - flow = data["flow"] - flow = json.loads(flow) - # Convert the common fields to something that can - # be interpreted - # Get the uid which is the key - uid = next(iter(flow)) - self.flow_dict = json.loads(flow[uid]) + # When a new flow arrives + msg = json.loads(msg["data"]) + self.twid = msg["twid"] + self.profileid = msg["profileid"] + self.flow = msg["flow"] + # These following extra fields are expected in testing. update the original + # flow dict to have them + self.flow.update( + { + "state": msg["interpreted_state"], + "label": msg["label"], + "module_labels": msg["module_labels"], + } + ) if self.mode == "train": # We are training @@ -459,55 +461,69 @@ def main(self): # Use labeled flows labels = self.db.get_labels() sum_labeled_flows = sum(i[1] for i in labels) + + # The min labels to retrain is the min number of flows + # we should have seen so far in this capture to start training + # This is so we dont _start_ training with only 1 flow + + # Once we are over the start minimum, the second condition is + # to force to retrain every a minimum_labels_to_retrain number + # of flows. So we dont retrain every 1 flow. if ( - sum_labeled_flows >= self.minimum_lables_to_retrain - and sum_labeled_flows % self.minimum_lables_to_retrain == 1 + sum_labeled_flows >= self.minimum_labels_to_start_train ): - # We get here every 'self.minimum_lables_to_retrain' amount of labels - # So for example we retrain every 100 labels and only when we have at least 100 labels - self.print( - f"Training the model with the last group of flows and labels. Total flows: {sum_labeled_flows}." - ) - # Process all flows in the DB and make them ready for pandas - self.process_flows() - # Train an algorithm - self.train() + if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain): + # So for example we retrain every 50 labels and only when + # we have at least 50 labels + self.print( + f"Training the model with the last group of " + f"flows and labels. Total flows: {sum_labeled_flows}." + ) + # Process all flows in the DB and make them ready + # for pandas + self.process_training_flows() + # Train an algorithm + self.train() + self.last_number_of_flows_when_trained = sum_labeled_flows + elif self.mode == "test": # We are testing, which means using the model to detect - self.process_flow() + processed_flow = self.process_flow(self.flow) - # After processing the flow, it may happen that we delete icmp/arp/etc - # so the dataframe can be empty - if self.flow is not None and not self.flow.empty: + # After processing the flow, it may happen that we + # delete icmp/arp/etc so the dataframe can be empty + if processed_flow is not None and not processed_flow.empty: # Predict - pred = self.detect() - label = self.flow_dict["label"] + pred: numpy.ndarray = self.detect(processed_flow) + if not pred: + # an error occurred + return - # Report + label = self.flow["label"] if label and label != "unknown" and label != pred[0]: - # If the user specified a label in test mode, and the label - # is diff from the prediction, print in debug mode + # If the user specified a label in test mode, + # and the label is diff from the prediction, + # print in debug mode self.print( - f'Report Prediction {pred[0]} for label {label} flow {self.flow_dict["saddr"]}:' - f'{self.flow_dict["sport"]} -> {self.flow_dict["daddr"]}:' - f'{self.flow_dict["dport"]}/{self.flow_dict["proto"]}', + f"Predicted {pred[0]} for ground-truth label" + f' {label}. Flow {self.flow["saddr"]}:' + f'{self.flow["sport"]} ->' + f' {self.flow["daddr"]}:' + f'{self.flow["dport"]}/' + f'{self.flow["proto"]}', 0, 3, ) - if pred[0] == "Malware": + if pred[0] == "Malicious": # Generate an alert - self.set_evidence_malicious_flow( - self.flow_dict["saddr"], - self.flow_dict["sport"], - self.flow_dict["daddr"], - self.flow_dict["dport"], - twid, - uid, - ) + self.set_evidence_malicious_flow(self.flow, self.twid) self.print( - f'Prediction {pred[0]} for label {label} flow {self.flow_dict["saddr"]}:' - f'{self.flow_dict["sport"]} -> {self.flow_dict["daddr"]}:' - f'{self.flow_dict["dport"]}/{self.flow_dict["proto"]}', + f"Prediction {pred[0]} for label {label}" + f' flow {self.flow["saddr"]}:' + f'{self.flow["sport"]} -> ' + f'{self.flow["daddr"]}:' + f'{self.flow["dport"]}/' + f'{self.flow["proto"]}', 0, 2, ) From fe91a3c6a427b86f3957864dcdea67a52b7a861d Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:22:38 +0100 Subject: [PATCH 176/455] flowml. If the dataset has one flow and that is deleted, then return empty fast. --- modules/flowmldetection/flowmldetection.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index e6ea0b5171..0fa1e4d767 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -135,6 +135,11 @@ def process_features(self, dataset): for proto in to_discard: dataset = dataset[dataset.proto != proto] + # If te proto is in the list to delete and there is only one flow, then the dataset will be empty + if dataset.empty: + # DataFrame is empty now, so return empty + return dataset + # For now, discard these to_drop = [ "appproto", From 31d8b921d59719a665de7b0195eeac37e2ad7d81 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:23:05 +0100 Subject: [PATCH 177/455] flowml. If the datasert is empty. Return none --- modules/flowmldetection/flowmldetection.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 0fa1e4d767..5c5f9943f1 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -318,6 +318,8 @@ def process_flow(self, flow_to_process: dict): # Convert the flow to a pandas dataframe raw_flow = pd.DataFrame(flow_to_process, index=[0]) dflow = self.process_features(raw_flow) + if dflow.empty: + return None # Update the flow to the processed version return dflow except Exception: From 689b570abe330277d9af665e0d99b6ae2354d384 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:27:16 +0100 Subject: [PATCH 178/455] First new version of the model and scaler. Not good yet, but working. --- modules/flowmldetection/model.bin | Bin 1124 -> 1090 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index a6648cf72179520975b0e9ad1164f7d574e87140..7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f 100644 GIT binary patch delta 130 zcmV-|0Db@D2*L;ifCQB{u>>gtTsw(FScJ$7xocP4|5 zn$$WZz^$}67u&zYKfDP4vYKo~KL^fCQCUu>>gtlhXnvSy-jcOc_je6TteGguV+y7r>O2=ii6N`@aE8 z4q_05T(v)g91VHfmFeIMvRKFpJJ~89v lBES;fQwQSX>_3x<11kt29Z*;7CnUg=yaQDN?(vfo1TSe6JFWl# diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin index 17115724b9536f6093f9d72f3b58a5c22c562a9a..bfba4d107224e5e6e5a1e8c8f4d463b48131d111 100644 GIT binary patch delta 290 zcmV+-0p0%k2KolDvjGBX0h6@>Pgl=^Tj5WVAwXGOhP~3C<3D)c8d<%4*gy{6w?@|U z%s@r+gj&Fu5J2_kY{!Qn^*`?T5Tx7$dq73=gj&E@c0d))6KVlt!ao`IN`KC-h(JOu zU`*oR6_Z;6CRfo>BRz3ectB?@J_D&+&OhD+nN>A{w?PryCGcfmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r-wJEWTtnm`Ac zf^|*Gc0elJHxDoXo Date: Thu, 20 Mar 2025 13:16:06 +0100 Subject: [PATCH 179/455] model and scaler with 1 malicious and 1 benign --- modules/flowmldetection/model.bin | Bin 1090 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f..0fac693b39f8e2f0e826471e72a52010709a2a4a 100644 GIT binary patch delta 132 zcmX@a@q~k=fn{psMix!x$(NZ_BO`RCb>5jSYK$N!g2|oC+8oRs(l>gtTsw(FScJ$7xocP4|5 zn$$WZz^$}67u&zYKfDP4vYKo~KfmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r!tJEWTtnm`Ac zf^|*Gc0elJHxDoXoPgl=^Tj5WVAwXGOhP~3C<3D)c8d<%4*gy{6w?@|U z%s@r+gj&Fu5J2_kY{!Qn^*`?T5Tx7$dq73=gj&E@c0d))6KVlt!ao`IN`KC-h(JOu zU`*oR6_Z;6CRfo>BRz3ectB?@J_D&+&OhD+nN>A{w?PryCGc Date: Thu, 20 Mar 2025 13:16:27 +0100 Subject: [PATCH 180/455] cleaner jupyter --- modules/flowmldetection/flowmldetection.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 5c5f9943f1..fe950ed4bb 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -343,6 +343,23 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "endtime", "flow_source", ] + # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. + # Error + ''' [Flow ML Detection] Error in detect() while processing + dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes + 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 + The feature names should match those that were passed during fit. + Feature names unseen at fit time: + - bytes + ''' + + # IF we delete here the filed bytes the error is + # [Flow ML Detection] Error in detect() while processing + # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes + # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 + # The feature names should match those that were passed during fit. + # Feature names must be in the same order as they were in fit. + for field in fields_to_drop: try: x_flow = x_flow.drop(field, axis=1) From fba965a9409ff15bbb4ed677fe658f85c1b1b02a Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Thu, 20 Mar 2025 22:26:27 +0100 Subject: [PATCH 181/455] New models after 3rd ttrain --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 0fac693b39f8e2f0e826471e72a52010709a2a4a..5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8 100644 GIT binary patch delta 99 zcmaFD@q}YTFtfkevYaFSxkd-_3;rCozh!)2lYmpEbEdGvA?N`Qv**dmWkY#6EnbNU>V$pfHc5?(6z~90v14#>uEKZXcoY2J@~Q; zm@vT916v-#h9y9THMMtpw*Eia3Y9($*ABq#7l@Rev)(_(4lO-NtuB+I1CayN#ekDG F1TN2UERg^J diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin index 758909b289238ff282b2e056a9b3e83768b8472a..821344a0c69d116622b02e2a0daa1554cb5d308e 100644 GIT binary patch delta 43 zcmV+`0M!5b2KolDfdU!c4zU>a=p#W80?-ZtN@qVmt(3Za=p#W80?-ZtN@qVmt(3Z Date: Wed, 26 Mar 2025 00:08:50 +0100 Subject: [PATCH 182/455] Models after 4th train --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8..3ab5a240bb45f88d026d1d9d1959cfa384e2473b 100644 GIT binary patch delta 120 zcmV-;0EhqN2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv* zMA;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P as{<PghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9 ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}DxjML6cVlBO2|D9RL6T delta 290 zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRg4Lu^9H~BS8=X&<+7gXFor!l)9lJXF)RTPqWf%b3r!tJEWTtnm`Ac zf^|*Gc0elJHxDoXoty4FCWD From 844a04314f76516c8ec2afaf8c3cc040955c62a2 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 26 Mar 2025 08:28:59 +0100 Subject: [PATCH 183/455] Models of ml flow with the first good performance in small tests --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 3ab5a240bb45f88d026d1d9d1959cfa384e2473b..a6648cf72179520975b0e9ad1164f7d574e87140 100644 GIT binary patch delta 121 zcmV-<0EYkM2;>N`Qv+C~&P*9hb`!w*mV~|wLl?l5mFM4w$NRqlOAcXmpyfaAG1(); zYA3+ulRv$a;~zkIU#E>ocI-ba#L|>%Hv~ZN`Y?b6^limYW1McwvlQsk{8#y@u delta 121 zcmV-<0EYkM2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv* zMA;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P bs{<fmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r-wJEWTtnm`Ac zf^|*Gc0elJHxDoXoPghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9 ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}DxjML6cVlBO2|D9RL6T From 70c222ea6b8661e903dfc4ae93855d8ee2614ca5 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:49:23 +0000 Subject: [PATCH 184/455] Add plot for flowml train scores --- modules/flowmldetection/plot_train_score.py | 56 +++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 modules/flowmldetection/plot_train_score.py diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py new file mode 100644 index 0000000000..0b5b5b72ba --- /dev/null +++ b/modules/flowmldetection/plot_train_score.py @@ -0,0 +1,56 @@ +import pandas as pd +import matplotlib.pyplot as plt +import re +import sys + +def plot_log_data(file_path): + # Read the log data from the file + with open(file_path, 'r') as file: + log_data = file.read() + + # Define regex pattern to extract relevant data from each line + pattern = r"Background: (\d+). Benign: (\d+). Malicious: (\d+). Total labels: (\d+\.\d+). Score: (\d+\.\d+)" + + # Parse the log file + data = re.findall(pattern, log_data) + + # Convert data to a DataFrame + df = pd.DataFrame(data, columns=["Background", "Benign", "Malicious", "Total labels", "Score"]) + df = df.astype({ + "Background": int, + "Benign": int, + "Malicious": int, + "Total labels": float, + "Score": float + }) + + # Plotting the values + fig, ax1 = plt.subplots(figsize=(10, 6)) + + # Plotting Score on the left y-axis + ax1.plot(df.index, df["Score"], label="Score", color='tab:blue') + ax1.set_xlabel('Index') + ax1.set_ylabel('Score', color='tab:blue') + ax1.tick_params(axis='y', labelcolor='tab:blue') + + # Create the second y-axis for the Total labels + ax2 = ax1.twinx() + ax2.plot(df.index, df["Total labels"], label="Total labels", color='tab:red') + ax2.set_ylabel('Total labels', color='tab:red') + ax2.tick_params(axis='y', labelcolor='tab:red') + + # Adding title and legend + plt.title('Log Data Visualization') + fig.tight_layout() + + # Save plot to a PNG file + plt.savefig('log_data_plot_with_two_scales.png') + + # Display the plot + plt.show() + +# Make sure the file path is passed as an argument +if len(sys.argv) < 2: + print("Please provide the path to the log file as a parameter.") +else: + plot_log_data(sys.argv[1]) From a721639f4e90e0db5e9464b7fda27454e305ab5f Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:50:04 +0000 Subject: [PATCH 185/455] Add a log file to store the training data output --- modules/flowmldetection/flowmldetection.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index fe950ed4bb..60217ada28 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -68,12 +68,29 @@ def init(self): self.model_path = "./modules/flowmldetection/model.bin" self.scaler_path = "./modules/flowmldetection/scaler.bin" + # Initialize the training log file + self.training_log_path = "./modules/flowmldetection/training.log" + with open(self.training_log_path, "w") as log_file: + log_file.write("Training Log Initialized\n") + def read_configuration(self): conf = ConfigParser() self.mode = conf.get_ml_mode() + # This is the global label in the configuration, + # in case the flows do not have a label themselves self.label = conf.label() - def train(self): + def write_to_training_log(self, message: str): + """ + Write a message to the training log file. + """ + try: + with open(self.training_log_path, "a") as log_file: + log_file.write(message + "\n") + except Exception as e: + self.print(f"Error writing to training log: {e}", 0, 1) + + def train(self, sum_labeled_flows): """ Train a model based on the flows we receive and the labels """ From 2d65486fa55caae847d9cfb709e8aedf57b2b7d6 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:50:32 +0000 Subject: [PATCH 186/455] Store data in the log file of training --- modules/flowmldetection/flowmldetection.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 60217ada28..6f732da636 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -137,9 +137,13 @@ def train(self, sum_labeled_flows): # Store the models on disk self.store_model() + # Log training information + self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}") + #self.write_to_training_log(f"Model parameters: {self.clf.coef_}") except Exception: self.print("Error in train().", 0, 1) self.print(traceback.format_exc(), 0, 1) + self.write_to_training_log("Error occurred during training.") def process_features(self, dataset): """ From b0324a55a34f5e2f5780bfb755863fbe6662dcc7 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:50:53 +0000 Subject: [PATCH 187/455] better comments --- modules/flowmldetection/flowmldetection.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 6f732da636..ed3aecf1b0 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -59,10 +59,9 @@ def init(self): self.minimum_labels_to_start_train = 50 # Minum amount of new labels needed to retrain self.minimum_labels_to_retrain = 50 - # The number of flows when last trained + # The number of flows when last trained. Used internally only to know + # when to retrain self.last_number_of_flows_when_trained = 0 - # To plot the scores of training - # self.scores = [] # The scaler trained during training and to use during testing self.scaler = StandardScaler() self.model_path = "./modules/flowmldetection/model.bin" From 1e91a10fa051a06cb27ebf5e9e0c505fe4210f32 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:51:30 +0000 Subject: [PATCH 188/455] Fix issue not dropping detailed labels --- modules/flowmldetection/flowmldetection.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index ed3aecf1b0..25b30cf515 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -94,23 +94,19 @@ def train(self, sum_labeled_flows): Train a model based on the flows we receive and the labels """ try: - # Get the flows from the DB - # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid) - # Convert to pandas df - # self.flows = pd.DataFrame(self.flows) - # Process the features - # X_flow = self.process_features(self.flows) - # Create X_flow with the current flows minus the label - X_flow = self.flows.drop("label", axis=1) - # Create y_flow with the label - y_flow = numpy.full(X_flow.shape[0], self.label) + X_flow = self.flows.drop("ground_truth_label", axis=1) + # Drop the detailed labels + X_flow = X_flow.drop("detailed_ground_truth_label", axis=1) # Drop the module_labels X_flow = X_flow.drop("module_labels", axis=1) + # Create y_flow with the label + y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label) # Normalize this batch of data so far. This can get progressivle slow X_flow = self.scaler.fit_transform(X_flow) + # Train try: self.clf.partial_fit( From d97a4ddb3e8af4bee1cbe98d980e55fe5b8f8139 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:51:53 +0000 Subject: [PATCH 189/455] Fix issue that not all labels sere given to the partial fit --- modules/flowmldetection/flowmldetection.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 25b30cf515..b2d0db5e51 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -109,8 +109,9 @@ def train(self, sum_labeled_flows): # Train try: + # Online incremental learning self.clf.partial_fit( - X_flow, y_flow, classes=["Malicious", "Benign"] + X_flow, y_flow, classes=["Background", "Malicious", "Benign"] ) except Exception: self.print("Error while calling clf.train()") From 10560192bfae39975002f518114f03ad2d56ed83 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:52:08 +0000 Subject: [PATCH 190/455] count partial labels in this epoch --- modules/flowmldetection/flowmldetection.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index b2d0db5e51..1146091a92 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -106,6 +106,12 @@ def train(self, sum_labeled_flows): # Normalize this batch of data so far. This can get progressivle slow X_flow = self.scaler.fit_transform(X_flow) + # Count the number of labels of each type in this epoc + epoch_label_counts = { + "Background": (y_flow == "Background").sum(), + "Malicious": (y_flow == "Malicious").sum(), + "Benign": (y_flow == "Benign").sum(), + } # Train try: From 2a61b4608e234655f284cac29951f33c756bc7f9 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:55:09 +0000 Subject: [PATCH 191/455] Dont print training in screen --- modules/flowmldetection/flowmldetection.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 1146091a92..4bb2ad7dbf 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -126,15 +126,8 @@ def train(self, sum_labeled_flows): # See score so far in training score = self.clf.score(X_flow, y_flow) - # To debug the training score - # self.scores.append(score) - - self.print(f" Training Score: {score}", 0, 1) - # self.print(f' Model Parameters: {self.clf.coef_}') - - # Debug code to store a plot in a png of the scores - # plt.plot(self.scores) - # plt.savefig('train-scores.png') + #self.print(f" Training Score: {score}", 1, 0) + #self.print(f' Model Parameters: {self.clf.coef_}', 1, 0) # Store the models on disk self.store_model() From eef7992b26c5e8ff0db0ec8c14ce9bd3064f7fd6 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:55:28 +0000 Subject: [PATCH 192/455] Add function to write to train log --- modules/flowmldetection/flowmldetection.py | 28 +++++++++++----------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 4bb2ad7dbf..d4b2762f5f 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -247,28 +247,28 @@ def process_features(self, dataset): self.print("Error in process_features()") self.print(traceback.format_exc(), 0, 1) - def process_training_flows(self): + def process_training_flows(self, last_number_of_flows_when_trained): """ - Process all the flows in the DB + Process only the new flows in the DB since the last training. Store the pandas df in self.flows """ try: + # Ensure the index is an integer + if last_number_of_flows_when_trained is None: + last_number_of_flows_when_trained = 0 + else: + last_number_of_flows_when_trained = int(last_number_of_flows_when_trained) + # We get all the flows so far - # because this retraining happens in batches flows = self.db.get_all_flows() - # Check how many different labels are in the DB - # We need both normal and malware + # Only process new flows since last training + new_flows = flows[last_number_of_flows_when_trained:] + + # Check how many **different** labels are in the DB labels = self.db.get_labels() if len(labels) == 1: - # Only 1 label has flows - # There are not enough different labels, so insert two flows - # that are fake but representative of a normal and malware flow - # they are only for the training process - # At least 1 flow of each label is required - - # These flows should be in the same format as the ones in the DB. - # Which means the satate is still SF, S0, etc. - flows.append( + # Insert fake flows for both classes if needed + new_flows.append( { "starttime": 1594417039.029793, "dur": "1.9424750804901123", From b253aecbdf6797bee21511fc6faa84f0dcf6dd08 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:57:27 +0000 Subject: [PATCH 193/455] Fix label in dummy flow --- modules/flowmldetection/flowmldetection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index d4b2762f5f..6a44422cc2 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -283,13 +283,13 @@ def process_training_flows(self, last_number_of_flows_when_trained): "sbytes": 25517, "dbytes": 17247, "appproto": "ssl", - "label": "Malicious", + "ground_truth_label": "Malicious", "module_labels": { "flowalerts-long-connection": "Malicious" }, } ) - flows.append( + new_flows.append( { "starttime": 1382355032.706468, "dur": "10.896695", From 8b5dccc0afc99f5a2bd1c6175d034b890135178d Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:57:39 +0000 Subject: [PATCH 194/455] Fix dummy flow --- modules/flowmldetection/flowmldetection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 6a44422cc2..20f1f8ca89 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -304,7 +304,7 @@ def process_training_flows(self, last_number_of_flows_when_trained): "sbytes": 100, "dbytes": 67596, "appproto": "http", - "label": "Benign", + "ground_truth_label": "Benign", "module_labels": { "flowalerts-long-connection": "Benign" }, From 11fb0096098f3ac57267593712f8b545b1ca84a2 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:58:28 +0000 Subject: [PATCH 195/455] Rename variable --- modules/flowmldetection/flowmldetection.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 20f1f8ca89..59064d61a5 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -310,10 +310,9 @@ def process_training_flows(self, last_number_of_flows_when_trained): }, } ) - # If there are enough flows, we dont insert them anymore # Convert to pandas df - df_flows = pd.DataFrame(flows) + df_flows = pd.DataFrame(new_flows) # Process features df_flows = self.process_features(df_flows) @@ -321,7 +320,6 @@ def process_training_flows(self, last_number_of_flows_when_trained): # Update the flow to the processed version self.flows = df_flows except Exception: - # Stop the timer self.print("Error in process_flows()") self.print(traceback.format_exc(), 0, 1) From 1acb03086bc424093508484dfa70176c696f8777 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:00:32 +0000 Subject: [PATCH 196/455] Fix dummy flow label --- modules/flowmldetection/flowmldetection.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 59064d61a5..6b41b40298 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -356,6 +356,8 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "dir_", "endtime", "flow_source", + "ground_truth_label", + "detailed_ground_truth_label", ] # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. # Error From 5f61978998876e7e30511a2e7a378bf914ec022a Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:00:47 +0000 Subject: [PATCH 197/455] Pass values to train function --- modules/flowmldetection/flowmldetection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 6b41b40298..4d66aab855 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -521,9 +521,9 @@ def main(self): ) # Process all flows in the DB and make them ready # for pandas - self.process_training_flows() + self.process_training_flows(self.last_number_of_flows_when_trained) # Train an algorithm - self.train() + self.train(sum_labeled_flows) self.last_number_of_flows_when_trained = sum_labeled_flows elif self.mode == "test": From 4a486284e59952de7c793ee55cd2e627fd7f2830 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:01:47 +0000 Subject: [PATCH 198/455] import os --- modules/flowmldetection/flowmldetection.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 4d66aab855..766178e127 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -10,6 +10,7 @@ import json import traceback import warnings +import os from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils From 19b5bdde44678c80365f8c6aeda8b9d3b67f7a6f Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:02:15 +0000 Subject: [PATCH 199/455] Get issue of total flows zero --- slips_files/core/database/database_manager.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index e8ca3aaf62..892b923b4a 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -661,7 +661,8 @@ def add_software_to_profile(self, *args, **kwargs): return self.rdb.add_software_to_profile(*args, **kwargs) def get_total_flows(self, *args, **kwargs): - return int(self.rdb.get_total_flows(*args, **kwargs)) + total_flows = self.rdb.get_total_flows(*args, **kwargs) + return int(total_flows) if total_flows is not None else 0 def increment_processed_flows(self, *args, **kwargs): return self.rdb.increment_processed_flows(*args, **kwargs) From cf87d4260a971d8e81d1474b0d0968dba12e68b7 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:02:32 +0000 Subject: [PATCH 200/455] Add comments --- slips_files/core/database/database_manager.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index 892b923b4a..6dd1d9952e 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -879,7 +879,10 @@ def get_flow(self, *args, **kwargs): """returns the raw flow as read from the log file""" return self.sqlite.get_flow(*args, **kwargs) - def add_flow(self, flow, profileid: str, twid: str, label="benign"): + def add_flow(self, flow, profileid: str, twid: str, label="Benign"): + """ + Just in case, by default if there are no labels in the flow, we consider it Benign + """ # stores it in the db self.sqlite.add_flow(flow, profileid, twid, label=label) # handles the channels and labels etc. From 5a7c0ded0fcf0c46666839a155556f09409687cc Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:02:51 +0000 Subject: [PATCH 201/455] Rename var name to be more clear --- slips_files/core/profiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slips_files/core/profiler.py b/slips_files/core/profiler.py index 3c4d59db27..3dd478dcf2 100644 --- a/slips_files/core/profiler.py +++ b/slips_files/core/profiler.py @@ -119,7 +119,7 @@ def read_configuration(self): self.local_whitelist_path = conf.local_whitelist_path() self.timeformat = conf.ts_format() self.analysis_direction = conf.analysis_direction() - self.label = conf.label() + self.configuration_label = conf.label() self.width = conf.get_tw_width_as_float() self.client_ips: List[ Union[IPv4Network, IPv6Network, IPv4Address, IPv6Address] From 24e638bdba4dedacff0e2af93b701f3d1b75403e Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:03:10 +0000 Subject: [PATCH 202/455] Rename var name --- slips_files/core/profiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slips_files/core/profiler.py b/slips_files/core/profiler.py index 3dd478dcf2..429faae5c3 100644 --- a/slips_files/core/profiler.py +++ b/slips_files/core/profiler.py @@ -377,7 +377,7 @@ def store_features_going_in(self, profileid: str, twid: str, flow): flow, profileid=profileid, twid=twid, - label=self.label, + label=self.configuration_label, ) self.db.mark_profile_tw_as_modified(profileid, twid, "") From f872498d1f7848c293a1c71e03b21f35b0eba1d3 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:03:31 +0000 Subject: [PATCH 203/455] Fix processeed flows being zero --- slips/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slips/main.py b/slips/main.py index df49ffb97e..39e8b2a673 100644 --- a/slips/main.py +++ b/slips/main.py @@ -414,7 +414,7 @@ def get_analyzed_flows_percentage(self) -> str: self.total_flows = self.db.get_total_flows() flows_percentage = int( - (self.db.get_processed_flows_so_far() / self.total_flows) * 100 + (self.db.get_processed_flows_so_far() / self.total_flows) * 100 if self.total_flows != 0 else 0 ) return f"Analyzed Flows: {green(flows_percentage)}{green('%')}. " From 19c3116d79ae35e0138b623dd05d0994dcabd679 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:03:53 +0000 Subject: [PATCH 204/455] Delete old comments --- modules/flowmldetection/flowmldetection.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 766178e127..6c3bfc1275 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -28,10 +28,6 @@ Method, ) -# Only for debbuging -# from matplotlib import pyplot as plt - - # This horrible hack is only to stop sklearn from printing those warnings def warn(*args, **kwargs): pass From 0d6d1da5f8494e912ceb600fcc14c93c7dd36204 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:13:22 +0000 Subject: [PATCH 205/455] Fix plots --- modules/flowmldetection/plot_train_score.py | 48 ++++++++++++++++----- 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py index 0b5b5b72ba..359df04eff 100644 --- a/modules/flowmldetection/plot_train_score.py +++ b/modules/flowmldetection/plot_train_score.py @@ -2,6 +2,8 @@ import matplotlib.pyplot as plt import re import sys +import argparse +import os def plot_log_data(file_path): # Read the log data from the file @@ -24,33 +26,59 @@ def plot_log_data(file_path): "Score": float }) + # Get the directory of the log file to store the plot in the same folder + dir_name = os.path.dirname(file_path) + plot_file = os.path.join(dir_name, 'log_data_plot_with_two_scales.png') + # Plotting the values fig, ax1 = plt.subplots(figsize=(10, 6)) - # Plotting Score on the left y-axis + # Plotting Score on the left y-axis (with proper scaling from 0 to 1) ax1.plot(df.index, df["Score"], label="Score", color='tab:blue') ax1.set_xlabel('Index') ax1.set_ylabel('Score', color='tab:blue') + ax1.set_ylim(0, 1) # Set y-axis for Score from 0 to 1 ax1.tick_params(axis='y', labelcolor='tab:blue') - # Create the second y-axis for the Total labels + # Create the second y-axis for the Background, Benign, Malicious, Total labels ax2 = ax1.twinx() + ax2.plot(df.index, df["Background"], label="Background", color='tab:green', linestyle='--') + ax2.plot(df.index, df["Benign"], label="Benign", color='tab:orange', linestyle='--') + ax2.plot(df.index, df["Malicious"], label="Malicious", color='tab:pink', linestyle='--') ax2.plot(df.index, df["Total labels"], label="Total labels", color='tab:red') - ax2.set_ylabel('Total labels', color='tab:red') + ax2.set_ylabel('Background, Benign, Malicious, Total labels', color='tab:red') + + # Set appropriate scale for right y-axis based on the data + ax2.set_ylim(0, df[["Background", "Benign", "Malicious", "Total labels"]].max().max()) ax2.tick_params(axis='y', labelcolor='tab:red') # Adding title and legend plt.title('Log Data Visualization') fig.tight_layout() - # Save plot to a PNG file - plt.savefig('log_data_plot_with_two_scales.png') + # Adding the legend with increased space for readability + ax1.legend(loc='upper left', bbox_to_anchor=(1, 1), fontsize='small') + ax2.legend(loc='upper left', bbox_to_anchor=(1, 0.7), fontsize='small') + + # Increase right margin for better readability of legend + plt.subplots_adjust(right=0.75) + + # Save plot to the same folder as the log file + plt.savefig(plot_file) # Display the plot plt.show() -# Make sure the file path is passed as an argument -if len(sys.argv) < 2: - print("Please provide the path to the log file as a parameter.") -else: - plot_log_data(sys.argv[1]) +def main(): + # Parse command-line arguments + parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.") + parser.add_argument('log_file', metavar='log_file', type=str, help="Path to the log file") + + # Handle -h / --help + args = parser.parse_args() + + # Call the function to process the log file + plot_log_data(args.log_file) + +if __name__ == "__main__": + main() From da5d1875a5f4ce9ec016e5cfa8f41e31ed5862b5 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:14:58 +0000 Subject: [PATCH 206/455] Fix plot --- modules/flowmldetection/plot_train_score.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py index 359df04eff..c7f374a7fe 100644 --- a/modules/flowmldetection/plot_train_score.py +++ b/modules/flowmldetection/plot_train_score.py @@ -40,18 +40,21 @@ def plot_log_data(file_path): ax1.set_ylim(0, 1) # Set y-axis for Score from 0 to 1 ax1.tick_params(axis='y', labelcolor='tab:blue') - # Create the second y-axis for the Background, Benign, Malicious, Total labels + # Create the second y-axis for the Background, Benign, Malicious ax2 = ax1.twinx() ax2.plot(df.index, df["Background"], label="Background", color='tab:green', linestyle='--') ax2.plot(df.index, df["Benign"], label="Benign", color='tab:orange', linestyle='--') ax2.plot(df.index, df["Malicious"], label="Malicious", color='tab:pink', linestyle='--') - ax2.plot(df.index, df["Total labels"], label="Total labels", color='tab:red') - ax2.set_ylabel('Background, Benign, Malicious, Total labels', color='tab:red') + ax2.set_ylabel('Background, Benign, Malicious', color='tab:red') # Set appropriate scale for right y-axis based on the data - ax2.set_ylim(0, df[["Background", "Benign", "Malicious", "Total labels"]].max().max()) + ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max()) ax2.tick_params(axis='y', labelcolor='tab:red') + # Annotating Total labels as text on the plot + for i, value in enumerate(df["Total labels"]): + ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom') + # Adding title and legend plt.title('Log Data Visualization') fig.tight_layout() From 0f3d1f5b26d0a8c25cfdfc9b758e249fa48fface Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:16:23 +0000 Subject: [PATCH 207/455] Fix plot --- modules/flowmldetection/plot_train_score.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py index c7f374a7fe..4099c47c1e 100644 --- a/modules/flowmldetection/plot_train_score.py +++ b/modules/flowmldetection/plot_train_score.py @@ -42,10 +42,10 @@ def plot_log_data(file_path): # Create the second y-axis for the Background, Benign, Malicious ax2 = ax1.twinx() - ax2.plot(df.index, df["Background"], label="Background", color='tab:green', linestyle='--') - ax2.plot(df.index, df["Benign"], label="Benign", color='tab:orange', linestyle='--') - ax2.plot(df.index, df["Malicious"], label="Malicious", color='tab:pink', linestyle='--') - ax2.set_ylabel('Background, Benign, Malicious', color='tab:red') + ax2.plot(df.index, df["Background"], label="Background Labels", color='tab:green', linestyle='--') + ax2.plot(df.index, df["Benign"], label="Benign Labels", color='tab:orange', linestyle='--') + ax2.plot(df.index, df["Malicious"], label="Malicious Labels", color='tab:pink', linestyle='--') + ax2.set_ylabel('Background, Benign, Malicious Labels', color='tab:red') # Set appropriate scale for right y-axis based on the data ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max()) @@ -56,7 +56,7 @@ def plot_log_data(file_path): ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom') # Adding title and legend - plt.title('Log Data Visualization') + plt.title('Training performance') fig.tight_layout() # Adding the legend with increased space for readability From b000f176f8278d4fa86a2f4fb2d994da9813aaca Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:24:43 +0000 Subject: [PATCH 208/455] Fix plot --- modules/flowmldetection/plot_train_score.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py index 4099c47c1e..8437e968ac 100644 --- a/modules/flowmldetection/plot_train_score.py +++ b/modules/flowmldetection/plot_train_score.py @@ -59,12 +59,12 @@ def plot_log_data(file_path): plt.title('Training performance') fig.tight_layout() - # Adding the legend with increased space for readability - ax1.legend(loc='upper left', bbox_to_anchor=(1, 1), fontsize='small') - ax2.legend(loc='upper left', bbox_to_anchor=(1, 0.7), fontsize='small') + # Move both legends further to the right + ax1.legend(loc='upper right', bbox_to_anchor=(1.26, 1), fontsize='small', ncol=1) + ax2.legend(loc='upper right', bbox_to_anchor=(1.4, 0.95), fontsize='small', ncol=1) # Increase right margin for better readability of legend - plt.subplots_adjust(right=0.75) + plt.subplots_adjust(right=0.7) # Save plot to the same folder as the log file plt.savefig(plot_file) From bd1f21b2101ae36b11bc5e3a866de745a8c3e2e8 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:02:34 +0000 Subject: [PATCH 209/455] Plot testing performance from a log --- .../plot_testing_performance.py | 89 +++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 modules/flowmldetection/plot_testing_performance.py diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py new file mode 100644 index 0000000000..a38c7f0598 --- /dev/null +++ b/modules/flowmldetection/plot_testing_performance.py @@ -0,0 +1,89 @@ +import matplotlib.pyplot as plt +import sys +import numpy as np + +def process_file(file_path): + # Initialize the counters for the values + FPR_values = [] + FNR_values = [] + TNR_values = [] + TPR_values = [] + F1_values = [] + accuracy_values = [] + precision_values = [] + MCC_values = [] + recall_values = [] + + # Read the file and extract the data + with open(file_path, 'r') as file: + for line in file: + if "TP:" in line: + # Extract the values from the line + parts = line.split(',') + TP = int(parts[0].split(':')[1].strip()) + TN = int(parts[1].split(':')[1].strip()) + FP = int(parts[2].split(':')[1].strip()) + FN = int(parts[3].split(':')[1].strip()) + + # Calculate metrics + FPR = FP / (FP + TN) if (FP + TN) != 0 else 0 + FNR = FN / (FN + TP) if (FN + TP) != 0 else 0 + TNR = TN / (TN + FP) if (TN + FP) != 0 else 0 + TPR = TP / (TP + FN) if (TP + FN) != 0 else 0 + Precision = TP / (TP + FP) if (TP + FP) != 0 else 0 + Recall = TPR # Recall is the same as TPR + F1 = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) != 0 else 0 + Accuracy = (TP + TN) / (TP + TN + FP + FN) + MCC = ((TP * TN) - (FP * FN)) / np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) if ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) != 0 else 0 + + # Append the values to the respective lists + FPR_values.append(FPR) + FNR_values.append(FNR) + TNR_values.append(TNR) + TPR_values.append(TPR) + F1_values.append(F1) + accuracy_values.append(Accuracy) + precision_values.append(Precision) + MCC_values.append(MCC) + recall_values.append(Recall) + + return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values + +def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values): + # Create the plot + plt.figure(figsize=(12, 8)) + + # Plot each metric + plt.plot(FPR_values, label='False Positive Rate (FPR)', marker='o') + plt.plot(FNR_values, label='False Negative Rate (FNR)', marker='o') + plt.plot(TNR_values, label='True Negative Rate (TNR)', marker='o') + plt.plot(TPR_values, label='True Positive Rate (TPR)', marker='o') + plt.plot(F1_values, label='F1 Score', marker='o') + plt.plot(accuracy_values, label='Accuracy', marker='o') + plt.plot(precision_values, label='Precision', marker='o') + plt.plot(MCC_values, label='Matthews Correlation Coefficient (MCC)', marker='o') + plt.plot(recall_values, label='Recall (TPR)', marker='o') + + # Add labels and title + plt.xlabel('Index') + plt.ylabel('Metric Value') + plt.title('Evaluation Metrics Over Time') + + # Add a legend + plt.legend() + + # Save the plot as a PNG file + plt.savefig('metrics_plot.png') + plt.close() + +def main(): + if len(sys.argv) != 2: + print("Usage: python script.py ") + sys.exit(1) + + file_path = sys.argv[1] + FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path) + plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values) + +if __name__ == "__main__": + main() From fd21630441d02796cd0aae52b5e13492a2d731d0 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:04:32 +0000 Subject: [PATCH 210/455] Fix the plot --- modules/flowmldetection/plot_testing_performance.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index a38c7f0598..fac0acd64a 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -64,16 +64,19 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu plt.plot(MCC_values, label='Matthews Correlation Coefficient (MCC)', marker='o') plt.plot(recall_values, label='Recall (TPR)', marker='o') + # Set logarithmic scale on the y-axis + plt.yscale('log') + # Add labels and title plt.xlabel('Index') - plt.ylabel('Metric Value') - plt.title('Evaluation Metrics Over Time') + plt.ylabel('Metric Value (Log Scale)') + plt.title('Evaluation Metrics Over Time (Log Scale)') # Add a legend plt.legend() # Save the plot as a PNG file - plt.savefig('metrics_plot.png') + plt.savefig('metrics_plot_log_scale.png') plt.close() def main(): From ee0deaf2a3229c26a5c734a314878b9b0a393c01 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:12:40 +0000 Subject: [PATCH 211/455] Fix the plots --- .../plot_testing_performance.py | 76 ++++++++++++++----- 1 file changed, 55 insertions(+), 21 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index fac0acd64a..5581c72cd4 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -50,33 +50,66 @@ def process_file(file_path): return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values): - # Create the plot - plt.figure(figsize=(12, 8)) + # Separate the values into two groups based on their proximity to 0 or 1 + close_to_0 = { + 'FPR': [], 'FNR': [] + } + close_to_1 = { + 'TNR': [], 'TPR': [], 'F1': [], 'accuracy': [], 'precision': [], 'MCC': [], 'recall': [] + } - # Plot each metric - plt.plot(FPR_values, label='False Positive Rate (FPR)', marker='o') - plt.plot(FNR_values, label='False Negative Rate (FNR)', marker='o') - plt.plot(TNR_values, label='True Negative Rate (TNR)', marker='o') - plt.plot(TPR_values, label='True Positive Rate (TPR)', marker='o') - plt.plot(F1_values, label='F1 Score', marker='o') - plt.plot(accuracy_values, label='Accuracy', marker='o') - plt.plot(precision_values, label='Precision', marker='o') - plt.plot(MCC_values, label='Matthews Correlation Coefficient (MCC)', marker='o') - plt.plot(recall_values, label='Recall (TPR)', marker='o') + # Categorize the metrics into two groups + for i in range(len(FPR_values)): + close_to_0['FPR'].append(FPR_values[i]) + close_to_0['FNR'].append(FNR_values[i]) + + close_to_1['TNR'].append(TNR_values[i]) + close_to_1['TPR'].append(TPR_values[i]) + close_to_1['F1'].append(F1_values[i]) + close_to_1['accuracy'].append(accuracy_values[i]) + close_to_1['precision'].append(precision_values[i]) + close_to_1['MCC'].append(MCC_values[i]) + close_to_1['recall'].append(recall_values[i]) + + # Plot metrics for values close to 0 + plot_single_group(close_to_0, 'metrics_plot_close_to_0.png') - # Set logarithmic scale on the y-axis - plt.yscale('log') + # Plot metrics for values close to 1 + plot_single_group(close_to_1, 'metrics_plot_close_to_1.png') + +def plot_single_group(metrics_dict, output_filename): + plt.figure(figsize=(12, 8)) - # Add labels and title + # Only plot the metrics that exist in the dictionary + if 'FPR' in metrics_dict: + plt.plot(metrics_dict['FPR'], label='False Positive Rate (FPR)', marker='o') + if 'FNR' in metrics_dict: + plt.plot(metrics_dict['FNR'], label='False Negative Rate (FNR)', marker='o') + if 'TNR' in metrics_dict: + plt.plot(metrics_dict['TNR'], label='True Negative Rate (TNR)', marker='o') + if 'TPR' in metrics_dict: + plt.plot(metrics_dict['TPR'], label='True Positive Rate (TPR)', marker='o') + if 'F1' in metrics_dict: + plt.plot(metrics_dict['F1'], label='F1 Score', marker='o') + if 'accuracy' in metrics_dict: + plt.plot(metrics_dict['accuracy'], label='Accuracy', marker='o') + if 'precision' in metrics_dict: + plt.plot(metrics_dict['precision'], label='Precision', marker='o') + if 'MCC' in metrics_dict: + plt.plot(metrics_dict['MCC'], label='Matthews Correlation Coefficient (MCC)', marker='o') + if 'recall' in metrics_dict: + plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o') + + # Apply log scale by default + plt.yscale('log') + plt.xlabel('Index') - plt.ylabel('Metric Value (Log Scale)') - plt.title('Evaluation Metrics Over Time (Log Scale)') - - # Add a legend + plt.ylabel('Metric Value') + plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})') plt.legend() - # Save the plot as a PNG file - plt.savefig('metrics_plot_log_scale.png') + # Save the plot + plt.savefig(output_filename) plt.close() def main(): @@ -85,6 +118,7 @@ def main(): sys.exit(1) file_path = sys.argv[1] + FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path) plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values) From f9d8806d2c2035b3cb57e69a70b462cec05e5f57 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:16:50 +0000 Subject: [PATCH 212/455] Fix plot --- .../plot_testing_performance.py | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index 5581c72cd4..8f9e12cd86 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -72,12 +72,24 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu close_to_1['recall'].append(recall_values[i]) # Plot metrics for values close to 0 - plot_single_group(close_to_0, 'metrics_plot_close_to_0.png') + plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True) # Plot metrics for values close to 1 plot_single_group(close_to_1, 'metrics_plot_close_to_1.png') -def plot_single_group(metrics_dict, output_filename): + # Print the final values + print("\nFinal Metric Values:") + print(f"Final FPR: {FPR_values[-1]:.4f}") + print(f"Final FNR: {FNR_values[-1]:.4f}") + print(f"Final TNR: {TNR_values[-1]:.4f}") + print(f"Final TPR: {TPR_values[-1]:.4f}") + print(f"Final F1 Score: {F1_values[-1]:.4f}") + print(f"Final Accuracy: {accuracy_values[-1]:.4f}") + print(f"Final Precision: {precision_values[-1]:.4f}") + print(f"Final MCC: {MCC_values[-1]:.4f}") + print(f"Final Recall: {recall_values[-1]:.4f}") + +def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): plt.figure(figsize=(12, 8)) # Only plot the metrics that exist in the dictionary @@ -103,6 +115,12 @@ def plot_single_group(metrics_dict, output_filename): # Apply log scale by default plt.yscale('log') + # If the plot is close to 0, set custom ticks + if is_close_to_0: + # Manually set more Y-ticks for better visibility + plt.ylim(0.0001, 1) # Set Y-axis limits between 0.0001 and 1 + plt.yticks([0.0001, 0.001, 0.01, 0.1, 1], ['0.0001', '0.001', '0.01', '0.1', '1']) # Adjust Y-ticks + plt.xlabel('Index') plt.ylabel('Metric Value') plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})') From 15e37d2d67dc27f0aaabb5cb40dbc3fe397d64ec Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:20:22 +0000 Subject: [PATCH 213/455] Fix plots --- modules/flowmldetection/flowmldetection.py | 709 +++++---------------- 1 file changed, 143 insertions(+), 566 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 6c3bfc1275..37f0761109 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -1,566 +1,143 @@ -# SPDX-FileCopyrightText: 2021 Sebastian Garcia -from typing import Optional - -# SPDX-License-Identifier: GPL-2.0-only -import numpy -from sklearn.linear_model import SGDClassifier -from sklearn.preprocessing import StandardScaler -import pickle -import pandas as pd -import json -import traceback -import warnings -import os - -from slips_files.common.parsers.config_parser import ConfigParser -from slips_files.common.slips_utils import utils -from slips_files.common.abstracts.module import IModule -from slips_files.core.structures.evidence import ( - Evidence, - ProfileID, - TimeWindow, - Attacker, - ThreatLevel, - EvidenceType, - IoCType, - Direction, - Victim, - Method, -) - -# This horrible hack is only to stop sklearn from printing those warnings -def warn(*args, **kwargs): - pass - - -warnings.warn = warn - - -class FlowMLDetection(IModule): - # Name: short name of the module. Do not use spaces - name = "Flow ML Detection" - description = ( - "Train or test a Machine Learning model to detect malicious flows" - ) - authors = ["Sebastian Garcia"] - - def init(self): - # Subscribe to the channel - self.c1 = self.db.subscribe("new_flow") - self.channels = {"new_flow": self.c1} - self.fieldseparator = self.db.get_field_separator() - # Set the output queue of our database instance - # Read the configuration - self.read_configuration() - # Minum amount of new labels needed to start the train - self.minimum_labels_to_start_train = 50 - # Minum amount of new labels needed to retrain - self.minimum_labels_to_retrain = 50 - # The number of flows when last trained. Used internally only to know - # when to retrain - self.last_number_of_flows_when_trained = 0 - # The scaler trained during training and to use during testing - self.scaler = StandardScaler() - self.model_path = "./modules/flowmldetection/model.bin" - self.scaler_path = "./modules/flowmldetection/scaler.bin" - - # Initialize the training log file - self.training_log_path = "./modules/flowmldetection/training.log" - with open(self.training_log_path, "w") as log_file: - log_file.write("Training Log Initialized\n") - - def read_configuration(self): - conf = ConfigParser() - self.mode = conf.get_ml_mode() - # This is the global label in the configuration, - # in case the flows do not have a label themselves - self.label = conf.label() - - def write_to_training_log(self, message: str): - """ - Write a message to the training log file. - """ - try: - with open(self.training_log_path, "a") as log_file: - log_file.write(message + "\n") - except Exception as e: - self.print(f"Error writing to training log: {e}", 0, 1) - - def train(self, sum_labeled_flows): - """ - Train a model based on the flows we receive and the labels - """ - try: - # Create X_flow with the current flows minus the label - X_flow = self.flows.drop("ground_truth_label", axis=1) - # Drop the detailed labels - X_flow = X_flow.drop("detailed_ground_truth_label", axis=1) - # Drop the module_labels - X_flow = X_flow.drop("module_labels", axis=1) - # Create y_flow with the label - y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label) - - # Normalize this batch of data so far. This can get progressivle slow - X_flow = self.scaler.fit_transform(X_flow) - - # Count the number of labels of each type in this epoc - epoch_label_counts = { - "Background": (y_flow == "Background").sum(), - "Malicious": (y_flow == "Malicious").sum(), - "Benign": (y_flow == "Benign").sum(), - } - - # Train - try: - # Online incremental learning - self.clf.partial_fit( - X_flow, y_flow, classes=["Background", "Malicious", "Benign"] - ) - except Exception: - self.print("Error while calling clf.train()") - self.print(traceback.format_exc(), 0, 1) - - # See score so far in training - score = self.clf.score(X_flow, y_flow) - - #self.print(f" Training Score: {score}", 1, 0) - #self.print(f' Model Parameters: {self.clf.coef_}', 1, 0) - - # Store the models on disk - self.store_model() - - # Log training information - self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}") - #self.write_to_training_log(f"Model parameters: {self.clf.coef_}") - except Exception: - self.print("Error in train().", 0, 1) - self.print(traceback.format_exc(), 0, 1) - self.write_to_training_log("Error occurred during training.") - - def process_features(self, dataset): - """ - Discards some features of the dataset and can create new. - Clean the dataset - """ - try: - # Discard some type of flows that dont have ports - to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""] - for proto in to_discard: - dataset = dataset[dataset.proto != proto] - - # If te proto is in the list to delete and there is only one flow, then the dataset will be empty - if dataset.empty: - # DataFrame is empty now, so return empty - return dataset - - # For now, discard these - to_drop = [ - "appproto", - "daddr", - "saddr", - "starttime", - "type_", - "smac", - "dmac", - "history", - "uid", - "dir_", - "endtime", - "flow_source", - ] - for field in to_drop: - try: - dataset = dataset.drop(field, axis=1) - except (ValueError, KeyError): - pass - - # When flows are read from Slips sqlite, - # the state is not transformed to 'Established' or - # 'Not Established', it is still 'S0' and others - # So transform here - dataset["state"] = dataset.apply( - lambda row: self.db.get_final_state_from_flags( - row["state"], (row["spkts"] + row["dpkts"]) - ), - axis=1, - ) - - # Convert state to categorical - dataset.state = dataset.state.str.replace( - r"(^.*Not Established.*$)", "0", regex=True - ) - dataset.state = dataset.state.str.replace( - r"(^.*Established.*$)", "1", regex=True - ) - - # Convert categories to floats - dataset.state = dataset.state.astype("float64") - - # Convert proto to categorical. For now we only have few states, so we can hardcode... - # We dont use the data to create categories because in testing mode - # we dont see all the protocols - # Also we dont store the Categorizer because the user can retrain - # with its own data. - dataset.proto = dataset.proto.str.lower() - dataset.proto = dataset.proto.str.replace( - r"(^.*tcp.*$)", "0", regex=True - ) - dataset.proto = dataset.proto.str.replace( - r"(^.*udp.*$)", "1", regex=True - ) - dataset.proto = dataset.proto.str.replace( - r"(^.*icmp.*$)", "2", regex=True - ) - dataset.proto = dataset.proto.str.replace( - r"(^.*icmp-ipv6.*$)", "3", regex=True - ) - dataset.proto = dataset.proto.str.replace( - r"(^.*arp.*$)", "4", regex=True - ) - - dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"] - dataset["pkts"] = dataset["spkts"] + dataset["dpkts"] - - fields_to_convert_to_float = [ - dataset.proto, - dataset.dport, - dataset.sport, - dataset.dur, - dataset.pkts, - dataset.spkts, - dataset.allbytes, - dataset.sbytes, - dataset.state, - ] - for field in fields_to_convert_to_float: - try: - field = field.astype("float64") - except (ValueError, AttributeError): - pass - - return dataset - except Exception: - # Stop the timer - self.print("Error in process_features()") - self.print(traceback.format_exc(), 0, 1) - - def process_training_flows(self, last_number_of_flows_when_trained): - """ - Process only the new flows in the DB since the last training. - Store the pandas df in self.flows - """ - try: - # Ensure the index is an integer - if last_number_of_flows_when_trained is None: - last_number_of_flows_when_trained = 0 - else: - last_number_of_flows_when_trained = int(last_number_of_flows_when_trained) - - # We get all the flows so far - flows = self.db.get_all_flows() - # Only process new flows since last training - new_flows = flows[last_number_of_flows_when_trained:] - - # Check how many **different** labels are in the DB - labels = self.db.get_labels() - if len(labels) == 1: - # Insert fake flows for both classes if needed - new_flows.append( - { - "starttime": 1594417039.029793, - "dur": "1.9424750804901123", - "saddr": "10.7.10.101", - "sport": "49733", - "daddr": "40.70.224.145", - "dport": "443", - "proto": "tcp", - "state": "SF", - "spkts": 17, - "dpkts": 27, - "sbytes": 25517, - "dbytes": 17247, - "appproto": "ssl", - "ground_truth_label": "Malicious", - "module_labels": { - "flowalerts-long-connection": "Malicious" - }, - } - ) - new_flows.append( - { - "starttime": 1382355032.706468, - "dur": "10.896695", - "saddr": "147.32.83.52", - "sport": "47956", - "daddr": "80.242.138.72", - "dport": "80", - "proto": "tcp", - "state": "SF", - "spkts": 1, - "dpkts": 0, - "sbytes": 100, - "dbytes": 67596, - "appproto": "http", - "ground_truth_label": "Benign", - "module_labels": { - "flowalerts-long-connection": "Benign" - }, - } - ) - - # Convert to pandas df - df_flows = pd.DataFrame(new_flows) - - # Process features - df_flows = self.process_features(df_flows) - - # Update the flow to the processed version - self.flows = df_flows - except Exception: - self.print("Error in process_flows()") - self.print(traceback.format_exc(), 0, 1) - - def process_flow(self, flow_to_process: dict): - """ - Process one flow. Only used during detection in testing - returns the pandas df with the processed flow - """ - try: - # Convert the flow to a pandas dataframe - raw_flow = pd.DataFrame(flow_to_process, index=[0]) - dflow = self.process_features(raw_flow) - if dflow.empty: - return None - # Update the flow to the processed version - return dflow - except Exception: - # Stop the timer - self.print("Error in process_flow()") - self.print(traceback.format_exc(), 0, 1) - - def detect(self, x_flow) -> Optional[numpy.ndarray]: - """ - Detects the given flow with the current model stored - and returns the predection array - """ - try: - # clean the flow - fields_to_drop = [ - "label", - "module_labels", - "uid", - "history", - "dir_", - "endtime", - "flow_source", - "ground_truth_label", - "detailed_ground_truth_label", - ] - # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. - # Error - ''' [Flow ML Detection] Error in detect() while processing - dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes - 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 - The feature names should match those that were passed during fit. - Feature names unseen at fit time: - - bytes - ''' - - # IF we delete here the filed bytes the error is - # [Flow ML Detection] Error in detect() while processing - # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes - # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 - # The feature names should match those that were passed during fit. - # Feature names must be in the same order as they were in fit. - - for field in fields_to_drop: - try: - x_flow = x_flow.drop(field, axis=1) - except (KeyError, ValueError): - pass - # Scale the flow - x_flow: numpy.ndarray = self.scaler.transform(x_flow) - pred: numpy.ndarray = self.clf.predict(x_flow) - return pred - except Exception as e: - self.print( - f"Error in detect() while processing " f"\n{x_flow}\n{e}" - ) - self.print(traceback.format_exc(), 0, 1) - - def store_model(self): - """ - Store the trained model on disk - """ - self.print("Storing the trained model and scaler on disk.", 0, 2) - with open(self.model_path, "wb") as f: - data = pickle.dumps(self.clf) - f.write(data) - with open(self.scaler_path, "wb") as g: - data = pickle.dumps(self.scaler) - g.write(data) - - def read_model(self): - """ - Read the trained model from disk - """ - try: - self.print("Reading the trained model from disk.", 0, 2) - with open(self.model_path, "rb") as f: - self.clf = pickle.load(f) - self.print("Reading the trained scaler from disk.", 0, 2) - with open(self.scaler_path, "rb") as g: - self.scaler = pickle.load(g) - except FileNotFoundError: - # If there is no model, create one empty - self.print( - "There was no model. " "Creating a new empty model.", 0, 2 - ) - self.clf = SGDClassifier( - warm_start=True, loss="hinge", penalty="l1" - ) - except EOFError: - self.print( - "Error reading model from disk. " - "Creating a new empty model.", - 0, - 2, - ) - self.clf = SGDClassifier( - warm_start=True, loss="hinge", penalty="l1" - ) - - def set_evidence_malicious_flow(self, flow: dict, twid: str): - confidence: float = 0.1 - description = ( - f"Flow with malicious characteristics by ML. Src IP" - f" {flow['saddr']}:{flow['sport']} to " - f"{flow['daddr']}:{flow['dport']}" - ) - twid_number = int(twid.replace("timewindow", "")) - evidence: Evidence = Evidence( - evidence_type=EvidenceType.MALICIOUS_FLOW, - attacker=Attacker( - direction=Direction.SRC, - ioc_type=IoCType.IP, - value=flow["saddr"], - ), - victim=Victim( - direction=Direction.DST, - ioc_type=IoCType.IP, - value=flow["daddr"], - ), - threat_level=ThreatLevel.LOW, - confidence=confidence, - description=description, - profile=ProfileID(ip=flow["saddr"]), - timewindow=TimeWindow(twid_number), - uid=[flow["uid"]], - timestamp=flow["starttime"], - method=Method.AI, - src_port=flow["sport"], - dst_port=flow["dport"], - ) - - self.db.set_evidence(evidence) - - def shutdown_gracefully(self): - # Confirm that the module is done processing - if self.mode == "train": - self.store_model() - - def pre_main(self): - utils.drop_root_privs() - # Load the model - self.read_model() - - def main(self): - if msg := self.get_msg("new_flow"): - # When a new flow arrives - msg = json.loads(msg["data"]) - self.twid = msg["twid"] - self.profileid = msg["profileid"] - self.flow = msg["flow"] - # These following extra fields are expected in testing. update the original - # flow dict to have them - self.flow.update( - { - "state": msg["interpreted_state"], - "label": msg["label"], - "module_labels": msg["module_labels"], - } - ) - - if self.mode == "train": - # We are training - - # Is the amount in the DB of labels enough to retrain? - # Use labeled flows - labels = self.db.get_labels() - sum_labeled_flows = sum(i[1] for i in labels) - - # The min labels to retrain is the min number of flows - # we should have seen so far in this capture to start training - # This is so we dont _start_ training with only 1 flow - - # Once we are over the start minimum, the second condition is - # to force to retrain every a minimum_labels_to_retrain number - # of flows. So we dont retrain every 1 flow. - if ( - sum_labeled_flows >= self.minimum_labels_to_start_train - ): - if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain): - # So for example we retrain every 50 labels and only when - # we have at least 50 labels - self.print( - f"Training the model with the last group of " - f"flows and labels. Total flows: {sum_labeled_flows}." - ) - # Process all flows in the DB and make them ready - # for pandas - self.process_training_flows(self.last_number_of_flows_when_trained) - # Train an algorithm - self.train(sum_labeled_flows) - self.last_number_of_flows_when_trained = sum_labeled_flows - - elif self.mode == "test": - # We are testing, which means using the model to detect - processed_flow = self.process_flow(self.flow) - - # After processing the flow, it may happen that we - # delete icmp/arp/etc so the dataframe can be empty - if processed_flow is not None and not processed_flow.empty: - # Predict - pred: numpy.ndarray = self.detect(processed_flow) - if not pred: - # an error occurred - return - - label = self.flow["label"] - if label and label != "unknown" and label != pred[0]: - # If the user specified a label in test mode, - # and the label is diff from the prediction, - # print in debug mode - self.print( - f"Predicted {pred[0]} for ground-truth label" - f' {label}. Flow {self.flow["saddr"]}:' - f'{self.flow["sport"]} ->' - f' {self.flow["daddr"]}:' - f'{self.flow["dport"]}/' - f'{self.flow["proto"]}', - 0, - 3, - ) - if pred[0] == "Malicious": - # Generate an alert - self.set_evidence_malicious_flow(self.flow, self.twid) - self.print( - f"Prediction {pred[0]} for label {label}" - f' flow {self.flow["saddr"]}:' - f'{self.flow["sport"]} -> ' - f'{self.flow["daddr"]}:' - f'{self.flow["dport"]}/' - f'{self.flow["proto"]}', - 0, - 2, - ) +import matplotlib.pyplot as plt +import sys +import numpy as np + +def process_file(file_path): + # Initialize the counters for the values + FPR_values = [] + FNR_values = [] + TNR_values = [] + TPR_values = [] + F1_values = [] + accuracy_values = [] + precision_values = [] + MCC_values = [] + recall_values = [] + + # Read the file and extract the data + with open(file_path, 'r') as file: + for line in file: + if "TP:" in line: + # Extract the values from the line + parts = line.split(',') + TP = int(parts[0].split(':')[1].strip()) + TN = int(parts[1].split(':')[1].strip()) + FP = int(parts[2].split(':')[1].strip()) + FN = int(parts[3].split(':')[1].strip()) + + # Calculate metrics + FPR = FP / (FP + TN) if (FP + TN) != 0 else 0 + FNR = FN / (FN + TP) if (FN + TP) != 0 else 0 + TNR = TN / (TN + FP) if (TN + FP) != 0 else 0 + TPR = TP / (TP + FN) if (TP + FN) != 0 else 0 + Precision = TP / (TP + FP) if (TP + FP) != 0 else 0 + Recall = TPR # Recall is the same as TPR + F1 = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) != 0 else 0 + Accuracy = (TP + TN) / (TP + TN + FP + FN) + MCC = ((TP * TN) - (FP * FN)) / np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) if ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) != 0 else 0 + + # Append the values to the respective lists + FPR_values.append(FPR) + FNR_values.append(FNR) + TNR_values.append(TNR) + TPR_values.append(TPR) + F1_values.append(F1) + accuracy_values.append(Accuracy) + precision_values.append(Precision) + MCC_values.append(MCC) + recall_values.append(Recall) + + return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values + +def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values): + # Separate the values into two groups based on their proximity to 0 or 1 + close_to_0 = { + 'FPR': [], 'FNR': [] + } + close_to_1 = { + 'TNR': [], 'TPR': [], 'F1': [], 'accuracy': [], 'precision': [], 'MCC': [], 'recall': [] + } + + # Categorize the metrics into two groups + for i in range(len(FPR_values)): + close_to_0['FPR'].append(FPR_values[i]) + close_to_0['FNR'].append(FNR_values[i]) + + close_to_1['TNR'].append(TNR_values[i]) + close_to_1['TPR'].append(TPR_values[i]) + close_to_1['F1'].append(F1_values[i]) + close_to_1['accuracy'].append(accuracy_values[i]) + close_to_1['precision'].append(precision_values[i]) + close_to_1['MCC'].append(MCC_values[i]) + close_to_1['recall'].append(recall_values[i]) + + # Plot metrics for values close to 0 + plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True) + + # Plot metrics for values close to 1 + plot_single_group(close_to_1, 'metrics_plot_close_to_1.png') + + # Print the final values + print("\nFinal Metric Values:") + print(f"Final FPR: {FPR_values[-1]:.4f}") + print(f"Final FNR: {FNR_values[-1]:.4f}") + print(f"Final TNR: {TNR_values[-1]:.4f}") + print(f"Final TPR: {TPR_values[-1]:.4f}") + print(f"Final F1 Score: {F1_values[-1]:.4f}") + print(f"Final Accuracy: {accuracy_values[-1]:.4f}") + print(f"Final Precision: {precision_values[-1]:.4f}") + print(f"Final MCC: {MCC_values[-1]:.4f}") + print(f"Final Recall: {recall_values[-1]:.4f}") + +def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): + plt.figure(figsize=(12, 8)) + + # Only plot the metrics that exist in the dictionary + if 'FPR' in metrics_dict: + plt.plot(metrics_dict['FPR'], label='False Positive Rate (FPR)', marker='o') + if 'FNR' in metrics_dict: + plt.plot(metrics_dict['FNR'], label='False Negative Rate (FNR)', marker='o') + if 'TNR' in metrics_dict: + plt.plot(metrics_dict['TNR'], label='True Negative Rate (TNR)', marker='o') + if 'TPR' in metrics_dict: + plt.plot(metrics_dict['TPR'], label='True Positive Rate (TPR)', marker='o') + if 'F1' in metrics_dict: + plt.plot(metrics_dict['F1'], label='F1 Score', marker='o') + if 'accuracy' in metrics_dict: + plt.plot(metrics_dict['accuracy'], label='Accuracy', marker='o') + if 'precision' in metrics_dict: + plt.plot(metrics_dict['precision'], label='Precision', marker='o') + if 'MCC' in metrics_dict: + plt.plot(metrics_dict['MCC'], label='Matthews Correlation Coefficient (MCC)', marker='o') + if 'recall' in metrics_dict: + plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o') + + # Apply log scale by default + plt.yscale('log') + + # If the plot is close to 0, set custom ticks + if is_close_to_0: + # Add more ticks between 0 and 1 (using a logarithmic scale) + plt.yticks([0.01, 0.1, 1, 10, 100], ['0.01', '0.1', '1', '10', '100']) + + plt.xlabel('Index') + plt.ylabel('Metric Value') + plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})') + plt.legend() + + # Save the plot + plt.savefig(output_filename) + plt.close() + +def main(): + if len(sys.argv) != 2: + print("Usage: python script.py ") + sys.exit(1) + + file_path = sys.argv[1] + + FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path) + plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values) + +if __name__ == "__main__": + main() From 9ddaf31f83a34962af33188b0f88176dc8ec33fd Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:20:52 +0000 Subject: [PATCH 214/455] Fix plots --- .../plot_testing_performance.py | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index 8f9e12cd86..69b8c96a8c 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -71,11 +71,11 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu close_to_1['MCC'].append(MCC_values[i]) close_to_1['recall'].append(recall_values[i]) - # Plot metrics for values close to 0 + # Plot metrics for values close to 0 (linear scale) plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True) - # Plot metrics for values close to 1 - plot_single_group(close_to_1, 'metrics_plot_close_to_1.png') + # Plot metrics for values close to 1 (log scale) + plot_single_group(close_to_1, 'metrics_plot_close_to_1.png', is_close_to_0=False) # Print the final values print("\nFinal Metric Values:") @@ -112,14 +112,21 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): if 'recall' in metrics_dict: plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o') - # Apply log scale by default - plt.yscale('log') + # If the plot is close to 1, apply log scale + if not is_close_to_0: + plt.yscale('log') - # If the plot is close to 0, set custom ticks + # If the plot is close to 0, set dynamic Y-ticks based on the min/max values of the series if is_close_to_0: - # Manually set more Y-ticks for better visibility - plt.ylim(0.0001, 1) # Set Y-axis limits between 0.0001 and 1 - plt.yticks([0.0001, 0.001, 0.01, 0.1, 1], ['0.0001', '0.001', '0.01', '0.1', '1']) # Adjust Y-ticks + min_val = min(min(metrics_dict['FPR']), min(metrics_dict['FNR'])) + max_val = max(max(metrics_dict['FPR']), max(metrics_dict['FNR'])) + + # Avoid log(0), so set the minimum limit a little higher than zero + if min_val == 0: + min_val = 1e-4 # Avoid zero values on the logarithmic scale + + plt.ylim(min_val, max_val) # Set Y-axis limits based on the data range + plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6)) # Set ticks logarithmically plt.xlabel('Index') plt.ylabel('Metric Value') From 878812adb8ffbdb24c82525a2b45580dd2aad4d5 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:23:02 +0000 Subject: [PATCH 215/455] Fix plots --- modules/flowmldetection/plot_testing_performance.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index 69b8c96a8c..de4ada38b3 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -123,10 +123,10 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): # Avoid log(0), so set the minimum limit a little higher than zero if min_val == 0: - min_val = 1e-4 # Avoid zero values on the logarithmic scale + min_val = 1e-8 # Avoid zero values on the logarithmic scale plt.ylim(min_val, max_val) # Set Y-axis limits based on the data range - plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6)) # Set ticks logarithmically + plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=60)) # Set ticks logarithmically plt.xlabel('Index') plt.ylabel('Metric Value') From b1909a50ed00fe86cebd6b037556ee7f5a419403 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:25:58 +0000 Subject: [PATCH 216/455] Change plot names --- modules/flowmldetection/plot_testing_performance.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index de4ada38b3..1b4152c6eb 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -72,10 +72,10 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu close_to_1['recall'].append(recall_values[i]) # Plot metrics for values close to 0 (linear scale) - plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True) + plot_single_group(close_to_0, 'performance_metrics_testing_close_to_0.png', is_close_to_0=True) # Plot metrics for values close to 1 (log scale) - plot_single_group(close_to_1, 'metrics_plot_close_to_1.png', is_close_to_0=False) + plot_single_group(close_to_1, 'performnace_metrics_teting_close_to_1.png', is_close_to_0=False) # Print the final values print("\nFinal Metric Values:") From 213b6a5b6597b8b568ee45755d44b5e334c668b7 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:26:09 +0000 Subject: [PATCH 217/455] Rename file --- .../{plot_train_score.py => plot_train_performance.py} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename modules/flowmldetection/{plot_train_score.py => plot_train_performance.py} (97%) diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_performance.py similarity index 97% rename from modules/flowmldetection/plot_train_score.py rename to modules/flowmldetection/plot_train_performance.py index 8437e968ac..80e13e9515 100644 --- a/modules/flowmldetection/plot_train_score.py +++ b/modules/flowmldetection/plot_train_performance.py @@ -28,7 +28,7 @@ def plot_log_data(file_path): # Get the directory of the log file to store the plot in the same folder dir_name = os.path.dirname(file_path) - plot_file = os.path.join(dir_name, 'log_data_plot_with_two_scales.png') + plot_file = os.path.join(dir_name, 'performance_metrics_training.png') # Plotting the values fig, ax1 = plt.subplots(figsize=(10, 6)) From 20db5dbd1db02d06af5a6a9d7b6bb27e0e40a66f Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:31:32 +0000 Subject: [PATCH 218/455] Recover good flowmldetection deleted by mistake --- modules/flowmldetection/flowmldetection.py | 709 ++++++++++++++++----- 1 file changed, 566 insertions(+), 143 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 37f0761109..5e4e9aa462 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -1,143 +1,566 @@ -import matplotlib.pyplot as plt -import sys -import numpy as np - -def process_file(file_path): - # Initialize the counters for the values - FPR_values = [] - FNR_values = [] - TNR_values = [] - TPR_values = [] - F1_values = [] - accuracy_values = [] - precision_values = [] - MCC_values = [] - recall_values = [] - - # Read the file and extract the data - with open(file_path, 'r') as file: - for line in file: - if "TP:" in line: - # Extract the values from the line - parts = line.split(',') - TP = int(parts[0].split(':')[1].strip()) - TN = int(parts[1].split(':')[1].strip()) - FP = int(parts[2].split(':')[1].strip()) - FN = int(parts[3].split(':')[1].strip()) - - # Calculate metrics - FPR = FP / (FP + TN) if (FP + TN) != 0 else 0 - FNR = FN / (FN + TP) if (FN + TP) != 0 else 0 - TNR = TN / (TN + FP) if (TN + FP) != 0 else 0 - TPR = TP / (TP + FN) if (TP + FN) != 0 else 0 - Precision = TP / (TP + FP) if (TP + FP) != 0 else 0 - Recall = TPR # Recall is the same as TPR - F1 = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) != 0 else 0 - Accuracy = (TP + TN) / (TP + TN + FP + FN) - MCC = ((TP * TN) - (FP * FN)) / np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) if ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) != 0 else 0 - - # Append the values to the respective lists - FPR_values.append(FPR) - FNR_values.append(FNR) - TNR_values.append(TNR) - TPR_values.append(TPR) - F1_values.append(F1) - accuracy_values.append(Accuracy) - precision_values.append(Precision) - MCC_values.append(MCC) - recall_values.append(Recall) - - return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values - -def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values): - # Separate the values into two groups based on their proximity to 0 or 1 - close_to_0 = { - 'FPR': [], 'FNR': [] - } - close_to_1 = { - 'TNR': [], 'TPR': [], 'F1': [], 'accuracy': [], 'precision': [], 'MCC': [], 'recall': [] - } - - # Categorize the metrics into two groups - for i in range(len(FPR_values)): - close_to_0['FPR'].append(FPR_values[i]) - close_to_0['FNR'].append(FNR_values[i]) - - close_to_1['TNR'].append(TNR_values[i]) - close_to_1['TPR'].append(TPR_values[i]) - close_to_1['F1'].append(F1_values[i]) - close_to_1['accuracy'].append(accuracy_values[i]) - close_to_1['precision'].append(precision_values[i]) - close_to_1['MCC'].append(MCC_values[i]) - close_to_1['recall'].append(recall_values[i]) - - # Plot metrics for values close to 0 - plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True) - - # Plot metrics for values close to 1 - plot_single_group(close_to_1, 'metrics_plot_close_to_1.png') - - # Print the final values - print("\nFinal Metric Values:") - print(f"Final FPR: {FPR_values[-1]:.4f}") - print(f"Final FNR: {FNR_values[-1]:.4f}") - print(f"Final TNR: {TNR_values[-1]:.4f}") - print(f"Final TPR: {TPR_values[-1]:.4f}") - print(f"Final F1 Score: {F1_values[-1]:.4f}") - print(f"Final Accuracy: {accuracy_values[-1]:.4f}") - print(f"Final Precision: {precision_values[-1]:.4f}") - print(f"Final MCC: {MCC_values[-1]:.4f}") - print(f"Final Recall: {recall_values[-1]:.4f}") - -def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): - plt.figure(figsize=(12, 8)) - - # Only plot the metrics that exist in the dictionary - if 'FPR' in metrics_dict: - plt.plot(metrics_dict['FPR'], label='False Positive Rate (FPR)', marker='o') - if 'FNR' in metrics_dict: - plt.plot(metrics_dict['FNR'], label='False Negative Rate (FNR)', marker='o') - if 'TNR' in metrics_dict: - plt.plot(metrics_dict['TNR'], label='True Negative Rate (TNR)', marker='o') - if 'TPR' in metrics_dict: - plt.plot(metrics_dict['TPR'], label='True Positive Rate (TPR)', marker='o') - if 'F1' in metrics_dict: - plt.plot(metrics_dict['F1'], label='F1 Score', marker='o') - if 'accuracy' in metrics_dict: - plt.plot(metrics_dict['accuracy'], label='Accuracy', marker='o') - if 'precision' in metrics_dict: - plt.plot(metrics_dict['precision'], label='Precision', marker='o') - if 'MCC' in metrics_dict: - plt.plot(metrics_dict['MCC'], label='Matthews Correlation Coefficient (MCC)', marker='o') - if 'recall' in metrics_dict: - plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o') - - # Apply log scale by default - plt.yscale('log') - - # If the plot is close to 0, set custom ticks - if is_close_to_0: - # Add more ticks between 0 and 1 (using a logarithmic scale) - plt.yticks([0.01, 0.1, 1, 10, 100], ['0.01', '0.1', '1', '10', '100']) - - plt.xlabel('Index') - plt.ylabel('Metric Value') - plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})') - plt.legend() - - # Save the plot - plt.savefig(output_filename) - plt.close() - -def main(): - if len(sys.argv) != 2: - print("Usage: python script.py ") - sys.exit(1) - - file_path = sys.argv[1] - - FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path) - plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values) - -if __name__ == "__main__": - main() +# SPDX-FileCopyrightText: 2021 Sebastian Garcia +from typing import Optional + +# SPDX-License-Identifier: GPL-2.0-only +import numpy +from sklearn.linear_model import SGDClassifier +from sklearn.preprocessing import StandardScaler +import pickle +import pandas as pd +import json +import traceback +import warnings +import os + +from slips_files.common.parsers.config_parser import ConfigParser +from slips_files.common.slips_utils import utils +from slips_files.common.abstracts.module import IModule +from slips_files.core.structures.evidence import ( + Evidence, + ProfileID, + TimeWindow, + Attacker, + ThreatLevel, + EvidenceType, + IoCType, + Direction, + Victim, + Method, +) + +# This horrible hack is only to stop sklearn from printing those warnings +def warn(*args, **kwargs): + pass + + +warnings.warn = warn + + +class FlowMLDetection(IModule): + # Name: short name of the module. Do not use spaces + name = "Flow ML Detection" + description = ( + "Train or test a Machine Learning model to detect malicious flows" + ) + authors = ["Sebastian Garcia"] + + def init(self): + # Subscribe to the channel + self.c1 = self.db.subscribe("new_flow") + self.channels = {"new_flow": self.c1} + self.fieldseparator = self.db.get_field_separator() + # Set the output queue of our database instance + # Read the configuration + self.read_configuration() + # Minum amount of new labels needed to start the train + self.minimum_labels_to_start_train = 50 + # Minum amount of new labels needed to retrain + self.minimum_labels_to_retrain = 50 + # The number of flows when last trained. Used internally only to know + # when to retrain + self.last_number_of_flows_when_trained = 0 + # The scaler trained during training and to use during testing + self.scaler = StandardScaler() + self.model_path = "./modules/flowmldetection/model.bin" + self.scaler_path = "./modules/flowmldetection/scaler.bin" + + # Initialize the training log file + self.training_log_path = "./modules/flowmldetection/training.log" + with open(self.training_log_path, "w") as log_file: + log_file.write("Training Log Initialized\n") + + def read_configuration(self): + conf = ConfigParser() + self.mode = conf.get_ml_mode() + # This is the global label in the configuration, + # in case the flows do not have a label themselves + self.label = conf.label() + + def write_to_training_log(self, message: str): + """ + Write a message to the training log file. + """ + try: + with open(self.training_log_path, "a") as log_file: + log_file.write(message + "\n") + except Exception as e: + self.print(f"Error writing to training log: {e}", 0, 1) + + def train(self, sum_labeled_flows): + """ + Train a model based on the flows we receive and the labels + """ + try: + # Create X_flow with the current flows minus the label + X_flow = self.flows.drop("ground_truth_label", axis=1) + # Drop the detailed labels + X_flow = X_flow.drop("detailed_ground_truth_label", axis=1) + # Drop the module_labels + X_flow = X_flow.drop("module_labels", axis=1) + # Create y_flow with the label + y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label) + + # Normalize this batch of data so far. This can get progressivle slow + X_flow = self.scaler.fit_transform(X_flow) + + # Count the number of labels of each type in this epoc + epoch_label_counts = { + "Background": (y_flow == "Background").sum(), + "Malicious": (y_flow == "Malicious").sum(), + "Benign": (y_flow == "Benign").sum(), + } + + # Train + try: + # Online incremental learning + self.clf.partial_fit( + X_flow, y_flow, classes=["Background", "Malicious", "Benign"] + ) + except Exception: + self.print("Error while calling clf.train()") + self.print(traceback.format_exc(), 0, 1) + + # See score so far in training + score = self.clf.score(X_flow, y_flow) + + #self.print(f" Training Score: {score}", 1, 0) + #self.print(f' Model Parameters: {self.clf.coef_}', 1, 0) + + # Store the models on disk + self.store_model() + + # Log training information + self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}") + #self.write_to_training_log(f"Model parameters: {self.clf.coef_}") + except Exception: + self.print("Error in train().", 0, 1) + self.print(traceback.format_exc(), 0, 1) + self.write_to_training_log("Error occurred during training.") + + def process_features(self, dataset): + """ + Discards some features of the dataset and can create new. + Clean the dataset + """ + try: + # Discard some type of flows that dont have ports + to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""] + for proto in to_discard: + dataset = dataset[dataset.proto != proto] + + # If te proto is in the list to delete and there is only one flow, then the dataset will be empty + if dataset.empty: + # DataFrame is empty now, so return empty + return dataset + + # For now, discard these + to_drop = [ + "appproto", + "daddr", + "saddr", + "starttime", + "type_", + "smac", + "dmac", + "history", + "uid", + "dir_", + "endtime", + "flow_source", + ] + for field in to_drop: + try: + dataset = dataset.drop(field, axis=1) + except (ValueError, KeyError): + pass + + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others + # So transform here + dataset["state"] = dataset.apply( + lambda row: self.db.get_final_state_from_flags( + row["state"], (row["spkts"] + row["dpkts"]) + ), + axis=1, + ) + + # Convert state to categorical + dataset.state = dataset.state.str.replace( + r"(^.*Not Established.*$)", "0", regex=True + ) + dataset.state = dataset.state.str.replace( + r"(^.*Established.*$)", "1", regex=True + ) + + # Convert categories to floats + dataset.state = dataset.state.astype("float64") + + # Convert proto to categorical. For now we only have few states, so we can hardcode... + # We dont use the data to create categories because in testing mode + # we dont see all the protocols + # Also we dont store the Categorizer because the user can retrain + # with its own data. + dataset.proto = dataset.proto.str.lower() + dataset.proto = dataset.proto.str.replace( + r"(^.*tcp.*$)", "0", regex=True + ) + dataset.proto = dataset.proto.str.replace( + r"(^.*udp.*$)", "1", regex=True + ) + dataset.proto = dataset.proto.str.replace( + r"(^.*icmp.*$)", "2", regex=True + ) + dataset.proto = dataset.proto.str.replace( + r"(^.*icmp-ipv6.*$)", "3", regex=True + ) + dataset.proto = dataset.proto.str.replace( + r"(^.*arp.*$)", "4", regex=True + ) + + dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"] + dataset["pkts"] = dataset["spkts"] + dataset["dpkts"] + + fields_to_convert_to_float = [ + dataset.proto, + dataset.dport, + dataset.sport, + dataset.dur, + dataset.pkts, + dataset.spkts, + dataset.allbytes, + dataset.sbytes, + dataset.state, + ] + for field in fields_to_convert_to_float: + try: + field = field.astype("float64") + except (ValueError, AttributeError): + pass + + return dataset + except Exception: + # Stop the timer + self.print("Error in process_features()") + self.print(traceback.format_exc(), 0, 1) + + def process_training_flows(self, last_number_of_flows_when_trained): + """ + Process only the new flows in the DB since the last training. + Store the pandas df in self.flows + """ + try: + # Ensure the index is an integer + if last_number_of_flows_when_trained is None: + last_number_of_flows_when_trained = 0 + else: + last_number_of_flows_when_trained = int(last_number_of_flows_when_trained) + + # We get all the flows so far + flows = self.db.get_all_flows() + # Only process new flows since last training + new_flows = flows[last_number_of_flows_when_trained:] + + # Check how many **different** labels are in the DB + labels = self.db.get_labels() + if len(labels) == 1: + # Insert fake flows for both classes if needed + new_flows.append( + { + "starttime": 1594417039.029793, + "dur": "1.9424750804901123", + "saddr": "10.7.10.101", + "sport": "49733", + "daddr": "40.70.224.145", + "dport": "443", + "proto": "tcp", + "state": "SF", + "spkts": 17, + "dpkts": 27, + "sbytes": 25517, + "dbytes": 17247, + "appproto": "ssl", + "ground_truth_label": "Malicious", + "module_labels": { + "flowalerts-long-connection": "Malicious" + }, + } + ) + new_flows.append( + { + "starttime": 1382355032.706468, + "dur": "10.896695", + "saddr": "147.32.83.52", + "sport": "47956", + "daddr": "80.242.138.72", + "dport": "80", + "proto": "tcp", + "state": "SF", + "spkts": 1, + "dpkts": 0, + "sbytes": 100, + "dbytes": 67596, + "appproto": "http", + "ground_truth_label": "Benign", + "module_labels": { + "flowalerts-long-connection": "Benign" + }, + } + ) + + # Convert to pandas df + df_flows = pd.DataFrame(new_flows) + + # Process features + df_flows = self.process_features(df_flows) + + # Update the flow to the processed version + self.flows = df_flows + except Exception: + self.print("Error in process_flows()") + self.print(traceback.format_exc(), 0, 1) + + def process_flow(self, flow_to_process: dict): + """ + Process one flow. Only used during detection in testing + returns the pandas df with the processed flow + """ + try: + # Convert the flow to a pandas dataframe + raw_flow = pd.DataFrame(flow_to_process, index=[0]) + dflow = self.process_features(raw_flow) + if dflow.empty: + return None + # Update the flow to the processed version + return dflow + except Exception: + # Stop the timer + self.print("Error in process_flow()") + self.print(traceback.format_exc(), 0, 1) + + def detect(self, x_flow) -> Optional[numpy.ndarray]: + """ + Detects the given flow with the current model stored + and returns the predection array + """ + try: + # clean the flow + fields_to_drop = [ + "label", + "module_labels", + "uid", + "history", + "dir_", + "endtime", + "flow_source", + "ground_truth_label", + "detailed_ground_truth_label", + ] + # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. + # Error + ''' [Flow ML Detection] Error in detect() while processing + dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes + 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 + The feature names should match those that were passed during fit. + Feature names unseen at fit time: + - bytes + ''' + + # IF we delete here the filed bytes the error is + # [Flow ML Detection] Error in detect() while processing + # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes + # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 + # The feature names should match those that were passed during fit. + # Feature names must be in the same order as they were in fit. + + for field in fields_to_drop: + try: + x_flow = x_flow.drop(field, axis=1) + except (KeyError, ValueError): + pass + # Scale the flow + x_flow: numpy.ndarray = self.scaler.transform(x_flow) + pred: numpy.ndarray = self.clf.predict(x_flow) + return pred + except Exception as e: + self.print( + f"Error in detect() while processing " f"\n{x_flow}\n{e}" + ) + self.print(traceback.format_exc(), 0, 1) + + def store_model(self): + """ + Store the trained model on disk + """ + self.print("Storing the trained model and scaler on disk.", 0, 2) + with open(self.model_path, "wb") as f: + data = pickle.dumps(self.clf) + f.write(data) + with open(self.scaler_path, "wb") as g: + data = pickle.dumps(self.scaler) + g.write(data) + + def read_model(self): + """ + Read the trained model from disk + """ + try: + self.print("Reading the trained model from disk.", 0, 2) + with open(self.model_path, "rb") as f: + self.clf = pickle.load(f) + self.print("Reading the trained scaler from disk.", 0, 2) + with open(self.scaler_path, "rb") as g: + self.scaler = pickle.load(g) + except FileNotFoundError: + # If there is no model, create one empty + self.print( + "There was no model. " "Creating a new empty model.", 0, 2 + ) + self.clf = SGDClassifier( + warm_start=True, loss="hinge", penalty="l1" + ) + except EOFError: + self.print( + "Error reading model from disk. " + "Creating a new empty model.", + 0, + 2, + ) + self.clf = SGDClassifier( + warm_start=True, loss="hinge", penalty="l1" + ) + + def set_evidence_malicious_flow(self, flow: dict, twid: str): + confidence: float = 0.1 + description = ( + f"Flow with malicious characteristics by ML. Src IP" + f" {flow['saddr']}:{flow['sport']} to " + f"{flow['daddr']}:{flow['dport']}" + ) + twid_number = int(twid.replace("timewindow", "")) + evidence: Evidence = Evidence( + evidence_type=EvidenceType.MALICIOUS_FLOW, + attacker=Attacker( + direction=Direction.SRC, + ioc_type=IoCType.IP, + value=flow["saddr"], + ), + victim=Victim( + direction=Direction.DST, + ioc_type=IoCType.IP, + value=flow["daddr"], + ), + threat_level=ThreatLevel.LOW, + confidence=confidence, + description=description, + profile=ProfileID(ip=flow["saddr"]), + timewindow=TimeWindow(twid_number), + uid=[flow["uid"]], + timestamp=flow["starttime"], + method=Method.AI, + src_port=flow["sport"], + dst_port=flow["dport"], + ) + + self.db.set_evidence(evidence) + + def shutdown_gracefully(self): + # Confirm that the module is done processing + if self.mode == "train": + self.store_model() + + def pre_main(self): + utils.drop_root_privs() + # Load the model + self.read_model() + + def main(self): + if msg := self.get_msg("new_flow"): + # When a new flow arrives + msg = json.loads(msg["data"]) + self.twid = msg["twid"] + self.profileid = msg["profileid"] + self.flow = msg["flow"] + # These following extra fields are expected in testing. update the original + # flow dict to have them + self.flow.update( + { + "state": msg["interpreted_state"], + "label": msg["label"], + "module_labels": msg["module_labels"], + } + ) + + if self.mode == "train": + # We are training + + # Is the amount in the DB of labels enough to retrain? + # Use labeled flows + labels = self.db.get_labels() + sum_labeled_flows = sum(i[1] for i in labels) + + # The min labels to retrain is the min number of flows + # we should have seen so far in this capture to start training + # This is so we dont _start_ training with only 1 flow + + # Once we are over the start minimum, the second condition is + # to force to retrain every a minimum_labels_to_retrain number + # of flows. So we dont retrain every 1 flow. + if ( + sum_labeled_flows >= self.minimum_labels_to_start_train + ): + if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain): + # So for example we retrain every 50 labels and only when + # we have at least 50 labels + self.print( + f"Training the model with the last group of " + f"flows and labels. Total flows: {sum_labeled_flows}." + ) + # Process all flows in the DB and make them ready + # for pandas + self.process_training_flows(self.last_number_of_flows_when_trained) + # Train an algorithm + self.train(sum_labeled_flows) + self.last_number_of_flows_when_trained = sum_labeled_flows + + elif self.mode == "test": + # We are testing, which means using the model to detect + processed_flow = self.process_flow(self.flow) + + # After processing the flow, it may happen that we + # delete icmp/arp/etc so the dataframe can be empty + if processed_flow is not None and not processed_flow.empty: + # Predict + pred: numpy.ndarray = self.detect(processed_flow) + if not pred: + # an error occurred + return + + label = self.flow["label"] + if label and label != "unknown" and label != pred[0]: + # If the user specified a label in test mode, + # and the label is diff from the prediction, + # print in debug mode + self.print( + f"Predicted {pred[0]} for ground-truth label" + f' {label}. Flow {self.flow["saddr"]}:' + f'{self.flow["sport"]} ->' + f' {self.flow["daddr"]}:' + f'{self.flow["dport"]}/' + f'{self.flow["proto"]}', + 0, + 3, + ) + if pred[0] == "Malicious": + # Generate an alert + self.set_evidence_malicious_flow(self.flow, self.twid) + self.print( + f"Prediction {pred[0]} for label {label}" + f' flow {self.flow["saddr"]}:' + f'{self.flow["sport"]} -> ' + f'{self.flow["daddr"]}:' + f'{self.flow["dport"]}/' + f'{self.flow["proto"]}', + 0, + 2, + ) \ No newline at end of file From 01a1a6156e0d0626e327d683cb828d44475e9eab Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:31:43 +0000 Subject: [PATCH 219/455] Fix plot test --- modules/flowmldetection/plot_testing_performance.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index 1b4152c6eb..977a68b2d5 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -75,7 +75,7 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu plot_single_group(close_to_0, 'performance_metrics_testing_close_to_0.png', is_close_to_0=True) # Plot metrics for values close to 1 (log scale) - plot_single_group(close_to_1, 'performnace_metrics_teting_close_to_1.png', is_close_to_0=False) + plot_single_group(close_to_1, 'performance_metrics_testing_close_to_1.png', is_close_to_0=False) # Print the final values print("\nFinal Metric Values:") @@ -123,10 +123,10 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): # Avoid log(0), so set the minimum limit a little higher than zero if min_val == 0: - min_val = 1e-8 # Avoid zero values on the logarithmic scale + min_val = 1e-4 # Avoid zero values on the logarithmic scale plt.ylim(min_val, max_val) # Set Y-axis limits based on the data range - plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=60)) # Set ticks logarithmically + plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6)) # Set ticks logarithmically plt.xlabel('Index') plt.ylabel('Metric Value') From 0b51f71948efe37e361836cb04bfcedba58dad66 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:50:33 +0000 Subject: [PATCH 220/455] Add testing code to evaluate performance. It is optional with a varible --- modules/flowmldetection/flowmldetection.py | 60 +++++++++++++++------- 1 file changed, 42 insertions(+), 18 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 5e4e9aa462..b17a1baaf0 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -526,36 +526,21 @@ def main(self): elif self.mode == "test": # We are testing, which means using the model to detect processed_flow = self.process_flow(self.flow) - # After processing the flow, it may happen that we # delete icmp/arp/etc so the dataframe can be empty if processed_flow is not None and not processed_flow.empty: + original_label = processed_flow["ground_truth_label"].iloc[0] # Predict pred: numpy.ndarray = self.detect(processed_flow) if not pred: # an error occurred return - label = self.flow["label"] - if label and label != "unknown" and label != pred[0]: - # If the user specified a label in test mode, - # and the label is diff from the prediction, - # print in debug mode - self.print( - f"Predicted {pred[0]} for ground-truth label" - f' {label}. Flow {self.flow["saddr"]}:' - f'{self.flow["sport"]} ->' - f' {self.flow["daddr"]}:' - f'{self.flow["dport"]}/' - f'{self.flow["proto"]}', - 0, - 3, - ) if pred[0] == "Malicious": # Generate an alert self.set_evidence_malicious_flow(self.flow, self.twid) self.print( - f"Prediction {pred[0]} for label {label}" + f"Prediction {pred[0]} for label {original_label}" f' flow {self.flow["saddr"]}:' f'{self.flow["sport"]} -> ' f'{self.flow["daddr"]}:' @@ -563,4 +548,43 @@ def main(self): f'{self.flow["proto"]}', 0, 2, - ) \ No newline at end of file + ) + + # So you can disable this code easily. Since it is used only for evaluating a testing + log_testing_data = True + if log_testing_data: + # Initialize counters if not already done + if not hasattr(self, 'tp'): + self.tp = 0 + if not hasattr(self, 'tn'): + self.tn = 0 + if not hasattr(self, 'fp'): + self.fp = 0 + if not hasattr(self, 'fn'): + self.fn = 0 + + + # Update counters based on predictions and labels + if pred[0] == "Malicious" and original_label == "Malicious": + self.tp += 1 + elif pred[0] == "Benign" and original_label == "Benign": + self.tn += 1 + elif pred[0] == "Malicious" and original_label == "Benign": + self.fp += 1 + elif pred[0] == "Benign" and original_label == "Malicious": + self.fn += 1 + + testing_log_path = "./modules/flowmldetection/testing_performance.log" + try: + with open(testing_log_path, "a") as log_file: + log_file.write("Testing Performance Log Initialized\n") + # Log the testing performance metrics + log_file.write(f"TP: {self.tp}, TN: {self.tn}, FP: {self.fp}, FN: {self.fn}\n") + + # Log the original flow for false positives and false negatives + if pred[0] == "Malicious" and original_label == "Benign": + log_file.write(f"False Positive Flow: {self.flow}\n") + elif pred[0] == "Benign" and original_label == "Malicious": + log_file.write(f"False Negative Flow: {self.flow}\n") + except Exception as e: + self.print(f"Error initializing testing performance log: {e}", 0, 1) \ No newline at end of file From e2da4cbde7d3b54ce2e90749bcd9e4c7bdbb8be2 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 19:04:00 +0000 Subject: [PATCH 221/455] Fix plots --- .../plot_testing_performance.py | 30 +++++++++++-------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index 977a68b2d5..6865415cdf 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -1,6 +1,7 @@ import matplotlib.pyplot as plt import sys import numpy as np +import argparse def process_file(file_path): # Initialize the counters for the values @@ -49,7 +50,7 @@ def process_file(file_path): return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values -def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values): +def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, experiment_number): # Separate the values into two groups based on their proximity to 0 or 1 close_to_0 = { 'FPR': [], 'FNR': [] @@ -72,13 +73,13 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu close_to_1['recall'].append(recall_values[i]) # Plot metrics for values close to 0 (linear scale) - plot_single_group(close_to_0, 'performance_metrics_testing_close_to_0.png', is_close_to_0=True) + plot_single_group(close_to_0, f'performance_metrics_testing_close_to_0_experiment_{experiment_number}.png', experiment_number, is_close_to_0=True) # Plot metrics for values close to 1 (log scale) - plot_single_group(close_to_1, 'performance_metrics_testing_close_to_1.png', is_close_to_0=False) + plot_single_group(close_to_1, f'performance_metrics_testing_close_to_1_experiment_{experiment_number}.png', experiment_number, is_close_to_0=False) # Print the final values - print("\nFinal Metric Values:") + print("\nFinal Metric Values for Experiment", experiment_number) print(f"Final FPR: {FPR_values[-1]:.4f}") print(f"Final FNR: {FNR_values[-1]:.4f}") print(f"Final TNR: {TNR_values[-1]:.4f}") @@ -89,7 +90,7 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu print(f"Final MCC: {MCC_values[-1]:.4f}") print(f"Final Recall: {recall_values[-1]:.4f}") -def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): +def plot_single_group(metrics_dict, output_filename, experiment_number, is_close_to_0=False): plt.figure(figsize=(12, 8)) # Only plot the metrics that exist in the dictionary @@ -126,11 +127,12 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): min_val = 1e-4 # Avoid zero values on the logarithmic scale plt.ylim(min_val, max_val) # Set Y-axis limits based on the data range - plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6)) # Set ticks logarithmically + plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=60)) # Set ticks logarithmically + # Add the experiment number to the plot title plt.xlabel('Index') plt.ylabel('Metric Value') - plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})') + plt.title(f'Experiment {experiment_number} - Evaluation Metrics Over Time') plt.legend() # Save the plot @@ -138,14 +140,18 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): plt.close() def main(): - if len(sys.argv) != 2: - print("Usage: python script.py ") - sys.exit(1) + # Set up argument parsing + parser = argparse.ArgumentParser(description='Plot testing performance metrics.') + parser.add_argument('-f', '--file', type=str, required=True, help='Path to the testing performance log file') + parser.add_argument('-e', '--experiment', type=str, required=True, help='Experiment number') + + args = parser.parse_args() - file_path = sys.argv[1] + file_path = args.file + experiment_number = args.experiment FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path) - plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values) + plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, experiment_number) if __name__ == "__main__": main() From e174fc4574b68e1aa2dedfdab223d3b42c60f282 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 19:14:51 +0000 Subject: [PATCH 222/455] Fix train plot --- .../flowmldetection/plot_train_performance.py | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/modules/flowmldetection/plot_train_performance.py b/modules/flowmldetection/plot_train_performance.py index 80e13e9515..244df13d28 100644 --- a/modules/flowmldetection/plot_train_performance.py +++ b/modules/flowmldetection/plot_train_performance.py @@ -5,7 +5,7 @@ import argparse import os -def plot_log_data(file_path): +def plot_log_data(file_path, experiment_number): # Read the log data from the file with open(file_path, 'r') as file: log_data = file.read() @@ -28,7 +28,8 @@ def plot_log_data(file_path): # Get the directory of the log file to store the plot in the same folder dir_name = os.path.dirname(file_path) - plot_file = os.path.join(dir_name, 'performance_metrics_training.png') + # Append experiment number to the filename + plot_file = os.path.join(dir_name, f'performance_metrics_training_{experiment_number}.png') # Plotting the values fig, ax1 = plt.subplots(figsize=(10, 6)) @@ -55,18 +56,18 @@ def plot_log_data(file_path): for i, value in enumerate(df["Total labels"]): ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom') - # Adding title and legend - plt.title('Training performance') + # Adding title and legend with experiment number in title + plt.title(f'Training performance - Experiment {experiment_number}') fig.tight_layout() # Move both legends further to the right - ax1.legend(loc='upper right', bbox_to_anchor=(1.26, 1), fontsize='small', ncol=1) - ax2.legend(loc='upper right', bbox_to_anchor=(1.4, 0.95), fontsize='small', ncol=1) + ax1.legend(loc='upper right', bbox_to_anchor=(1.3, 1), fontsize='small', ncol=1) + ax2.legend(loc='upper right', bbox_to_anchor=(1.3, 0.85), fontsize='small', ncol=1) # Increase right margin for better readability of legend - plt.subplots_adjust(right=0.7) + plt.subplots_adjust(right=0.75) - # Save plot to the same folder as the log file + # Save plot to the same folder as the log file with experiment number in filename plt.savefig(plot_file) # Display the plot @@ -75,13 +76,14 @@ def plot_log_data(file_path): def main(): # Parse command-line arguments parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.") - parser.add_argument('log_file', metavar='log_file', type=str, help="Path to the log file") + parser.add_argument('-f', '--file', metavar='log_file', type=str, required=True, help="Path to the log file") + parser.add_argument('-e', '--experiment', metavar='experiment_number', type=str, required=True, help="Experiment number to add to the filename") # Handle -h / --help args = parser.parse_args() # Call the function to process the log file - plot_log_data(args.log_file) + plot_log_data(args.file, args.experiment) if __name__ == "__main__": main() From e7fdbfdbd1b5c3de8bb60227c4e02454abe5c993 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 21:14:48 +0000 Subject: [PATCH 223/455] Fix plots --- .../flowmldetection/plot_train_performance.py | 122 ++++++++++-------- 1 file changed, 71 insertions(+), 51 deletions(-) diff --git a/modules/flowmldetection/plot_train_performance.py b/modules/flowmldetection/plot_train_performance.py index 244df13d28..5212dfeeaf 100644 --- a/modules/flowmldetection/plot_train_performance.py +++ b/modules/flowmldetection/plot_train_performance.py @@ -4,85 +4,105 @@ import sys import argparse import os +import matplotlib.ticker as ticker def plot_log_data(file_path, experiment_number): # Read the log data from the file with open(file_path, 'r') as file: log_data = file.read() - # Define regex pattern to extract relevant data from each line - pattern = r"Background: (\d+). Benign: (\d+). Malicious: (\d+). Total labels: (\d+\.\d+). Score: (\d+\.\d+)" + # Regex pattern for the new log format + pattern = ( + r"Total labels: ([\d\.]+), Background: (\d+). Benign: (\d+). Malicious: (\d+). Metrics: " + r"FPR=([\d\.]+), TNR=([\d\.]+), TPR=([\d\.]+), FNR=([\d\.]+), " + r"F1=([\d\.]+), Precision=([\d\.]+), Accuracy=([\d\.]+), MCC=([\d\.]+), Recall=([\d\.]+)\." + ) # Parse the log file data = re.findall(pattern, log_data) # Convert data to a DataFrame - df = pd.DataFrame(data, columns=["Background", "Benign", "Malicious", "Total labels", "Score"]) + columns = [ + "Total labels", "Background", "Benign", "Malicious", + "FPR", "TNR", "TPR", "FNR", "F1", "Precision", "Accuracy", "MCC", "Recall" + ] + df = pd.DataFrame(data, columns=columns) df = df.astype({ + "Total labels": float, "Background": int, "Benign": int, "Malicious": int, - "Total labels": float, - "Score": float + "FPR": float, + "TNR": float, + "TPR": float, + "FNR": float, + "F1": float, + "Precision": float, + "Accuracy": float, + "MCC": float, + "Recall": float, }) - # Get the directory of the log file to store the plot in the same folder dir_name = os.path.dirname(file_path) - # Append experiment number to the filename - plot_file = os.path.join(dir_name, f'performance_metrics_training_{experiment_number}.png') - - # Plotting the values - fig, ax1 = plt.subplots(figsize=(10, 6)) - # Plotting Score on the left y-axis (with proper scaling from 0 to 1) - ax1.plot(df.index, df["Score"], label="Score", color='tab:blue') + # --- Plot 1: Number of labels (linear scale, no total labels) --- + fig1, ax1 = plt.subplots(figsize=(10, 6)) + ax1.plot(df.index, df["Background"], label="Background", color='black') + ax1.plot(df.index, df["Benign"], label="Benign", color='cyan') + ax1.plot(df.index, df["Malicious"], label="Malicious", color='magenta') ax1.set_xlabel('Index') - ax1.set_ylabel('Score', color='tab:blue') - ax1.set_ylim(0, 1) # Set y-axis for Score from 0 to 1 - ax1.tick_params(axis='y', labelcolor='tab:blue') - - # Create the second y-axis for the Background, Benign, Malicious - ax2 = ax1.twinx() - ax2.plot(df.index, df["Background"], label="Background Labels", color='tab:green', linestyle='--') - ax2.plot(df.index, df["Benign"], label="Benign Labels", color='tab:orange', linestyle='--') - ax2.plot(df.index, df["Malicious"], label="Malicious Labels", color='tab:pink', linestyle='--') - ax2.set_ylabel('Background, Benign, Malicious Labels', color='tab:red') - - # Set appropriate scale for right y-axis based on the data - ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max()) - ax2.tick_params(axis='y', labelcolor='tab:red') - - # Annotating Total labels as text on the plot - for i, value in enumerate(df["Total labels"]): - ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom') - - # Adding title and legend with experiment number in title - plt.title(f'Training performance - Experiment {experiment_number}') - fig.tight_layout() + ax1.set_ylabel('Label Counts') + # No log scale here + ax1.set_title(f'Label Counts - Experiment {experiment_number}') + ax1.legend() + ax1.yaxis.set_major_locator(ticker.MaxNLocator(70)) + plt.tight_layout() + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_labels.png')) + + # --- Plot 2: FNR and FPR (log scale) --- + fig2, ax2 = plt.subplots(figsize=(10, 6)) + ax2.plot(df.index, df["FNR"], label="FNR", color='red') + ax2.plot(df.index, df["FPR"], label="FPR", color='blue') + ax2.set_xlabel('Index') + ax2.set_ylabel('Rate') + ax2.set_yscale('log') + ax2.set_title(f'FNR and FPR - Experiment {experiment_number}') + ax2.legend() + ax2.yaxis.set_major_locator(ticker.MaxNLocator(100)) + plt.tight_layout() + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_fnr_fpr.png')) + + # --- Plot 3: Other metrics (log scale) --- + fig3, ax3 = plt.subplots(figsize=(12, 7)) + metrics_rest = ["TNR", "TPR", "F1", "Precision", "Accuracy", "MCC", "Recall"] + colors_rest = [ + 'tab:blue', 'tab:green', 'tab:purple', 'tab:brown', + 'tab:gray', 'tab:pink', 'tab:olive' + ] + for metric, color in zip(metrics_rest, colors_rest): + ax3.plot(df.index, df[metric], label=metric, color=color) + ax3.set_xlabel('Index') + ax3.set_ylabel('Metric Value') + ax3.set_yscale('log') + ax3.set_title(f'Performance Metrics (except FNR/FPR) - Experiment {experiment_number}') + ax3.legend() + ax3.yaxis.set_major_locator(ticker.MaxNLocator(50)) + plt.tight_layout() + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_other_metrics.png')) - # Move both legends further to the right - ax1.legend(loc='upper right', bbox_to_anchor=(1.3, 1), fontsize='small', ncol=1) - ax2.legend(loc='upper right', bbox_to_anchor=(1.3, 0.85), fontsize='small', ncol=1) - - # Increase right margin for better readability of legend - plt.subplots_adjust(right=0.75) - - # Save plot to the same folder as the log file with experiment number in filename - plt.savefig(plot_file) - - # Display the plot plt.show() + # --- Print final values in terminal --- + print("\nFinal values at last training step:") + for col in ["Total labels", "Background", "Benign", "Malicious", + "FPR", "TNR", "TPR", "FNR", "F1", "Precision", "Accuracy", "MCC", "Recall"]: + print(f"{col}: {df[col].iloc[-1]}") + def main(): - # Parse command-line arguments parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.") parser.add_argument('-f', '--file', metavar='log_file', type=str, required=True, help="Path to the log file") parser.add_argument('-e', '--experiment', metavar='experiment_number', type=str, required=True, help="Experiment number to add to the filename") - - # Handle -h / --help args = parser.parse_args() - - # Call the function to process the log file plot_log_data(args.file, args.experiment) if __name__ == "__main__": From fdbbbb5e9b127117ca089dab05bd1fe49f4e5508 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 21:16:01 +0000 Subject: [PATCH 224/455] Add performance metrics to the training evaluation --- modules/flowmldetection/flowmldetection.py | 58 +++++++++++++++++----- 1 file changed, 46 insertions(+), 12 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index b17a1baaf0..2c60cd4034 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -10,7 +10,16 @@ import json import traceback import warnings -import os +from sklearn.metrics import classification_report, confusion_matrix +from sklearn.metrics import ( + confusion_matrix, + f1_score, + precision_score, + accuracy_score, + matthews_corrcoef, + recall_score, +) + from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils @@ -86,21 +95,21 @@ def write_to_training_log(self, message: str): except Exception as e: self.print(f"Error writing to training log: {e}", 0, 1) - def train(self, sum_labeled_flows): + def train(self, sum_labeled_flows, last_number_of_flows_when_trained): """ Train a model based on the flows we receive and the labels """ try: + # Create y_flow with the label + y_flow = numpy.full(self.flows.shape[0], self.flows.ground_truth_label) # Create X_flow with the current flows minus the label X_flow = self.flows.drop("ground_truth_label", axis=1) # Drop the detailed labels X_flow = X_flow.drop("detailed_ground_truth_label", axis=1) # Drop the module_labels X_flow = X_flow.drop("module_labels", axis=1) - # Create y_flow with the label - y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label) - # Normalize this batch of data so far. This can get progressivle slow + # Normalize this batch of data so far. This can get progressively slow X_flow = self.scaler.fit_transform(X_flow) # Count the number of labels of each type in this epoc @@ -120,18 +129,43 @@ def train(self, sum_labeled_flows): self.print("Error while calling clf.train()") self.print(traceback.format_exc(), 0, 1) - # See score so far in training - score = self.clf.score(X_flow, y_flow) + # Predict on the training data + y_pred = self.clf.predict(X_flow) - #self.print(f" Training Score: {score}", 1, 0) - #self.print(f' Model Parameters: {self.clf.coef_}', 1, 0) + # For metrics, let's focus on Malicious vs Benign (ignore Background) + mask = (y_flow == "Malicious") | (y_flow == "Benign") + y_true_bin = y_flow[mask] + y_pred_bin = y_pred[mask] + + # Map to binary: Malicious=1, Benign=0 + y_true_bin = numpy.where(y_true_bin == "Malicious", 1, 0) + y_pred_bin = numpy.where(y_pred_bin == "Malicious", 1, 0) + + # Compute confusion matrix: tn, fp, fn, tp + tn, fp, fn, tp = confusion_matrix(y_true_bin, y_pred_bin, labels=[0,1]).ravel() if len(set(y_true_bin)) > 1 else (0,0,0,0) + + # Compute metrics + FPR = fp / (fp + tn) if (fp + tn) > 0 else 0 + TNR = tn / (tn + fp) if (tn + fp) > 0 else 0 + TPR = tp / (tp + fn) if (tp + fn) > 0 else 0 + FNR = fn / (fn + tp) if (fn + tp) > 0 else 0 + F1 = f1_score(y_true_bin, y_pred_bin, zero_division=0) + PREC = precision_score(y_true_bin, y_pred_bin, zero_division=0) + ACCU = accuracy_score(y_true_bin, y_pred_bin) + MCC = matthews_corrcoef(y_true_bin, y_pred_bin) if len(set(y_true_bin)) > 1 else 0 + RECALL = recall_score(y_true_bin, y_pred_bin, zero_division=0) # Store the models on disk self.store_model() # Log training information - self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}") - #self.write_to_training_log(f"Model parameters: {self.clf.coef_}") + self.write_to_training_log( + f"Total labels: {sum_labeled_flows}, " + f"Background: {epoch_label_counts['Background']}. " + f"Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. " + f"Metrics: FPR={FPR:.4f}, TNR={TNR:.4f}, TPR={TPR:.4f}, FNR={FNR:.4f}, " + f"F1={F1:.4f}, Precision={PREC:.4f}, Accuracy={ACCU:.4f}, MCC={MCC:.4f}, Recall={RECALL:.4f}." + ) except Exception: self.print("Error in train().", 0, 1) self.print(traceback.format_exc(), 0, 1) @@ -520,7 +554,7 @@ def main(self): # for pandas self.process_training_flows(self.last_number_of_flows_when_trained) # Train an algorithm - self.train(sum_labeled_flows) + self.train(sum_labeled_flows, self.last_number_of_flows_when_trained) self.last_number_of_flows_when_trained = sum_labeled_flows elif self.mode == "test": From b7b2477f4939479d223c699e240cf3f6a33d2c10 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sun, 4 May 2025 12:50:46 +0000 Subject: [PATCH 225/455] Fix experiment names --- modules/flowmldetection/plot_train_performance.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/modules/flowmldetection/plot_train_performance.py b/modules/flowmldetection/plot_train_performance.py index 5212dfeeaf..304f0f4ead 100644 --- a/modules/flowmldetection/plot_train_performance.py +++ b/modules/flowmldetection/plot_train_performance.py @@ -52,12 +52,12 @@ def plot_log_data(file_path, experiment_number): ax1.plot(df.index, df["Malicious"], label="Malicious", color='magenta') ax1.set_xlabel('Index') ax1.set_ylabel('Label Counts') - # No log scale here ax1.set_title(f'Label Counts - Experiment {experiment_number}') ax1.legend() ax1.yaxis.set_major_locator(ticker.MaxNLocator(70)) + ax1.xaxis.set_major_locator(ticker.MaxNLocator(50)) plt.tight_layout() - plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_labels.png')) + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_labels.png')) # --- Plot 2: FNR and FPR (log scale) --- fig2, ax2 = plt.subplots(figsize=(10, 6)) @@ -69,8 +69,9 @@ def plot_log_data(file_path, experiment_number): ax2.set_title(f'FNR and FPR - Experiment {experiment_number}') ax2.legend() ax2.yaxis.set_major_locator(ticker.MaxNLocator(100)) + ax2.xaxis.set_major_locator(ticker.MaxNLocator(50)) plt.tight_layout() - plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_fnr_fpr.png')) + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_fnr_fpr.png')) # --- Plot 3: Other metrics (log scale) --- fig3, ax3 = plt.subplots(figsize=(12, 7)) @@ -87,8 +88,9 @@ def plot_log_data(file_path, experiment_number): ax3.set_title(f'Performance Metrics (except FNR/FPR) - Experiment {experiment_number}') ax3.legend() ax3.yaxis.set_major_locator(ticker.MaxNLocator(50)) + ax3.xaxis.set_major_locator(ticker.MaxNLocator(50)) plt.tight_layout() - plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_other_metrics.png')) + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_other_metrics.png')) plt.show() From 27b2b567ea395023664434d1bbb11819e3625776 Mon Sep 17 00:00:00 2001 From: alya Date: Mon, 5 May 2025 15:24:12 +0300 Subject: [PATCH 226/455] test_profiler: update unit tests --- tests/test_profiler.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_profiler.py b/tests/test_profiler.py index b967c7880f..e62bdd8e74 100644 --- a/tests/test_profiler.py +++ b/tests/test_profiler.py @@ -481,7 +481,6 @@ def test_read_configuration( mock_conf.local_whitelist_path.return_value = "path/to/whitelist" mock_conf.ts_format.return_value = "unixtimestamp" mock_conf.analysis_direction.return_value = "all" - mock_conf.label.return_value = "malicious" mock_conf.get_tw_width_as_float.return_value = 1.0 mock_conf.client_ips.return_value = ["192.168.1.1", "10.0.0.1"] @@ -490,7 +489,6 @@ def test_read_configuration( assert profiler.local_whitelist_path == "path/to/whitelist" assert profiler.timeformat == "unixtimestamp" assert profiler.analysis_direction == "all" - assert profiler.label == "malicious" assert profiler.width == 1.0 assert profiler.client_ips == ["192.168.1.1", "10.0.0.1"] From 2c9fea74846d842820fa227c36742c1f91eb153e Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Mon, 5 May 2025 16:43:05 +0000 Subject: [PATCH 227/455] Fix that the training and testing logs files were appened instead of rewritten --- modules/flowmldetection/flowmldetection.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 2c60cd4034..9a920b4e25 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -90,7 +90,7 @@ def write_to_training_log(self, message: str): Write a message to the training log file. """ try: - with open(self.training_log_path, "a") as log_file: + with open(self.training_log_path, "w") as log_file: log_file.write(message + "\n") except Exception as e: self.print(f"Error writing to training log: {e}", 0, 1) @@ -610,8 +610,7 @@ def main(self): testing_log_path = "./modules/flowmldetection/testing_performance.log" try: - with open(testing_log_path, "a") as log_file: - log_file.write("Testing Performance Log Initialized\n") + with open(testing_log_path, "w") as log_file: # Log the testing performance metrics log_file.write(f"TP: {self.tp}, TN: {self.tn}, FP: {self.fp}, FN: {self.fn}\n") From f5b28994ab20da76a77c42ebea793d31f81d9850 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Mon, 5 May 2025 22:45:16 +0000 Subject: [PATCH 228/455] Fix an issue of storing the new log files --- modules/flowmldetection/flowmldetection.py | 49 ++++++++++------------ 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 9a920b4e25..9139066f08 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -72,11 +72,19 @@ def init(self): self.scaler = StandardScaler() self.model_path = "./modules/flowmldetection/model.bin" self.scaler_path = "./modules/flowmldetection/scaler.bin" - - # Initialize the training log file - self.training_log_path = "./modules/flowmldetection/training.log" - with open(self.training_log_path, "w") as log_file: - log_file.write("Training Log Initialized\n") + self.init_log_file() + + def init_log_file(self): + """ + Init the log file for training or testing + """ + if self.mode == "train": + # Initialize the training log file + self.log_path = "./modules/flowmldetection/training.log" + elif self.mode == "test": + # Initialize the testing log file + self.log_path = "./modules/flowmldetection/testing.log" + self.log_file = open(self.log_path, "w") def read_configuration(self): conf = ConfigParser() @@ -85,15 +93,14 @@ def read_configuration(self): # in case the flows do not have a label themselves self.label = conf.label() - def write_to_training_log(self, message: str): + def write_to_log(self, message: str): """ - Write a message to the training log file. + Write a message to the local log file. """ try: - with open(self.training_log_path, "w") as log_file: - log_file.write(message + "\n") + self.log_file.write(message + "\n") except Exception as e: - self.print(f"Error writing to training log: {e}", 0, 1) + self.print(f"Error writing to log: {e}", 0, 1) def train(self, sum_labeled_flows, last_number_of_flows_when_trained): """ @@ -159,7 +166,7 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained): self.store_model() # Log training information - self.write_to_training_log( + self.write_to_log( f"Total labels: {sum_labeled_flows}, " f"Background: {epoch_label_counts['Background']}. " f"Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. " @@ -169,7 +176,7 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained): except Exception: self.print("Error in train().", 0, 1) self.print(traceback.format_exc(), 0, 1) - self.write_to_training_log("Error occurred during training.") + self.write_to_log("Error occurred during training.") def process_features(self, dataset): """ @@ -597,7 +604,6 @@ def main(self): if not hasattr(self, 'fn'): self.fn = 0 - # Update counters based on predictions and labels if pred[0] == "Malicious" and original_label == "Malicious": self.tp += 1 @@ -605,19 +611,10 @@ def main(self): self.tn += 1 elif pred[0] == "Malicious" and original_label == "Benign": self.fp += 1 + self.write_to_log(f"False Positive Flow: {self.flow}") elif pred[0] == "Benign" and original_label == "Malicious": self.fn += 1 + self.write_to_log(f"False Negative Flow: {self.flow}") - testing_log_path = "./modules/flowmldetection/testing_performance.log" - try: - with open(testing_log_path, "w") as log_file: - # Log the testing performance metrics - log_file.write(f"TP: {self.tp}, TN: {self.tn}, FP: {self.fp}, FN: {self.fn}\n") - - # Log the original flow for false positives and false negatives - if pred[0] == "Malicious" and original_label == "Benign": - log_file.write(f"False Positive Flow: {self.flow}\n") - elif pred[0] == "Benign" and original_label == "Malicious": - log_file.write(f"False Negative Flow: {self.flow}\n") - except Exception as e: - self.print(f"Error initializing testing performance log: {e}", 0, 1) \ No newline at end of file + # Log the testing performance metrics + self.write_to_log(f"TP: {self.tp}, TN: {self.tn}, FP: {self.fp}, FN: {self.fn}") \ No newline at end of file From 1e6d0d18f18e03a35ba414072ad58c4d033b4383 Mon Sep 17 00:00:00 2001 From: alya Date: Sat, 10 May 2025 16:21:08 +0300 Subject: [PATCH 229/455] enable/ disable training and testing.log with a param in the config file --- .secrets.baseline | 6 +- config/slips.yaml | 3 + modules/flowmldetection/flowmldetection.py | 140 +++++++++++++------- modules/riskiq/riskiq.py | 2 +- modules/update_manager/update_manager.py | 2 +- slips_files/common/parsers/config_parser.py | 7 +- 6 files changed, 109 insertions(+), 51 deletions(-) diff --git a/.secrets.baseline b/.secrets.baseline index 37fe2abcba..aa5615109c 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -149,14 +149,14 @@ "filename": "config/slips.yaml", "hashed_secret": "4cac50cee3ad8e462728e711eac3e670753d5016", "is_verified": false, - "line_number": 224 + "line_number": 226 }, { "type": "Secret Keyword", "filename": "config/slips.yaml", "hashed_secret": "d033e22ae348aeb5660fc2140aec35850c4da997", "is_verified": false, - "line_number": 394 + "line_number": 396 } ], "dataset/test14-malicious-zeek-dir/http.log": [ @@ -7192,5 +7192,5 @@ } ] }, - "generated_at": "2025-02-13T22:47:52Z" + "generated_at": "2025-05-10T13:18:46Z" } diff --git a/config/slips.yaml b/config/slips.yaml index 8736eaf511..dabb388c09 100644 --- a/config/slips.yaml +++ b/config/slips.yaml @@ -214,6 +214,9 @@ flowmldetection: # You should have trained at least once with 'Normal' data and once with # 'Malicious' data in order for the test to work. mode: test + # creates an extra log file called training.log/testing.log in the + # ouptput dir with performance metrics depending on the mode. + create_performance_metrics_log_files: False ############################# virustotal: diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 9139066f08..2a515d0cfa 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -10,9 +10,8 @@ import json import traceback import warnings -from sklearn.metrics import classification_report, confusion_matrix +from sklearn.metrics import confusion_matrix from sklearn.metrics import ( - confusion_matrix, f1_score, precision_score, accuracy_score, @@ -37,6 +36,7 @@ Method, ) + # This horrible hack is only to stop sklearn from printing those warnings def warn(*args, **kwargs): pass @@ -73,7 +73,7 @@ def init(self): self.model_path = "./modules/flowmldetection/model.bin" self.scaler_path = "./modules/flowmldetection/scaler.bin" self.init_log_file() - + def init_log_file(self): """ Init the log file for training or testing @@ -92,11 +92,16 @@ def read_configuration(self): # This is the global label in the configuration, # in case the flows do not have a label themselves self.label = conf.label() + self.enable_logs: bool = conf.create_performance_metrics_log_files() def write_to_log(self, message: str): """ - Write a message to the local log file. + Write a message to the local log file if + create_performance_metrics_log_files is enabled in slips.yaml """ + if not self.enable_logs: + return + try: self.log_file.write(message + "\n") except Exception as e: @@ -108,7 +113,9 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained): """ try: # Create y_flow with the label - y_flow = numpy.full(self.flows.shape[0], self.flows.ground_truth_label) + y_flow = numpy.full( + self.flows.shape[0], self.flows.ground_truth_label + ) # Create X_flow with the current flows minus the label X_flow = self.flows.drop("ground_truth_label", axis=1) # Drop the detailed labels @@ -130,7 +137,9 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained): try: # Online incremental learning self.clf.partial_fit( - X_flow, y_flow, classes=["Background", "Malicious", "Benign"] + X_flow, + y_flow, + classes=["Background", "Malicious", "Benign"], ) except Exception: self.print("Error while calling clf.train()") @@ -149,7 +158,11 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained): y_pred_bin = numpy.where(y_pred_bin == "Malicious", 1, 0) # Compute confusion matrix: tn, fp, fn, tp - tn, fp, fn, tp = confusion_matrix(y_true_bin, y_pred_bin, labels=[0,1]).ravel() if len(set(y_true_bin)) > 1 else (0,0,0,0) + tn, fp, fn, tp = ( + confusion_matrix(y_true_bin, y_pred_bin, labels=[0, 1]).ravel() + if len(set(y_true_bin)) > 1 + else (0, 0, 0, 0) + ) # Compute metrics FPR = fp / (fp + tn) if (fp + tn) > 0 else 0 @@ -159,7 +172,11 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained): F1 = f1_score(y_true_bin, y_pred_bin, zero_division=0) PREC = precision_score(y_true_bin, y_pred_bin, zero_division=0) ACCU = accuracy_score(y_true_bin, y_pred_bin) - MCC = matthews_corrcoef(y_true_bin, y_pred_bin) if len(set(y_true_bin)) > 1 else 0 + MCC = ( + matthews_corrcoef(y_true_bin, y_pred_bin) + if len(set(y_true_bin)) > 1 + else 0 + ) RECALL = recall_score(y_true_bin, y_pred_bin, zero_division=0) # Store the models on disk @@ -189,7 +206,8 @@ def process_features(self, dataset): for proto in to_discard: dataset = dataset[dataset.proto != proto] - # If te proto is in the list to delete and there is only one flow, then the dataset will be empty + # If te proto is in the list to delete and there is only one flow, + # then the dataset will be empty if dataset.empty: # DataFrame is empty now, so return empty return dataset @@ -295,7 +313,9 @@ def process_training_flows(self, last_number_of_flows_when_trained): if last_number_of_flows_when_trained is None: last_number_of_flows_when_trained = 0 else: - last_number_of_flows_when_trained = int(last_number_of_flows_when_trained) + last_number_of_flows_when_trained = int( + last_number_of_flows_when_trained + ) # We get all the flows so far flows = self.db.get_all_flows() @@ -399,21 +419,21 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: ] # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. # Error - ''' [Flow ML Detection] Error in detect() while processing - dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes - 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 - The feature names should match those that were passed during fit. - Feature names unseen at fit time: - - bytes - ''' + """ [Flow ML Detection] Error in detect() while processing + dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes + 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 + The feature names should match those that were passed during fit. + Feature names unseen at fit time: + - bytes + """ # IF we delete here the filed bytes the error is - # [Flow ML Detection] Error in detect() while processing - # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes - # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 - # The feature names should match those that were passed during fit. - # Feature names must be in the same order as they were in fit. - + # [Flow ML Detection] Error in detect() while processing + # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes + # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 + # The feature names should match those that were passed during fit. + # Feature names must be in the same order as they were in fit. + for field in fields_to_drop: try: x_flow = x_flow.drop(field, axis=1) @@ -540,17 +560,19 @@ def main(self): labels = self.db.get_labels() sum_labeled_flows = sum(i[1] for i in labels) - # The min labels to retrain is the min number of flows + # The min labels to retrain is the min number of flows # we should have seen so far in this capture to start training # This is so we dont _start_ training with only 1 flow - # Once we are over the start minimum, the second condition is + # Once we are over the start minimum, the second condition is # to force to retrain every a minimum_labels_to_retrain number # of flows. So we dont retrain every 1 flow. - if ( - sum_labeled_flows >= self.minimum_labels_to_start_train - ): - if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain): + if sum_labeled_flows >= self.minimum_labels_to_start_train: + if ( + sum_labeled_flows + - self.last_number_of_flows_when_trained + >= self.minimum_labels_to_retrain + ): # So for example we retrain every 50 labels and only when # we have at least 50 labels self.print( @@ -559,10 +581,17 @@ def main(self): ) # Process all flows in the DB and make them ready # for pandas - self.process_training_flows(self.last_number_of_flows_when_trained) + self.process_training_flows( + self.last_number_of_flows_when_trained + ) # Train an algorithm - self.train(sum_labeled_flows, self.last_number_of_flows_when_trained) - self.last_number_of_flows_when_trained = sum_labeled_flows + self.train( + sum_labeled_flows, + self.last_number_of_flows_when_trained, + ) + self.last_number_of_flows_when_trained = ( + sum_labeled_flows + ) elif self.mode == "test": # We are testing, which means using the model to detect @@ -570,7 +599,9 @@ def main(self): # After processing the flow, it may happen that we # delete icmp/arp/etc so the dataframe can be empty if processed_flow is not None and not processed_flow.empty: - original_label = processed_flow["ground_truth_label"].iloc[0] + original_label = processed_flow["ground_truth_label"].iloc[ + 0 + ] # Predict pred: numpy.ndarray = self.detect(processed_flow) if not pred: @@ -591,30 +622,49 @@ def main(self): 2, ) - # So you can disable this code easily. Since it is used only for evaluating a testing + # So you can disable this code easily. Since it is used + # only for evaluating a testing log_testing_data = True if log_testing_data: # Initialize counters if not already done - if not hasattr(self, 'tp'): + if not hasattr(self, "tp"): self.tp = 0 - if not hasattr(self, 'tn'): + if not hasattr(self, "tn"): self.tn = 0 - if not hasattr(self, 'fp'): + if not hasattr(self, "fp"): self.fp = 0 - if not hasattr(self, 'fn'): + if not hasattr(self, "fn"): self.fn = 0 # Update counters based on predictions and labels - if pred[0] == "Malicious" and original_label == "Malicious": + if ( + pred[0] == "Malicious" + and original_label == "Malicious" + ): self.tp += 1 - elif pred[0] == "Benign" and original_label == "Benign": + elif ( + pred[0] == "Benign" and original_label == "Benign" + ): self.tn += 1 - elif pred[0] == "Malicious" and original_label == "Benign": + elif ( + pred[0] == "Malicious" + and original_label == "Benign" + ): self.fp += 1 - self.write_to_log(f"False Positive Flow: {self.flow}") - elif pred[0] == "Benign" and original_label == "Malicious": + self.write_to_log( + f"False Positive Flow: {self.flow}" + ) + elif ( + pred[0] == "Benign" + and original_label == "Malicious" + ): self.fn += 1 - self.write_to_log(f"False Negative Flow: {self.flow}") + self.write_to_log( + f"False Negative Flow: {self.flow}" + ) # Log the testing performance metrics - self.write_to_log(f"TP: {self.tp}, TN: {self.tn}, FP: {self.fp}, FN: {self.fn}") \ No newline at end of file + self.write_to_log( + f"TP: {self.tp}, TN: {self.tn}," + f" FP: {self.fp}, FN: {self.fn}" + ) diff --git a/modules/riskiq/riskiq.py b/modules/riskiq/riskiq.py index 5abf2ddb19..7b5653997e 100644 --- a/modules/riskiq/riskiq.py +++ b/modules/riskiq/riskiq.py @@ -25,7 +25,7 @@ def init(self): def read_configuration(self): conf = ConfigParser() - risk_iq_credentials_path = conf.RiskIQ_credentials_path() + risk_iq_credentials_path = conf.risk_iq_credentials_path() try: with open(risk_iq_credentials_path, "r") as f: self.riskiq_email = f.readline().replace("\n", "") diff --git a/modules/update_manager/update_manager.py b/modules/update_manager/update_manager.py index c6bf0013eb..2de0abf8eb 100644 --- a/modules/update_manager/update_manager.py +++ b/modules/update_manager/update_manager.py @@ -119,7 +119,7 @@ def read_riskiq_creds(risk_iq_credentials_path): self.ssl_feeds_path = conf.ssl_feeds() self.ssl_feeds = self.get_feed_details(self.ssl_feeds_path) - risk_iq_credentials_path = conf.RiskIQ_credentials_path() + risk_iq_credentials_path = conf.risk_iq_credentials_path() read_riskiq_creds(risk_iq_credentials_path) self.riskiq_update_period = conf.riskiq_update_period() diff --git a/slips_files/common/parsers/config_parser.py b/slips_files/common/parsers/config_parser.py index 40f1b044bc..e208f78816 100644 --- a/slips_files/common/parsers/config_parser.py +++ b/slips_files/common/parsers/config_parser.py @@ -418,7 +418,12 @@ def data_exfiltration_threshold(self): def get_ml_mode(self): return self.read_configuration("flowmldetection", "mode", "test") - def RiskIQ_credentials_path(self): + def create_performance_metrics_log_files(self) -> bool: + return self.read_configuration( + "flowmldetection", "create_performance_metrics_log_files", False + ) + + def risk_iq_credentials_path(self): return self.read_configuration( "threatintelligence", "RiskIQ_credentials_path", "" ) From 65206b61a2009dfebd8bdc938ffe0a23fd90c943 Mon Sep 17 00:00:00 2001 From: alya Date: Sat, 10 May 2025 16:23:58 +0300 Subject: [PATCH 230/455] dont create an empty logfile when create_performance_metrics_log_files is set to false --- modules/flowmldetection/flowmldetection.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 2a515d0cfa..9305197d3e 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -78,6 +78,9 @@ def init_log_file(self): """ Init the log file for training or testing """ + if not self.enable_logs: + return + if self.mode == "train": # Initialize the training log file self.log_path = "./modules/flowmldetection/training.log" From cdbf9d386f4c4063bbf237e952bbadafef307d7f Mon Sep 17 00:00:00 2001 From: alya Date: Sat, 10 May 2025 16:29:30 +0300 Subject: [PATCH 231/455] when enabled, create testing.log or training.log in the current output dir --- modules/flowmldetection/flowmldetection.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 9305197d3e..f618195bce 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: GPL-2.0-only import numpy +import os from sklearn.linear_model import SGDClassifier from sklearn.preprocessing import StandardScaler import pickle @@ -83,10 +84,10 @@ def init_log_file(self): if self.mode == "train": # Initialize the training log file - self.log_path = "./modules/flowmldetection/training.log" + self.log_path = os.path.join(self.output_dir, "training.log") elif self.mode == "test": # Initialize the testing log file - self.log_path = "./modules/flowmldetection/testing.log" + self.log_path = os.path.join(self.output_dir, "testing.log") self.log_file = open(self.log_path, "w") def read_configuration(self): From 68e588ab828fbd2da1b0251e3e3c2fd00f736796 Mon Sep 17 00:00:00 2001 From: alya Date: Sat, 10 May 2025 16:43:32 +0300 Subject: [PATCH 232/455] Add an enum called labels with either Benign or Malicious so the labels are unified. --- modules/flowmldetection/flowmldetection.py | 65 +++++++++++----------- slips_files/core/structures/labels.py | 11 ++++ 2 files changed, 43 insertions(+), 33 deletions(-) create mode 100644 slips_files/core/structures/labels.py diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index f618195bce..e828058ee4 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -19,11 +19,10 @@ matthews_corrcoef, recall_score, ) - - from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils from slips_files.common.abstracts.module import IModule +from slips_files.core.structures.labels import Label from slips_files.core.structures.evidence import ( Evidence, ProfileID, @@ -45,6 +44,10 @@ def warn(*args, **kwargs): warnings.warn = warn +BACKGROUND = Label.BACKGROUND.name +BENIGN = Label.BENIGN.name +MALICIOUS = Label.MALICIOUS.name + class FlowMLDetection(IModule): # Name: short name of the module. Do not use spaces @@ -132,9 +135,9 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained): # Count the number of labels of each type in this epoc epoch_label_counts = { - "Background": (y_flow == "Background").sum(), - "Malicious": (y_flow == "Malicious").sum(), - "Benign": (y_flow == "Benign").sum(), + BACKGROUND: (y_flow == BACKGROUND).sum(), + MALICIOUS: (y_flow == MALICIOUS).sum(), + BENIGN: (y_flow == BENIGN).sum(), } # Train @@ -143,7 +146,7 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained): self.clf.partial_fit( X_flow, y_flow, - classes=["Background", "Malicious", "Benign"], + classes=[BACKGROUND, MALICIOUS, BENIGN], ) except Exception: self.print("Error while calling clf.train()") @@ -153,13 +156,13 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained): y_pred = self.clf.predict(X_flow) # For metrics, let's focus on Malicious vs Benign (ignore Background) - mask = (y_flow == "Malicious") | (y_flow == "Benign") + mask = (y_flow == MALICIOUS) | (y_flow == BENIGN) y_true_bin = y_flow[mask] y_pred_bin = y_pred[mask] # Map to binary: Malicious=1, Benign=0 - y_true_bin = numpy.where(y_true_bin == "Malicious", 1, 0) - y_pred_bin = numpy.where(y_pred_bin == "Malicious", 1, 0) + y_true_bin = numpy.where(y_true_bin == MALICIOUS, 1, 0) + y_pred_bin = numpy.where(y_pred_bin == MALICIOUS, 1, 0) # Compute confusion matrix: tn, fp, fn, tp tn, fp, fn, tp = ( @@ -190,9 +193,12 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained): self.write_to_log( f"Total labels: {sum_labeled_flows}, " f"Background: {epoch_label_counts['Background']}. " - f"Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. " - f"Metrics: FPR={FPR:.4f}, TNR={TNR:.4f}, TPR={TPR:.4f}, FNR={FNR:.4f}, " - f"F1={F1:.4f}, Precision={PREC:.4f}, Accuracy={ACCU:.4f}, MCC={MCC:.4f}, Recall={RECALL:.4f}." + f"Benign: {epoch_label_counts['Benign']}. " + f"Malicious: {epoch_label_counts[MALICIOUS]}. " + f"Metrics: FPR={FPR:.4f}, TNR={TNR:.4f}, " + f"TPR={TPR:.4f}, FNR={FNR:.4f}, " + f"F1={F1:.4f}, Precision={PREC:.4f}, " + f"Accuracy={ACCU:.4f}, MCC={MCC:.4f}, Recall={RECALL:.4f}." ) except Exception: self.print("Error in train().", 0, 1) @@ -345,9 +351,9 @@ def process_training_flows(self, last_number_of_flows_when_trained): "sbytes": 25517, "dbytes": 17247, "appproto": "ssl", - "ground_truth_label": "Malicious", + "ground_truth_label": MALICIOUS, "module_labels": { - "flowalerts-long-connection": "Malicious" + "flowalerts-long-connection": MALICIOUS }, } ) @@ -366,9 +372,9 @@ def process_training_flows(self, last_number_of_flows_when_trained): "sbytes": 100, "dbytes": 67596, "appproto": "http", - "ground_truth_label": "Benign", + "ground_truth_label": BENIGN, "module_labels": { - "flowalerts-long-connection": "Benign" + "flowalerts-long-connection": BENIGN }, } ) @@ -421,7 +427,8 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "ground_truth_label", "detailed_ground_truth_label", ] - # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. + # For argus binetflows this fails because ther is a field calle + # bytes that was not in other flows. It should be called allbytes. # Error """ [Flow ML Detection] Error in detect() while processing dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes @@ -546,8 +553,8 @@ def main(self): self.twid = msg["twid"] self.profileid = msg["profileid"] self.flow = msg["flow"] - # These following extra fields are expected in testing. update the original - # flow dict to have them + # These following extra fields are expected in testing. + # update the original flow dict to have them self.flow.update( { "state": msg["interpreted_state"], @@ -612,7 +619,7 @@ def main(self): # an error occurred return - if pred[0] == "Malicious": + if pred[0] == MALICIOUS: # Generate an alert self.set_evidence_malicious_flow(self.flow, self.twid) self.print( @@ -642,26 +649,18 @@ def main(self): # Update counters based on predictions and labels if ( - pred[0] == "Malicious" - and original_label == "Malicious" + pred[0] == MALICIOUS + and original_label == MALICIOUS ): self.tp += 1 - elif ( - pred[0] == "Benign" and original_label == "Benign" - ): + elif pred[0] == BENIGN and original_label == BENIGN: self.tn += 1 - elif ( - pred[0] == "Malicious" - and original_label == "Benign" - ): + elif pred[0] == MALICIOUS and original_label == BENIGN: self.fp += 1 self.write_to_log( f"False Positive Flow: {self.flow}" ) - elif ( - pred[0] == "Benign" - and original_label == "Malicious" - ): + elif pred[0] == BENIGN and original_label == MALICIOUS: self.fn += 1 self.write_to_log( f"False Negative Flow: {self.flow}" diff --git a/slips_files/core/structures/labels.py b/slips_files/core/structures/labels.py new file mode 100644 index 0000000000..b1dc64234e --- /dev/null +++ b/slips_files/core/structures/labels.py @@ -0,0 +1,11 @@ +from enum import Enum + + +class Label(Enum): + """ + label of flows should be one of the following + """ + + MALICIOUS = "Malicious" + BENIGN = "Benign" + BACKGROUND = "Background" From 705f63d56c98f536e52a1b0cd0c02836c14aa4b4 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 233/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 154 ++++++++++++++++++++- 1 file changed, 150 insertions(+), 4 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index e44ac83f4d..16b67e9038 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -10,6 +10,7 @@ import json import traceback import warnings +import sys from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils @@ -120,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -132,7 +268,7 @@ def process_features(self, dataset): for proto in to_discard: dataset = dataset[dataset.proto != proto] - # For now, discard the ports + # For now, discard these to_drop = [ "appproto", "daddr", @@ -155,15 +291,25 @@ def process_features(self, dataset): except (ValueError, KeyError): pass + # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # So transform here + #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) + dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) + + #dataset.state = new_state_column + # Convert state to categorical dataset.state = dataset.state.str.replace( - r"(^.*NotEstablished.*$)", "0", regex=True + r"(^.*Not Established.*$)", "0", regex=True ) dataset.state = dataset.state.str.replace( r"(^.*Established.*$)", "1", regex=True ) - # Convert proto to categorical. For now we only have few states, - # so we can hardcode... + + # Convert categories to floats + dataset.state = dataset.state.astype("float64") + + # Convert proto to categorical. For now we only have few states, so we can hardcode... # We dont use the data to create categories because in testing mode # we dont see all the protocols # Also we dont store the Categorizer because the user can retrain From b690ea70e919e7ca95227684396e811a349dd771 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:35:21 +0300 Subject: [PATCH 234/455] state_handler: split get_final_state_from_flags() into smaller functions --- slips_files/common/state_handler.py | 179 ++++++++++++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 slips_files/common/state_handler.py diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py new file mode 100644 index 0000000000..b671a09a28 --- /dev/null +++ b/slips_files/common/state_handler.py @@ -0,0 +1,179 @@ +from typing import Optional +import sys +import traceback + + +def check_suricata_states(state) -> Optional[str]: + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for + UDP. For TCP, + these are: New, Established and Closed,for UDP only new and + established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + +def check_zeek_states(state) -> Optional[str]: + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + +def check_argus_states(state) -> Optional[str]: + pre = state.split("_")[0] + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + + +def check_tcp_states(state, pkts) -> Optional[str]: + pre = state.split("_")[0] + if "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. + # Most connections are reseted when finished and therefore are + # established + # It can happen that is reseted being not established, but we + # can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is + # not established because the OS retries 3 times. + return "Not Established" if int(pkts) <= 3 else "Established" + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. + # Most connections are finished with FIN when finished and + # therefore are established + # It can happen that is finished being not established, but we + # can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is + # not established because the OS retries 3 times. + return "Not Established" if int(pkts) <= 3 else "Established" + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + + +def check_udp_states(state) -> Optional[str]: + pre = state.split("_")[0] + if "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also + # NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + + +def check_icmp_states(state) -> Optional[str]: + pre = state.split("_")[0] + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + + +def get_final_state_from_flags(self, state, pkts) -> str: + """ + Analyze the flags given and return a summary of the state. + Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + if state := check_suricata_states(state): + return state + if state := check_zeek_states(state): + return state + if state := check_argus_states(state): + return state + except IndexError: + # suf does not exist, which means that this is some ICMP or + # no response was sent for UDP or TCP + if state := check_icmp_states(state): + return state + if state := check_udp_states(state): + return state + if state := check_tcp_states(state, pkts): + return state + + return "Not Established" + + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() " f"line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) From 00415c7c2bdf9900eee91682602db8ff609ec19d Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:48:16 +0300 Subject: [PATCH 235/455] state_handler: refactor get_final_state_from_flags() --- slips_files/common/state_handler.py | 67 +++++++++++++---------------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index b671a09a28..d0a05115bd 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,9 +1,7 @@ from typing import Optional -import sys -import traceback -def check_suricata_states(state) -> Optional[str]: +def interpret_suricata_states(state) -> Optional[str]: """ There are different states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]: return "Not Established" -def check_zeek_states(state) -> Optional[str]: +def interpret_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. # For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]: return "Established" -def check_argus_states(state) -> Optional[str]: +def interpret_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - suf = state.split("_")[1] + try: + suf = state.split("_")[1] + except IndexError: + return + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]: return "Not Established" -def check_tcp_states(state, pkts) -> Optional[str]: +def interpret_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def check_udp_states(state) -> Optional[str]: +def interpret_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]: return "Not Established" -def check_icmp_states(state) -> Optional[str]: +def interpret_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(self, state, pkts) -> str: +def get_final_state_from_flags(state, pkts) -> str: """ - Analyze the flags given and return a summary of the state. - Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections + Converts the original flags from the flow, to a state that slips + understands + Works with Argus, suricata, and Bro flags + We receive the packets to distinguish some Reset connections """ - try: - if state := check_suricata_states(state): - return state - if state := check_zeek_states(state): - return state - if state := check_argus_states(state): - return state - except IndexError: - # suf does not exist, which means that this is some ICMP or - # no response was sent for UDP or TCP - if state := check_icmp_states(state): - return state - if state := check_udp_states(state): - return state - if state := check_tcp_states(state, pkts): - return state - return "Not Established" + for interpreter in ( + interpret_suricata_states, + interpret_zeek_states, + interpret_argus_states, + interpret_icmp_states, + interpret_udp_states, + ): + if interpreted_state := interpreter(state): + return interpreted_state + + if interpreted_state := interpret_tcp_states(state, pkts): + return interpreted_state - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() " f"line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) + return "Not Established" From f2de4e978cc9755565a87f168ee6d7c2cbd4abba Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 236/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 150 ++---------------- slips_files/core/database/database_manager.py | 3 - 2 files changed, 10 insertions(+), 143 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 16b67e9038..3379f5077f 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -291,12 +156,17 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others # So transform here - #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) - dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) - - #dataset.state = new_state_column + dataset["state"] = dataset.apply( + lambda row: get_final_state_from_flags( + row["state"], row["pkts"] + ), + axis=1, + ) + # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index 0b805976df..3a7f783ea7 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -613,9 +613,6 @@ def add_out_dns(self, *args, **kwargs): def add_port(self, *args, **kwargs): return self.rdb.add_port(*args, **kwargs) - def get_final_state_from_flags(self, *args, **kwargs): - return self.rdb.get_final_state_from_flags(*args, **kwargs) - def add_ips(self, *args, **kwargs): return self.rdb.add_ips(*args, **kwargs) From bfc1221692fc0d0e8d72ad157f2eeff254706cc5 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 237/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 3379f5077f..f052931c89 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From e9c16da10372297e2c4258b11dd94f02475c6f2d Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 238/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index f052931c89..3379f5077f 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From ff289cbf8018779acd8a4ab08a8448223e5a24b8 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 239/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 3379f5077f..f052931c89 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 31f5e9c653792a09dfe8ce215e1f57b0b2e71e59 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 240/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index f052931c89..3379f5077f 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 777c76da4098c59526bbce25139ed973129a8460 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 241/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 3379f5077f..f052931c89 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 8c7df7c47300cc7f1507a71b98d3252cb10dcb4e Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 242/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index f052931c89..3379f5077f 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 25d09337a3341a8831684f00875d9e32bba520c4 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 243/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 3379f5077f..f052931c89 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From e140a0c122398fc669668f26ae5d808d9ea662a8 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 244/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index f052931c89..3379f5077f 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 104379e99f054bc8b99813a428c62b05c7b6181a Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 245/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 3379f5077f..f052931c89 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 22244a7ec594088f70514e5efef966d20732d064 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 246/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index f052931c89..3379f5077f 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From f06b6a3ff035031735ec93a106d4ea0a4315d50e Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:23:29 +0100 Subject: [PATCH 247/455] Re add function that alya added --- slips_files/core/database/database_manager.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index 3a7f783ea7..0b805976df 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -613,6 +613,9 @@ def add_out_dns(self, *args, **kwargs): def add_port(self, *args, **kwargs): return self.rdb.add_port(*args, **kwargs) + def get_final_state_from_flags(self, *args, **kwargs): + return self.rdb.get_final_state_from_flags(*args, **kwargs) + def add_ips(self, *args, **kwargs): return self.rdb.add_ips(*args, **kwargs) From 9e0355a012f073928a7edcb388701a0e7e26748c Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:25:03 +0100 Subject: [PATCH 248/455] delete sys --- modules/flowmldetection/flowmldetection.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 3379f5077f..c06755a599 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -10,7 +10,6 @@ import json import traceback import warnings -import sys from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils From c98a3cd4ea7da549834fee1a3d5d34c33f068266 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:27:23 +0100 Subject: [PATCH 249/455] Delete file that was deleted from develop --- slips_files/common/state_handler.py | 170 ---------------------------- 1 file changed, 170 deletions(-) delete mode 100644 slips_files/common/state_handler.py diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py deleted file mode 100644 index d0a05115bd..0000000000 --- a/slips_files/common/state_handler.py +++ /dev/null @@ -1,170 +0,0 @@ -from typing import Optional - - -def interpret_suricata_states(state) -> Optional[str]: - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for - UDP. For TCP, - these are: New, Established and Closed,for UDP only new and - established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - -def interpret_zeek_states(state) -> Optional[str]: - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - -def interpret_argus_states(state) -> Optional[str]: - pre = state.split("_")[0] - try: - suf = state.split("_")[1] - except IndexError: - return - - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - - -def interpret_tcp_states(state, pkts) -> Optional[str]: - pre = state.split("_")[0] - if "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. - # Most connections are reseted when finished and therefore are - # established - # It can happen that is reseted being not established, but we - # can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is - # not established because the OS retries 3 times. - return "Not Established" if int(pkts) <= 3 else "Established" - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. - # Most connections are finished with FIN when finished and - # therefore are established - # It can happen that is finished being not established, but we - # can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is - # not established because the OS retries 3 times. - return "Not Established" if int(pkts) <= 3 else "Established" - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - - -def interpret_udp_states(state) -> Optional[str]: - pre = state.split("_")[0] - if "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also - # NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - - -def interpret_icmp_states(state) -> Optional[str]: - pre = state.split("_")[0] - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - - -def get_final_state_from_flags(state, pkts) -> str: - """ - Converts the original flags from the flow, to a state that slips - understands - Works with Argus, suricata, and Bro flags - We receive the packets to distinguish some Reset connections - """ - - for interpreter in ( - interpret_suricata_states, - interpret_zeek_states, - interpret_argus_states, - interpret_icmp_states, - interpret_udp_states, - ): - if interpreted_state := interpreter(state): - return interpreted_state - - if interpreted_state := interpret_tcp_states(state, pkts): - return interpreted_state - - return "Not Established" From 1a133431aba6f1a40e525206cc3ea14749136ffd Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:32:01 +0100 Subject: [PATCH 250/455] Flowmldetection. Fix missing db reference --- modules/flowmldetection/flowmldetection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index c06755a599..87e07c7592 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -160,7 +160,7 @@ def process_features(self, dataset): # 'Not Established', it is still 'S0' and others # So transform here dataset["state"] = dataset.apply( - lambda row: get_final_state_from_flags( + lambda row: self.db.get_final_state_from_flags( row["state"], row["pkts"] ), axis=1, From b7af797fc757d7e3cbfc2317edc7381e5ee1e203 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Tue, 18 Mar 2025 12:08:08 +0100 Subject: [PATCH 251/455] Fix the training of flows with ML in new version --- modules/flowmldetection/flowmldetection.py | 144 +++++++++++---------- 1 file changed, 77 insertions(+), 67 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 87e07c7592..e91495d649 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -55,8 +55,12 @@ def init(self): # Set the output queue of our database instance # Read the configuration self.read_configuration() - # Minum amount of new lables needed to trigger the train - self.minimum_lables_to_retrain = 50 + # Minum amount of new labels needed to start the train + self.minimum_labels_to_start_train = 50 + # Minum amount of new labels needed to retrain + self.minimum_labels_to_retrain = 50 + # The number of flows when last trained + self.last_number_of_flows_when_trained = 0 # To plot the scores of training # self.scores = [] # The scaler trained during training and to use during testing @@ -67,26 +71,25 @@ def init(self): def read_configuration(self): conf = ConfigParser() self.mode = conf.get_ml_mode() + self.label = conf.label() def train(self): """ Train a model based on the flows we receive and the labels """ try: - # Process the labels to have only Normal and Malware - self.flows.label = self.flows.label.str.replace( - r"(^.*ormal.*$)", "Normal", regex=True - ) - self.flows.label = self.flows.label.str.replace( - r"(^.*alware.*$)", "Malware", regex=True - ) - self.flows.label = self.flows.label.str.replace( - r"(^.*alicious.*$)", "Malware", regex=True - ) + # Get the flows from the DB + # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid) + # Convert to pandas df + # self.flows = pd.DataFrame(self.flows) + # Process the features + # X_flow = self.process_features(self.flows) - # Separate - y_flow = self.flows["label"] + # Create X_flow with the current flows minus the label X_flow = self.flows.drop("label", axis=1) + # Create y_flow with the label + y_flow = numpy.full(X_flow.shape[0], self.label) + # Drop the module_labels X_flow = X_flow.drop("module_labels", axis=1) # Normalize this batch of data so far. This can get progressivle slow @@ -95,7 +98,7 @@ def train(self): # Train try: self.clf.partial_fit( - X_flow, y_flow, classes=["Malware", "Normal"] + X_flow, y_flow, classes=["Malicious", "Benign"] ) except Exception: self.print("Error while calling clf.train()") @@ -118,7 +121,7 @@ def train(self): self.store_model() except Exception: - self.print("Error in train()", 0, 1) + self.print("Error in train().", 0, 1) self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): @@ -144,9 +147,7 @@ def process_features(self, dataset): "history", "uid", "dir_", - "dbytes", "endtime", - "bytes", "flow_source", ] for field in to_drop: @@ -161,11 +162,10 @@ def process_features(self, dataset): # So transform here dataset["state"] = dataset.apply( lambda row: self.db.get_final_state_from_flags( - row["state"], row["pkts"] + row["state"], (row["spkts"] + row["dpkts"]) ), axis=1, ) - # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -199,7 +199,11 @@ def process_features(self, dataset): dataset.proto = dataset.proto.str.replace( r"(^.*arp.*$)", "4", regex=True ) - fields_to_convert_to_flow = [ + + dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"] + dataset["pkts"] = dataset["spkts"] + dataset["dpkts"] + + fields_to_convert_to_float = [ dataset.proto, dataset.dport, dataset.sport, @@ -210,10 +214,10 @@ def process_features(self, dataset): dataset.sbytes, dataset.state, ] - for field in fields_to_convert_to_flow: + for field in fields_to_convert_to_float: try: field = field.astype("float64") - except ValueError: + except (ValueError, AttributeError): pass return dataset @@ -222,9 +226,9 @@ def process_features(self, dataset): self.print("Error in process_features()") self.print(traceback.format_exc(), 0, 1) - def process_flows(self): + def process_training_flows(self): """ - Process all the flwos in the DB + Process all the flows in the DB Store the pandas df in self.flows """ try: @@ -240,44 +244,48 @@ def process_flows(self): # that are fake but representative of a normal and malware flow # they are only for the training process # At least 1 flow of each label is required - # self.print(f'Amount of labeled flows: {labels}', 0, 1) + + # These flows should be in the same format as the ones in the DB. + # Which means the satate is still SF, S0, etc. flows.append( { - "ts": 1594417039.029793, + "starttime": 1594417039.029793, "dur": "1.9424750804901123", "saddr": "10.7.10.101", "sport": "49733", "daddr": "40.70.224.145", "dport": "443", "proto": "tcp", - "state": "Established", - "allbytes": 42764, - "spkts": 37, + "state": "SF", + "spkts": 17, + "dpkts": 27, "sbytes": 25517, + "dbytes": 17247, "appproto": "ssl", - "label": "Malware", + "label": "Malicious", "module_labels": { - "flowalerts-long-connection": "Malware" + "flowalerts-long-connection": "Malicious" }, } ) flows.append( { - "ts": 1382355032.706468, + "starttime": 1382355032.706468, "dur": "10.896695", "saddr": "147.32.83.52", "sport": "47956", "daddr": "80.242.138.72", "dport": "80", "proto": "tcp", - "state": "Established", - "allbytes": 67696, + "state": "SF", "spkts": 1, + "dpkts": 0, "sbytes": 100, + "dbytes": 67596, "appproto": "http", - "label": "Normal", + "label": "Benign", "module_labels": { - "flowalerts-long-connection": "Normal" + "flowalerts-long-connection": "Benign" }, } ) @@ -318,7 +326,6 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: and returns the predection array """ try: - given_x_flow = x_flow # clean the flow fields_to_drop = [ "label", @@ -326,10 +333,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "uid", "history", "dir_", - "dbytes", - "dpkts", "endtime", - "bytes", "flow_source", "ground_truth_label", # todo now we can use them "detailed_ground_truth_label", @@ -345,7 +349,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: return pred except Exception as e: self.print( - f"Error in detect() while processing " f"\n{given_x_flow}\n{e}" + f"Error in detect() while processing " f"\n{x_flow}\n{e}" ) self.print(traceback.format_exc(), 0, 1) @@ -437,18 +441,16 @@ def pre_main(self): def main(self): if msg := self.get_msg("new_flow"): + # When a new flow arrives msg = json.loads(msg["data"]) - twid = msg["twid"] + self.twid = msg["twid"] + self.profileid = msg["profileid"] self.flow = msg["flow"] - # these fields are expected in testing. update the original + # These following extra fields are expected in testing. update the original # flow dict to have them self.flow.update( { - "allbytes": (self.flow["sbytes"] + self.flow["dbytes"]), - # the flow["state"] is the origstate, we dont need that here - # we need the interpreted state "state": msg["interpreted_state"], - "pkts": self.flow["spkts"] + self.flow["dpkts"], "label": msg["label"], "module_labels": msg["module_labels"], } @@ -461,23 +463,31 @@ def main(self): # Use labeled flows labels = self.db.get_labels() sum_labeled_flows = sum(i[1] for i in labels) + + # The min labels to retrain is the min number of flows + # we should have seen so far in this capture to start training + # This is so we dont _start_ training with only 1 flow + + # Once we are over the start minimum, the second condition is + # to force to retrain every a minimum_labels_to_retrain number + # of flows. So we dont retrain every 1 flow. if ( - sum_labeled_flows >= self.minimum_lables_to_retrain - and sum_labeled_flows % self.minimum_lables_to_retrain == 1 + sum_labeled_flows >= self.minimum_labels_to_start_train ): - # We get here every 'self.minimum_lables_to_retrain' - # amount of labels - # So for example we retrain every 100 labels and only when - # we have at least 100 labels - self.print( - f"Training the model with the last group of " - f"flows and labels. Total flows: {sum_labeled_flows}." - ) - # Process all flows in the DB and make them ready - # for pandas - self.process_flows() - # Train an algorithm - self.train() + if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain): + # So for example we retrain every 50 labels and only when + # we have at least 50 labels + self.print( + f"Training the model with the last group of " + f"flows and labels. Total flows: {sum_labeled_flows}." + ) + # Process all flows in the DB and make them ready + # for pandas + self.process_training_flows() + # Train an algorithm + self.train() + self.last_number_of_flows_when_trained = sum_labeled_flows + elif self.mode == "test": # We are testing, which means using the model to detect processed_flow = self.process_flow(self.flow) @@ -497,8 +507,8 @@ def main(self): # and the label is diff from the prediction, # print in debug mode self.print( - f"Report Prediction {pred[0]} for label" - f' {label} flow {self.flow["saddr"]}:' + f"Predicted {pred[0]} for ground-truth label" + f' {label}. Flow {self.flow["saddr"]}:' f'{self.flow["sport"]} ->' f' {self.flow["daddr"]}:' f'{self.flow["dport"]}/' @@ -506,9 +516,9 @@ def main(self): 0, 3, ) - if pred[0] == "Malware": + if pred[0] == "Malicious": # Generate an alert - self.set_evidence_malicious_flow(self.flow, twid) + self.set_evidence_malicious_flow(self.flow, self.twid) self.print( f"Prediction {pred[0]} for label {label}" f' flow {self.flow["saddr"]}:' From 3faff9b5bd3aeb53c306324572e39e743f43272d Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Tue, 18 Mar 2025 12:08:29 +0100 Subject: [PATCH 252/455] Fix the profiler handler for cases of nan in state --- .../core/database/redis_db/profile_handler.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/slips_files/core/database/redis_db/profile_handler.py b/slips_files/core/database/redis_db/profile_handler.py index edbbf3a12f..4d91b43a98 100644 --- a/slips_files/core/database/redis_db/profile_handler.py +++ b/slips_files/core/database/redis_db/profile_handler.py @@ -395,7 +395,12 @@ def get_final_state_from_flags(self, state, pkts): We receive the pakets to distinguish some Reset connections """ try: - pre = state.split("_")[0] + # In some flows the state is a nan + try: + pre = state.split("_")[0] + except AttributeError: + pre = '' + try: # Try suricata states """ @@ -417,7 +422,11 @@ def get_final_state_from_flags(self, state, pkts): return "Established" # For Argus - suf = state.split("_")[1] + # In some flows the state is a nan + try: + suf = state.split("_")[1] + except AttributeError: + suf = '' if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -518,7 +527,7 @@ def get_final_state_from_flags(self, state, pkts): except Exception: exception_line = sys.exc_info()[2].tb_lineno self.print( - f"Error in getFinalStateFromFlags() in database.py line {exception_line}", + f"Error in get_final_state_from_flags() in profile_handler.py line {exception_line}", 0, 1, ) From 2e0603b2c8e0adb327bf5249a30d2894a7d02adb Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:20:42 +0100 Subject: [PATCH 253/455] slips.yaml. Update to have correct labels. By default test. Defaul training lbel is benign --- config/slips.yaml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/config/slips.yaml b/config/slips.yaml index 02adc7f1b4..1b73e7b549 100644 --- a/config/slips.yaml +++ b/config/slips.yaml @@ -105,13 +105,12 @@ parameters: deletePrevdb: true # Set the label for all the flows that are being read. - # For now only normal and malware directly. No option for setting labels - # with a filter + # For now only Benign and Malicious (Capitalized) # The purpose is to be used in the training of ML models and to output # flows with labels for other tools. - # label: malicious - # label: unknown - label: normal + # label: Malicious + # label: Benign + label: Benign # If Zeek files are rotated or not to avoid running out of disk. # Zeek rotation is enabled by default when using an interface, # which means Slips will delete all Zeek log files after 1 day From 6f2e3c3be24352300ad435be5734a92cb917ab52 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:21:21 +0100 Subject: [PATCH 254/455] First ipython to tst ML flow related models --- modules/flowmldetection/flowmlanalysis.ipynb | 76 ++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 modules/flowmldetection/flowmlanalysis.ipynb diff --git a/modules/flowmldetection/flowmlanalysis.ipynb b/modules/flowmldetection/flowmlanalysis.ipynb new file mode 100644 index 0000000000..d726cd2805 --- /dev/null +++ b/modules/flowmldetection/flowmlanalysis.ipynb @@ -0,0 +1,76 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Analysis of Flows with Machine Learning for Slips" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Analysis of a fixed list of flows to try techniques and find parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy\n", + "from sklearn.linear_model import SGDClassifier\n", + "from sklearn.preprocessing import StandardScaler\n", + "import pickle\n", + "import pandas as pd\n", + "import json\n", + "import traceback\n", + "import warnings" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "slips-new", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 9a91a801f64855f3d9dbb64a013160e7ebc97d2d Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:22:38 +0100 Subject: [PATCH 255/455] flowml. If the dataset has one flow and that is deleted, then return empty fast. --- modules/flowmldetection/flowmldetection.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index e91495d649..58b4ce1e4c 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -135,6 +135,11 @@ def process_features(self, dataset): for proto in to_discard: dataset = dataset[dataset.proto != proto] + # If te proto is in the list to delete and there is only one flow, then the dataset will be empty + if dataset.empty: + # DataFrame is empty now, so return empty + return dataset + # For now, discard these to_drop = [ "appproto", From b7c55c1fb89e829950ff3f1e4075135f92eb0f8d Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:23:05 +0100 Subject: [PATCH 256/455] flowml. If the datasert is empty. Return none --- modules/flowmldetection/flowmldetection.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 58b4ce1e4c..4a4d46e376 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -318,6 +318,8 @@ def process_flow(self, flow_to_process: dict): # Convert the flow to a pandas dataframe raw_flow = pd.DataFrame(flow_to_process, index=[0]) dflow = self.process_features(raw_flow) + if dflow.empty: + return None # Update the flow to the processed version return dflow except Exception: From 1336ced589060f2382bfdcc41b883aab7cff2530 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:26:42 +0100 Subject: [PATCH 257/455] profile_handler. Small bug in how we handled the profiles, we were using 'in' instead of == for established. Some not established MAY not have been correctly captured --- slips_files/core/database/redis_db/profile_handler.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/slips_files/core/database/redis_db/profile_handler.py b/slips_files/core/database/redis_db/profile_handler.py index 4d91b43a98..a6669c92a9 100644 --- a/slips_files/core/database/redis_db/profile_handler.py +++ b/slips_files/core/database/redis_db/profile_handler.py @@ -409,9 +409,10 @@ def get_final_state_from_flags(self, state, pkts): these are: New, Established and Closed,for UDP only new and established. For each of these states Suricata can employ different timeouts. """ - if "new" in state or "established" in state: + # This is controversial, but if we dont have a good state, we consider it not established for now + if "new" in state or state.lower() == "established": return "Established" - elif "closed" in state: + elif "closed" in state or state.lower() == 'not established': return "Not Established" # We have varius type of states depending on the type of flow. @@ -422,7 +423,6 @@ def get_final_state_from_flags(self, state, pkts): return "Established" # For Argus - # In some flows the state is a nan try: suf = state.split("_")[1] except AttributeError: From 9dc77cd61c1b6431af32903d5003111405945ff3 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:27:16 +0100 Subject: [PATCH 258/455] First new version of the model and scaler. Not good yet, but working. --- modules/flowmldetection/model.bin | Bin 1073 -> 1090 bytes modules/flowmldetection/scaler.bin | Bin 666 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index aef4cba35b7b18287b2be11df2c45e9187d053e0..7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f 100644 GIT binary patch delta 411 zcmdnUafpMZfn}=sMwVTSLVS7gX{m`NrA4X5@tJw?Q@pt+?`4dsSMFg=DXA<-oice! z4^yb+l+KR!DM3>-yqS9#ZKnA7`FX2&vv_lqOiAi=_HbfgU`PexJRr^l;&dn*NWpQXRJ|Emr(~!@tY-9P0&;YE zn9?i|&dD&Ce2~eg9z!XkH+S2V4C_wkh?WyPJF_Dc9WKfLlU4d@b|Cws{>t}{)DLw1 z+&}AzmCAv&TP(zH?>}O{hvWaISvjr_FKvFL2{5=iNW7SLz3Rc`{Vb{$-4P-p4qe>8 zsxB_naJaLsc#$rfl>^wdT$y<#sYS_psRbqRQ!+4J0dz3X4IB|u^h2FOBOrSC<0o%q b_GNYGp25H{nVChIi`7uiOwVviX_6iQocV_5 delta 380 zcmX@av5|wNfn}=NMwVTSlP@wxCo1)@rj%3`q)wSUrH3ihd`f3W`;?$58s5x3j5bsJ z{QSISyji?CN~R=rI$QWMFfb$naSjldL)k?@Iu(fhfH)tBLHt}GhN)wiJeNsPO)*2o zo1t||h8o0hMsFq{N4tk9%>rRzhW_LyOh)w>N*TR5+ps7Gy3Ol95P%792Cxe~W+qIF za^V+n06G8#1cs%Z;j-Fjug&$3=gY2%`}>OzXw^50I4Ib2ygQ=8?eISD+{&-(1rNNN zW*@ diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin index 9292bda6a6f307edcbe83de0ccfac2437950fa6f..bfba4d107224e5e6e5a1e8c8f4d463b48131d111 100644 GIT binary patch literal 890 zcmZo*nVQec00uqM#o0NjiA8yO1x2X^Mfu68#l@L<>3Z=gi6x0sdia7%67y0Li&BD< z6LV6Frc9oq**K+kibfA-d1gsQd~RxD-jt3W4lt*|B)5DrlQdy7+(!%6#F{QHuBFo&v zSTn`X&(G^W5P%79`;sY1ov}cNGpCdm0WAP}wIm-%7Z>Cgl>q4!C|z8TT~Z9@gK4(n zq{@<1pa5G6l)+kDl30>DrH3OiCkHCT0@jlXw4sMDFCOY|h?l*&d-(I>ixYDTa)5zQ zoSF&@h78Ugwqjs#CW3-9L$rq}6UF5=Xf9^}n(NGw#R!S7DQ!Ui_ppKj9%vGG1|KL~ zG6axA1!!U#l8G7ms38F|uhTi=d{gv8|4E7tk+E%iFU@~!UvXb5a&PSo2flmT-L8E; z8bB?zwe6@e~Ltx1rmTHK2M45x=S>Y_k5fcAZRq4P``?^7;27=%1`Z_~Q$U{zGVUTcbp z!_f@A=ox0(4vo(kWe&dF=y3Qof57>#YaOosUa>d6=$d`i{)GMC=j?EBVaVAqgWVBg dy+AxPN5z+=78Pga=K-V3P|r-ya7t;C9st`jc!2-_ delta 525 zcmeyxHj9S2q>4Eng}#5gR6%pz9=<0Kd-o?s5H3*s5pan;tH91&ekaz{5?!*7E?Mq z+JPE0yqPnE{rvpA{sRG+@Mb8PlGN$!ajPlrTH5mhSAF{H=RwvmLf> z0f8s>T))|-e^~LtKJ)mxuX}VF9GJI$RyS%Ec8u45-?98hw!_K^*XJ>X{BW28G_HrG zEU^gaDyjMm8E*!l6XYOvGkSyEqXM*T;RVCs$??LDiHthAxexX@M(tU$UF%YfQ^>gH&y>a4SnPazubB5!cqc1YnCGT))m9-o7^A!Lv41W!_hYO<*VziI$XJV#W|*OtHau9f5TQzy=Sjs!`l%1;+FkJ z-k|Geeq3|-Iw{uqQ%0WS6}i^>);WhA{C@WaM)rg_g6$KCFV4d&dkpP QMw@}2k%^wsl+q+U0P2|NZU6uP From 12e3d93823589e3314325b158b8becc66e8d5d21 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Thu, 20 Mar 2025 13:16:06 +0100 Subject: [PATCH 259/455] model and scaler with 1 malicious and 1 benign --- modules/flowmldetection/model.bin | Bin 1090 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f..0fac693b39f8e2f0e826471e72a52010709a2a4a 100644 GIT binary patch delta 132 zcmX@a@q~k=fn{psMix!x$(NZ_BO`RCb>5jSYK$N!g2|oC+8oRs(l>gtTsw(FScJ$7xocP4|5 zn$$WZz^$}67u&zYKfDP4vYKo~KfmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r!tJEWTtnm`Ac zf^|*Gc0elJHxDoXoPgl=^Tj5WVAwXGOhP~3C<3D)c8d<%4*gy{6w?@|U z%s@r+gj&Fu5J2_kY{!Qn^*`?T5Tx7$dq73=gj&E@c0d))6KVlt!ao`IN`KC-h(JOu zU`*oR6_Z;6CRfo>BRz3ectB?@J_D&+&OhD+nN>A{w?PryCGc Date: Thu, 20 Mar 2025 13:16:27 +0100 Subject: [PATCH 260/455] cleaner jupyter --- modules/flowmldetection/flowmldetection.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 4a4d46e376..d8e9ada27c 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -345,6 +345,23 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "ground_truth_label", # todo now we can use them "detailed_ground_truth_label", ] + # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. + # Error + ''' [Flow ML Detection] Error in detect() while processing + dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes + 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 + The feature names should match those that were passed during fit. + Feature names unseen at fit time: + - bytes + ''' + + # IF we delete here the filed bytes the error is + # [Flow ML Detection] Error in detect() while processing + # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes + # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 + # The feature names should match those that were passed during fit. + # Feature names must be in the same order as they were in fit. + for field in fields_to_drop: try: x_flow = x_flow.drop(field, axis=1) From 83a9128ed9c44fbf9d55c05523a627a97bd60766 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Thu, 20 Mar 2025 22:26:27 +0100 Subject: [PATCH 261/455] New models after 3rd ttrain --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 0fac693b39f8e2f0e826471e72a52010709a2a4a..5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8 100644 GIT binary patch delta 99 zcmaFD@q}YTFtfkevYaFSxkd-_3;rCozh!)2lYmpEbEdGvA?N`Qv**dmWkY#6EnbNU>V$pfHc5?(6z~90v14#>uEKZXcoY2J@~Q; zm@vT916v-#h9y9THMMtpw*Eia3Y9($*ABq#7l@Rev)(_(4lO-NtuB+I1CayN#ekDG F1TN2UERg^J diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin index 758909b289238ff282b2e056a9b3e83768b8472a..821344a0c69d116622b02e2a0daa1554cb5d308e 100644 GIT binary patch delta 43 zcmV+`0M!5b2KolDfdU!c4zU>a=p#W80?-ZtN@qVmt(3Za=p#W80?-ZtN@qVmt(3Z Date: Wed, 26 Mar 2025 00:08:50 +0100 Subject: [PATCH 262/455] Models after 4th train --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8..3ab5a240bb45f88d026d1d9d1959cfa384e2473b 100644 GIT binary patch delta 120 zcmV-;0EhqN2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv* zMA;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P as{<PghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9 ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}DxjML6cVlBO2|D9RL6T delta 290 zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRg4Lu^9H~BS8=X&<+7gXFor!l)9lJXF)RTPqWf%b3r!tJEWTtnm`Ac zf^|*Gc0elJHxDoXoty4FCWD From 71b93a508e1d8d625fb51ae4a698360044f2af34 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 26 Mar 2025 08:28:59 +0100 Subject: [PATCH 263/455] Models of ml flow with the first good performance in small tests --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 3ab5a240bb45f88d026d1d9d1959cfa384e2473b..a6648cf72179520975b0e9ad1164f7d574e87140 100644 GIT binary patch delta 121 zcmV-<0EYkM2;>N`Qv+C~&P*9hb`!w*mV~|wLl?l5mFM4w$NRqlOAcXmpyfaAG1(); zYA3+ulRv$a;~zkIU#E>ocI-ba#L|>%Hv~ZN`Y?b6^limYW1McwvlQsk{8#y@u delta 121 zcmV-<0EYkM2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv* zMA;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P bs{<fmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r-wJEWTtnm`Ac zf^|*Gc0elJHxDoXoPghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9 ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}DxjML6cVlBO2|D9RL6T From 2c70aa760e24cc16268efd553a3f94747b12a15e Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 264/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 317 +++++++++++++-------- 1 file changed, 206 insertions(+), 111 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index d8e9ada27c..8917fef6a5 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -10,6 +10,7 @@ import json import traceback import warnings +import sys from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils @@ -55,12 +56,8 @@ def init(self): # Set the output queue of our database instance # Read the configuration self.read_configuration() - # Minum amount of new labels needed to start the train - self.minimum_labels_to_start_train = 50 - # Minum amount of new labels needed to retrain - self.minimum_labels_to_retrain = 50 - # The number of flows when last trained - self.last_number_of_flows_when_trained = 0 + # Minum amount of new lables needed to trigger the train + self.minimum_lables_to_retrain = 50 # To plot the scores of training # self.scores = [] # The scaler trained during training and to use during testing @@ -71,25 +68,26 @@ def init(self): def read_configuration(self): conf = ConfigParser() self.mode = conf.get_ml_mode() - self.label = conf.label() def train(self): """ Train a model based on the flows we receive and the labels """ try: - # Get the flows from the DB - # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid) - # Convert to pandas df - # self.flows = pd.DataFrame(self.flows) - # Process the features - # X_flow = self.process_features(self.flows) + # Process the labels to have only Normal and Malware + self.flows.label = self.flows.label.str.replace( + r"(^.*ormal.*$)", "Normal", regex=True + ) + self.flows.label = self.flows.label.str.replace( + r"(^.*alware.*$)", "Malware", regex=True + ) + self.flows.label = self.flows.label.str.replace( + r"(^.*alicious.*$)", "Malware", regex=True + ) - # Create X_flow with the current flows minus the label + # Separate + y_flow = self.flows["label"] X_flow = self.flows.drop("label", axis=1) - # Create y_flow with the label - y_flow = numpy.full(X_flow.shape[0], self.label) - # Drop the module_labels X_flow = X_flow.drop("module_labels", axis=1) # Normalize this batch of data so far. This can get progressivle slow @@ -98,7 +96,7 @@ def train(self): # Train try: self.clf.partial_fit( - X_flow, y_flow, classes=["Malicious", "Benign"] + X_flow, y_flow, classes=["Malware", "Normal"] ) except Exception: self.print("Error while calling clf.train()") @@ -121,7 +119,142 @@ def train(self): self.store_model() except Exception: - self.print("Error in train().", 0, 1) + self.print("Error in train()", 0, 1) + self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): @@ -135,11 +268,6 @@ def process_features(self, dataset): for proto in to_discard: dataset = dataset[dataset.proto != proto] - # If te proto is in the list to delete and there is only one flow, then the dataset will be empty - if dataset.empty: - # DataFrame is empty now, so return empty - return dataset - # For now, discard these to_drop = [ "appproto", @@ -152,7 +280,9 @@ def process_features(self, dataset): "history", "uid", "dir_", + "dbytes", "endtime", + "bytes", "flow_source", ] for field in to_drop: @@ -161,16 +291,12 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, - # the state is not transformed to 'Established' or - # 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others # So transform here - dataset["state"] = dataset.apply( - lambda row: self.db.get_final_state_from_flags( - row["state"], (row["spkts"] + row["dpkts"]) - ), - axis=1, - ) + #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) + dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) + + #dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -204,11 +330,7 @@ def process_features(self, dataset): dataset.proto = dataset.proto.str.replace( r"(^.*arp.*$)", "4", regex=True ) - - dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"] - dataset["pkts"] = dataset["spkts"] + dataset["dpkts"] - - fields_to_convert_to_float = [ + fields_to_convert_to_flow = [ dataset.proto, dataset.dport, dataset.sport, @@ -219,10 +341,10 @@ def process_features(self, dataset): dataset.sbytes, dataset.state, ] - for field in fields_to_convert_to_float: + for field in fields_to_convert_to_flow: try: field = field.astype("float64") - except (ValueError, AttributeError): + except ValueError: pass return dataset @@ -231,9 +353,9 @@ def process_features(self, dataset): self.print("Error in process_features()") self.print(traceback.format_exc(), 0, 1) - def process_training_flows(self): + def process_flows(self): """ - Process all the flows in the DB + Process all the flwos in the DB Store the pandas df in self.flows """ try: @@ -249,48 +371,44 @@ def process_training_flows(self): # that are fake but representative of a normal and malware flow # they are only for the training process # At least 1 flow of each label is required - - # These flows should be in the same format as the ones in the DB. - # Which means the satate is still SF, S0, etc. + # self.print(f'Amount of labeled flows: {labels}', 0, 1) flows.append( { - "starttime": 1594417039.029793, + "ts": 1594417039.029793, "dur": "1.9424750804901123", "saddr": "10.7.10.101", "sport": "49733", "daddr": "40.70.224.145", "dport": "443", "proto": "tcp", - "state": "SF", - "spkts": 17, - "dpkts": 27, + "state": "Established", + "allbytes": 42764, + "spkts": 37, "sbytes": 25517, - "dbytes": 17247, "appproto": "ssl", - "label": "Malicious", + "label": "Malware", "module_labels": { - "flowalerts-long-connection": "Malicious" + "flowalerts-long-connection": "Malware" }, } ) flows.append( { - "starttime": 1382355032.706468, + "ts": 1382355032.706468, "dur": "10.896695", "saddr": "147.32.83.52", "sport": "47956", "daddr": "80.242.138.72", "dport": "80", "proto": "tcp", - "state": "SF", + "state": "Established", + "allbytes": 67696, "spkts": 1, - "dpkts": 0, "sbytes": 100, - "dbytes": 67596, "appproto": "http", - "label": "Benign", + "label": "Normal", "module_labels": { - "flowalerts-long-connection": "Benign" + "flowalerts-long-connection": "Normal" }, } ) @@ -318,8 +436,6 @@ def process_flow(self, flow_to_process: dict): # Convert the flow to a pandas dataframe raw_flow = pd.DataFrame(flow_to_process, index=[0]) dflow = self.process_features(raw_flow) - if dflow.empty: - return None # Update the flow to the processed version return dflow except Exception: @@ -333,6 +449,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: and returns the predection array """ try: + given_x_flow = x_flow # clean the flow fields_to_drop = [ "label", @@ -340,28 +457,12 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "uid", "history", "dir_", + "dbytes", + "dpkts", "endtime", + "bytes", "flow_source", - "ground_truth_label", # todo now we can use them - "detailed_ground_truth_label", ] - # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. - # Error - ''' [Flow ML Detection] Error in detect() while processing - dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes - 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 - The feature names should match those that were passed during fit. - Feature names unseen at fit time: - - bytes - ''' - - # IF we delete here the filed bytes the error is - # [Flow ML Detection] Error in detect() while processing - # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes - # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 - # The feature names should match those that were passed during fit. - # Feature names must be in the same order as they were in fit. - for field in fields_to_drop: try: x_flow = x_flow.drop(field, axis=1) @@ -373,7 +474,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: return pred except Exception as e: self.print( - f"Error in detect() while processing " f"\n{x_flow}\n{e}" + f"Error in detect() while processing " f"\n{given_x_flow}\n{e}" ) self.print(traceback.format_exc(), 0, 1) @@ -465,16 +566,18 @@ def pre_main(self): def main(self): if msg := self.get_msg("new_flow"): - # When a new flow arrives msg = json.loads(msg["data"]) - self.twid = msg["twid"] - self.profileid = msg["profileid"] + twid = msg["twid"] self.flow = msg["flow"] - # These following extra fields are expected in testing. update the original + # these fields are expected in testing. update the original # flow dict to have them self.flow.update( { + "allbytes": (self.flow["sbytes"] + self.flow["dbytes"]), + # the flow["state"] is the origstate, we dont need that here + # we need the interpreted state "state": msg["interpreted_state"], + "pkts": self.flow["spkts"] + self.flow["dpkts"], "label": msg["label"], "module_labels": msg["module_labels"], } @@ -487,31 +590,23 @@ def main(self): # Use labeled flows labels = self.db.get_labels() sum_labeled_flows = sum(i[1] for i in labels) - - # The min labels to retrain is the min number of flows - # we should have seen so far in this capture to start training - # This is so we dont _start_ training with only 1 flow - - # Once we are over the start minimum, the second condition is - # to force to retrain every a minimum_labels_to_retrain number - # of flows. So we dont retrain every 1 flow. if ( - sum_labeled_flows >= self.minimum_labels_to_start_train + sum_labeled_flows >= self.minimum_lables_to_retrain + and sum_labeled_flows % self.minimum_lables_to_retrain == 1 ): - if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain): - # So for example we retrain every 50 labels and only when - # we have at least 50 labels - self.print( - f"Training the model with the last group of " - f"flows and labels. Total flows: {sum_labeled_flows}." - ) - # Process all flows in the DB and make them ready - # for pandas - self.process_training_flows() - # Train an algorithm - self.train() - self.last_number_of_flows_when_trained = sum_labeled_flows - + # We get here every 'self.minimum_lables_to_retrain' + # amount of labels + # So for example we retrain every 100 labels and only when + # we have at least 100 labels + self.print( + f"Training the model with the last group of " + f"flows and labels. Total flows: {sum_labeled_flows}." + ) + # Process all flows in the DB and make them ready + # for pandas + self.process_flows() + # Train an algorithm + self.train() elif self.mode == "test": # We are testing, which means using the model to detect processed_flow = self.process_flow(self.flow) @@ -531,8 +626,8 @@ def main(self): # and the label is diff from the prediction, # print in debug mode self.print( - f"Predicted {pred[0]} for ground-truth label" - f' {label}. Flow {self.flow["saddr"]}:' + f"Report Prediction {pred[0]} for label" + f' {label} flow {self.flow["saddr"]}:' f'{self.flow["sport"]} ->' f' {self.flow["daddr"]}:' f'{self.flow["dport"]}/' @@ -540,9 +635,9 @@ def main(self): 0, 3, ) - if pred[0] == "Malicious": + if pred[0] == "Malware": # Generate an alert - self.set_evidence_malicious_flow(self.flow, self.twid) + self.set_evidence_malicious_flow(self.flow, twid) self.print( f"Prediction {pred[0]} for label {label}" f' flow {self.flow["saddr"]}:' From e04e6c61fe8584afe0247f8b21fe2b865cdafe71 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:35:21 +0300 Subject: [PATCH 265/455] state_handler: split get_final_state_from_flags() into smaller functions --- slips_files/common/state_handler.py | 179 ++++++++++++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 slips_files/common/state_handler.py diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py new file mode 100644 index 0000000000..b671a09a28 --- /dev/null +++ b/slips_files/common/state_handler.py @@ -0,0 +1,179 @@ +from typing import Optional +import sys +import traceback + + +def check_suricata_states(state) -> Optional[str]: + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for + UDP. For TCP, + these are: New, Established and Closed,for UDP only new and + established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + +def check_zeek_states(state) -> Optional[str]: + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + +def check_argus_states(state) -> Optional[str]: + pre = state.split("_")[0] + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + + +def check_tcp_states(state, pkts) -> Optional[str]: + pre = state.split("_")[0] + if "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. + # Most connections are reseted when finished and therefore are + # established + # It can happen that is reseted being not established, but we + # can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is + # not established because the OS retries 3 times. + return "Not Established" if int(pkts) <= 3 else "Established" + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. + # Most connections are finished with FIN when finished and + # therefore are established + # It can happen that is finished being not established, but we + # can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is + # not established because the OS retries 3 times. + return "Not Established" if int(pkts) <= 3 else "Established" + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + + +def check_udp_states(state) -> Optional[str]: + pre = state.split("_")[0] + if "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also + # NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + + +def check_icmp_states(state) -> Optional[str]: + pre = state.split("_")[0] + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + + +def get_final_state_from_flags(self, state, pkts) -> str: + """ + Analyze the flags given and return a summary of the state. + Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + if state := check_suricata_states(state): + return state + if state := check_zeek_states(state): + return state + if state := check_argus_states(state): + return state + except IndexError: + # suf does not exist, which means that this is some ICMP or + # no response was sent for UDP or TCP + if state := check_icmp_states(state): + return state + if state := check_udp_states(state): + return state + if state := check_tcp_states(state, pkts): + return state + + return "Not Established" + + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() " f"line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) From 8a30e90ccdcecc165d280d5f47bde3d370fabe00 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:48:16 +0300 Subject: [PATCH 266/455] state_handler: refactor get_final_state_from_flags() --- slips_files/common/state_handler.py | 67 +++++++++++++---------------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index b671a09a28..d0a05115bd 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,9 +1,7 @@ from typing import Optional -import sys -import traceback -def check_suricata_states(state) -> Optional[str]: +def interpret_suricata_states(state) -> Optional[str]: """ There are different states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]: return "Not Established" -def check_zeek_states(state) -> Optional[str]: +def interpret_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. # For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]: return "Established" -def check_argus_states(state) -> Optional[str]: +def interpret_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - suf = state.split("_")[1] + try: + suf = state.split("_")[1] + except IndexError: + return + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]: return "Not Established" -def check_tcp_states(state, pkts) -> Optional[str]: +def interpret_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def check_udp_states(state) -> Optional[str]: +def interpret_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]: return "Not Established" -def check_icmp_states(state) -> Optional[str]: +def interpret_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(self, state, pkts) -> str: +def get_final_state_from_flags(state, pkts) -> str: """ - Analyze the flags given and return a summary of the state. - Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections + Converts the original flags from the flow, to a state that slips + understands + Works with Argus, suricata, and Bro flags + We receive the packets to distinguish some Reset connections """ - try: - if state := check_suricata_states(state): - return state - if state := check_zeek_states(state): - return state - if state := check_argus_states(state): - return state - except IndexError: - # suf does not exist, which means that this is some ICMP or - # no response was sent for UDP or TCP - if state := check_icmp_states(state): - return state - if state := check_udp_states(state): - return state - if state := check_tcp_states(state, pkts): - return state - return "Not Established" + for interpreter in ( + interpret_suricata_states, + interpret_zeek_states, + interpret_argus_states, + interpret_icmp_states, + interpret_udp_states, + ): + if interpreted_state := interpreter(state): + return interpreted_state + + if interpreted_state := interpret_tcp_states(state, pkts): + return interpreted_state - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() " f"line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) + return "Not Established" From 3c7af271be30bc4b2a1f8fdf466941f9bfa5b5a9 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 267/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 150 ++---------------- slips_files/core/database/database_manager.py | 3 - 2 files changed, 10 insertions(+), 143 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 8917fef6a5..fb17b57f23 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -291,12 +156,17 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others # So transform here - #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) - dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) - - #dataset.state = new_state_column + dataset["state"] = dataset.apply( + lambda row: get_final_state_from_flags( + row["state"], row["pkts"] + ), + axis=1, + ) + # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index 0b805976df..3a7f783ea7 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -613,9 +613,6 @@ def add_out_dns(self, *args, **kwargs): def add_port(self, *args, **kwargs): return self.rdb.add_port(*args, **kwargs) - def get_final_state_from_flags(self, *args, **kwargs): - return self.rdb.get_final_state_from_flags(*args, **kwargs) - def add_ips(self, *args, **kwargs): return self.rdb.add_ips(*args, **kwargs) From 561049fd9988c8435cff5ac5027e3602c2409088 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 268/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index fb17b57f23..12c3589edc 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From da9a6b009a0cf1899f2739b9061558ff730ca3b6 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 269/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 12c3589edc..fb17b57f23 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 48b4255302ec79e0d4a9e675b42f08721411e34d Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 270/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index fb17b57f23..12c3589edc 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 5be432f747eccfde0a25cf4d9f97cf6996fff206 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 271/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 12c3589edc..fb17b57f23 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 43f078f96a223cb031b6973dc4c0f4dcb34ac76b Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 272/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index fb17b57f23..12c3589edc 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 6be1da4f70112a4bf1a49010dfbbf0123e2936bd Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 273/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 12c3589edc..fb17b57f23 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 4c52dd2a3fff6acfaa6e4c51593818fbedf73a39 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 274/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 163 +++++++++++++++++++-- 1 file changed, 149 insertions(+), 14 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index fb17b57f23..c8226368c7 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -8,6 +8,7 @@ import pickle import pandas as pd import json +import datetime import traceback import warnings import sys @@ -121,6 +122,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -156,17 +292,12 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, - # the state is not transformed to 'Established' or - # 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others # So transform here - dataset["state"] = dataset.apply( - lambda row: get_final_state_from_flags( - row["state"], row["pkts"] - ), - axis=1, - ) - # dataset.state = new_state_column + #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) + dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) + + #dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -393,21 +524,25 @@ def read_model(self): def set_evidence_malicious_flow(self, flow: dict, twid: str): confidence: float = 0.1 description = ( - f"Flow with malicious characteristics by ML. Src IP" + f"Malicious flow by ML. Src IP" f" {flow['saddr']}:{flow['sport']} to " f"{flow['daddr']}:{flow['dport']}" ) + + timestamp = utils.convert_format( + datetime.datetime.now(), utils.alerts_format + ) twid_number = int(twid.replace("timewindow", "")) evidence: Evidence = Evidence( evidence_type=EvidenceType.MALICIOUS_FLOW, attacker=Attacker( direction=Direction.SRC, - ioc_type=IoCType.IP, + attacker_type=IoCType.IP, value=flow["saddr"], ), victim=Victim( direction=Direction.DST, - ioc_type=IoCType.IP, + victim_type=IoCType.IP, value=flow["daddr"], ), threat_level=ThreatLevel.LOW, @@ -416,7 +551,7 @@ def set_evidence_malicious_flow(self, flow: dict, twid: str): profile=ProfileID(ip=flow["saddr"]), timewindow=TimeWindow(twid_number), uid=[flow["uid"]], - timestamp=flow["starttime"], + timestamp=timestamp, method=Method.AI, src_port=flow["sport"], dst_port=flow["dport"], From 0b646faa189b0097648fb7283e91121aa211f19f Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:35:21 +0300 Subject: [PATCH 275/455] state_handler: split get_final_state_from_flags() into smaller functions --- slips_files/common/state_handler.py | 67 ++++++++++++++++------------- 1 file changed, 38 insertions(+), 29 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index d0a05115bd..b671a09a28 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,7 +1,9 @@ from typing import Optional +import sys +import traceback -def interpret_suricata_states(state) -> Optional[str]: +def check_suricata_states(state) -> Optional[str]: """ There are different states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -16,7 +18,7 @@ def interpret_suricata_states(state) -> Optional[str]: return "Not Established" -def interpret_zeek_states(state) -> Optional[str]: +def check_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. # For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -25,13 +27,9 @@ def interpret_zeek_states(state) -> Optional[str]: return "Established" -def interpret_argus_states(state) -> Optional[str]: +def check_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - try: - suf = state.split("_")[1] - except IndexError: - return - + suf = state.split("_")[1] if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -88,7 +86,7 @@ def interpret_argus_states(state) -> Optional[str]: return "Not Established" -def interpret_tcp_states(state, pkts) -> Optional[str]: +def check_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -124,7 +122,7 @@ def interpret_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def interpret_udp_states(state) -> Optional[str]: +def check_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -136,7 +134,7 @@ def interpret_udp_states(state) -> Optional[str]: return "Not Established" -def interpret_icmp_states(state) -> Optional[str]: +def check_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -146,25 +144,36 @@ def interpret_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(state, pkts) -> str: +def get_final_state_from_flags(self, state, pkts) -> str: """ - Converts the original flags from the flow, to a state that slips - understands - Works with Argus, suricata, and Bro flags - We receive the packets to distinguish some Reset connections + Analyze the flags given and return a summary of the state. + Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections """ + try: + if state := check_suricata_states(state): + return state + if state := check_zeek_states(state): + return state + if state := check_argus_states(state): + return state + except IndexError: + # suf does not exist, which means that this is some ICMP or + # no response was sent for UDP or TCP + if state := check_icmp_states(state): + return state + if state := check_udp_states(state): + return state + if state := check_tcp_states(state, pkts): + return state - for interpreter in ( - interpret_suricata_states, - interpret_zeek_states, - interpret_argus_states, - interpret_icmp_states, - interpret_udp_states, - ): - if interpreted_state := interpreter(state): - return interpreted_state - - if interpreted_state := interpret_tcp_states(state, pkts): - return interpreted_state + return "Not Established" - return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() " f"line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) From a477d089a3d8dd0391bb34de0261d7dafe23af2a Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:48:16 +0300 Subject: [PATCH 276/455] state_handler: refactor get_final_state_from_flags() --- slips_files/common/state_handler.py | 67 +++++++++++++---------------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index b671a09a28..d0a05115bd 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,9 +1,7 @@ from typing import Optional -import sys -import traceback -def check_suricata_states(state) -> Optional[str]: +def interpret_suricata_states(state) -> Optional[str]: """ There are different states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]: return "Not Established" -def check_zeek_states(state) -> Optional[str]: +def interpret_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. # For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]: return "Established" -def check_argus_states(state) -> Optional[str]: +def interpret_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - suf = state.split("_")[1] + try: + suf = state.split("_")[1] + except IndexError: + return + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]: return "Not Established" -def check_tcp_states(state, pkts) -> Optional[str]: +def interpret_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def check_udp_states(state) -> Optional[str]: +def interpret_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]: return "Not Established" -def check_icmp_states(state) -> Optional[str]: +def interpret_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(self, state, pkts) -> str: +def get_final_state_from_flags(state, pkts) -> str: """ - Analyze the flags given and return a summary of the state. - Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections + Converts the original flags from the flow, to a state that slips + understands + Works with Argus, suricata, and Bro flags + We receive the packets to distinguish some Reset connections """ - try: - if state := check_suricata_states(state): - return state - if state := check_zeek_states(state): - return state - if state := check_argus_states(state): - return state - except IndexError: - # suf does not exist, which means that this is some ICMP or - # no response was sent for UDP or TCP - if state := check_icmp_states(state): - return state - if state := check_udp_states(state): - return state - if state := check_tcp_states(state, pkts): - return state - return "Not Established" + for interpreter in ( + interpret_suricata_states, + interpret_zeek_states, + interpret_argus_states, + interpret_icmp_states, + interpret_udp_states, + ): + if interpreted_state := interpreter(state): + return interpreted_state + + if interpreted_state := interpret_tcp_states(state, pkts): + return interpreted_state - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() " f"line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) + return "Not Established" From a74d1c5c6fc38842a6a3143ba91e8aae0c4c8599 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 277/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 150 ++------------------- 1 file changed, 10 insertions(+), 140 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index c8226368c7..9af514a709 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -122,141 +122,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -292,12 +157,17 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others # So transform here - #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) - dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) - - #dataset.state = new_state_column + dataset["state"] = dataset.apply( + lambda row: get_final_state_from_flags( + row["state"], row["pkts"] + ), + axis=1, + ) + # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( From 560a37b8ef1724010ec2f653ab6e686efbfe9fdb Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 278/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 9af514a709..94eb27afdf 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -122,6 +122,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 5190917ba7031d744def42bf9d0d1510a59746cc Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 279/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 94eb27afdf..9af514a709 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -122,141 +122,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 567f4393ad7832b554e8684c026fad71fe6d0b3e Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 280/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 150 +++++++++++++++++++-- 1 file changed, 140 insertions(+), 10 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 9af514a709..c8226368c7 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -122,6 +122,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -157,17 +292,12 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, - # the state is not transformed to 'Established' or - # 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others # So transform here - dataset["state"] = dataset.apply( - lambda row: get_final_state_from_flags( - row["state"], row["pkts"] - ), - axis=1, - ) - # dataset.state = new_state_column + #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) + dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) + + #dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( From 626a5c3d5bb9f9cb94d5b1d91f4c61c4913247a1 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:35:21 +0300 Subject: [PATCH 281/455] state_handler: split get_final_state_from_flags() into smaller functions --- slips_files/common/state_handler.py | 67 ++++++++++++++++------------- 1 file changed, 38 insertions(+), 29 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index d0a05115bd..b671a09a28 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,7 +1,9 @@ from typing import Optional +import sys +import traceback -def interpret_suricata_states(state) -> Optional[str]: +def check_suricata_states(state) -> Optional[str]: """ There are different states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -16,7 +18,7 @@ def interpret_suricata_states(state) -> Optional[str]: return "Not Established" -def interpret_zeek_states(state) -> Optional[str]: +def check_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. # For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -25,13 +27,9 @@ def interpret_zeek_states(state) -> Optional[str]: return "Established" -def interpret_argus_states(state) -> Optional[str]: +def check_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - try: - suf = state.split("_")[1] - except IndexError: - return - + suf = state.split("_")[1] if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -88,7 +86,7 @@ def interpret_argus_states(state) -> Optional[str]: return "Not Established" -def interpret_tcp_states(state, pkts) -> Optional[str]: +def check_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -124,7 +122,7 @@ def interpret_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def interpret_udp_states(state) -> Optional[str]: +def check_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -136,7 +134,7 @@ def interpret_udp_states(state) -> Optional[str]: return "Not Established" -def interpret_icmp_states(state) -> Optional[str]: +def check_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -146,25 +144,36 @@ def interpret_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(state, pkts) -> str: +def get_final_state_from_flags(self, state, pkts) -> str: """ - Converts the original flags from the flow, to a state that slips - understands - Works with Argus, suricata, and Bro flags - We receive the packets to distinguish some Reset connections + Analyze the flags given and return a summary of the state. + Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections """ + try: + if state := check_suricata_states(state): + return state + if state := check_zeek_states(state): + return state + if state := check_argus_states(state): + return state + except IndexError: + # suf does not exist, which means that this is some ICMP or + # no response was sent for UDP or TCP + if state := check_icmp_states(state): + return state + if state := check_udp_states(state): + return state + if state := check_tcp_states(state, pkts): + return state - for interpreter in ( - interpret_suricata_states, - interpret_zeek_states, - interpret_argus_states, - interpret_icmp_states, - interpret_udp_states, - ): - if interpreted_state := interpreter(state): - return interpreted_state - - if interpreted_state := interpret_tcp_states(state, pkts): - return interpreted_state + return "Not Established" - return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() " f"line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) From 2c2212290619b7bccb25ef045f3a2ba3f4f5a270 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:48:16 +0300 Subject: [PATCH 282/455] state_handler: refactor get_final_state_from_flags() --- slips_files/common/state_handler.py | 67 +++++++++++++---------------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index b671a09a28..d0a05115bd 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,9 +1,7 @@ from typing import Optional -import sys -import traceback -def check_suricata_states(state) -> Optional[str]: +def interpret_suricata_states(state) -> Optional[str]: """ There are different states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]: return "Not Established" -def check_zeek_states(state) -> Optional[str]: +def interpret_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. # For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]: return "Established" -def check_argus_states(state) -> Optional[str]: +def interpret_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - suf = state.split("_")[1] + try: + suf = state.split("_")[1] + except IndexError: + return + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]: return "Not Established" -def check_tcp_states(state, pkts) -> Optional[str]: +def interpret_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def check_udp_states(state) -> Optional[str]: +def interpret_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]: return "Not Established" -def check_icmp_states(state) -> Optional[str]: +def interpret_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(self, state, pkts) -> str: +def get_final_state_from_flags(state, pkts) -> str: """ - Analyze the flags given and return a summary of the state. - Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections + Converts the original flags from the flow, to a state that slips + understands + Works with Argus, suricata, and Bro flags + We receive the packets to distinguish some Reset connections """ - try: - if state := check_suricata_states(state): - return state - if state := check_zeek_states(state): - return state - if state := check_argus_states(state): - return state - except IndexError: - # suf does not exist, which means that this is some ICMP or - # no response was sent for UDP or TCP - if state := check_icmp_states(state): - return state - if state := check_udp_states(state): - return state - if state := check_tcp_states(state, pkts): - return state - return "Not Established" + for interpreter in ( + interpret_suricata_states, + interpret_zeek_states, + interpret_argus_states, + interpret_icmp_states, + interpret_udp_states, + ): + if interpreted_state := interpreter(state): + return interpreted_state + + if interpreted_state := interpret_tcp_states(state, pkts): + return interpreted_state - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() " f"line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) + return "Not Established" From 6bed5ff1a0bef41b33a1cd5b07dcf89cb2a43ab6 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 283/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 150 ++------------------- 1 file changed, 10 insertions(+), 140 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index c8226368c7..9af514a709 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -122,141 +122,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -292,12 +157,17 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others # So transform here - #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) - dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) - - #dataset.state = new_state_column + dataset["state"] = dataset.apply( + lambda row: get_final_state_from_flags( + row["state"], row["pkts"] + ), + axis=1, + ) + # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( From d5ea6803c87520eee8061d06dfce7a75159238b3 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 284/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 438 +++++++++++++-------- 1 file changed, 278 insertions(+), 160 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 9af514a709..124ec61f91 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -1,8 +1,3 @@ -# SPDX-FileCopyrightText: 2021 Sebastian Garcia -from typing import Optional - -# SPDX-License-Identifier: GPL-2.0-only -import numpy from sklearn.linear_model import SGDClassifier from sklearn.preprocessing import StandardScaler import pickle @@ -10,13 +5,10 @@ import json import datetime import traceback -import warnings import sys -from slips_files.common.parsers.config_parser import ConfigParser -from slips_files.common.slips_utils import utils -from slips_files.common.abstracts.module import IModule -from slips_files.core.structures.evidence import ( +from slips_files.common.imports import * +from slips_files.core.evidence_structure.evidence import ( Evidence, ProfileID, TimeWindow, @@ -25,8 +17,7 @@ EvidenceType, IoCType, Direction, - Victim, - Method, + IDEACategory, ) # Only for debbuging @@ -38,6 +29,8 @@ def warn(*args, **kwargs): pass +import warnings + warnings.warn = warn @@ -63,8 +56,6 @@ def init(self): # self.scores = [] # The scaler trained during training and to use during testing self.scaler = StandardScaler() - self.model_path = "./modules/flowmldetection/model.bin" - self.scaler_path = "./modules/flowmldetection/scaler.bin" def read_configuration(self): conf = ConfigParser() @@ -122,6 +113,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -130,7 +256,7 @@ def process_features(self, dataset): """ try: # Discard some type of flows that dont have ports - to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""] + to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp"] for proto in to_discard: dataset = dataset[dataset.proto != proto] @@ -139,35 +265,28 @@ def process_features(self, dataset): "appproto", "daddr", "saddr", - "starttime", + "ts", + "origstate", "type_", - "smac", - "dmac", - "history", - "uid", "dir_", + "history", "dbytes", - "endtime", - "bytes", - "flow_source", + "dpkts", + "smac", + "dmac", ] for field in to_drop: try: dataset = dataset.drop(field, axis=1) - except (ValueError, KeyError): + except ValueError: pass - # When flows are read from Slips sqlite, - # the state is not transformed to 'Established' or - # 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others # So transform here - dataset["state"] = dataset.apply( - lambda row: get_final_state_from_flags( - row["state"], row["pkts"] - ), - axis=1, - ) - # dataset.state = new_state_column + #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) + dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) + + #dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -201,23 +320,42 @@ def process_features(self, dataset): dataset.proto = dataset.proto.str.replace( r"(^.*arp.*$)", "4", regex=True ) - fields_to_convert_to_flow = [ - dataset.proto, - dataset.dport, - dataset.sport, - dataset.dur, - dataset.pkts, - dataset.spkts, - dataset.allbytes, - dataset.sbytes, - dataset.state, - ] - for field in fields_to_convert_to_flow: - try: - field = field.astype("float64") - except ValueError: - pass - + dataset.proto = dataset.proto.astype("float64") + try: + # Convert dport to float + dataset.dport = dataset.dport.astype("float") + except ValueError: + pass + try: + # Convert sport to float + dataset.sport = dataset.sport.astype("float") + except ValueError: + pass + try: + # Convert Dur to float + dataset.dur = dataset.dur.astype("float") + except ValueError: + pass + try: + # Convert TotPkts to float + dataset.pkts = dataset.pkts.astype("float") + except ValueError: + pass + try: + # Convert SrcPkts to float + dataset.spkts = dataset.spkts.astype("float") + except ValueError: + pass + try: + # Convert TotBytes to float + dataset.allbytes = dataset.allbytes.astype("float") + except ValueError: + pass + try: + # Convert SrcBytes to float + dataset.sbytes = dataset.sbytes.astype("float") + except ValueError: + pass return dataset except Exception: # Stop the timer @@ -233,6 +371,7 @@ def process_flows(self): # We get all the flows so far # because this retraining happens in batches flows = self.db.get_all_flows() + # Check how many different labels are in the DB # We need both normal and malware labels = self.db.get_labels() @@ -252,7 +391,9 @@ def process_flows(self): "daddr": "40.70.224.145", "dport": "443", "proto": "tcp", + "origstate": "SRPA_SPA", "state": "Established", + "pkts": 84, "allbytes": 42764, "spkts": 37, "sbytes": 25517, @@ -272,7 +413,9 @@ def process_flows(self): "daddr": "80.242.138.72", "dport": "80", "proto": "tcp", + "origstate": "SRPA_SPA", "state": "Established", + "pkts": 67, "allbytes": 67696, "spkts": 1, "sbytes": 100, @@ -298,55 +441,42 @@ def process_flows(self): self.print("Error in process_flows()") self.print(traceback.format_exc(), 0, 1) - def process_flow(self, flow_to_process: dict): + def process_flow(self): """ Process one flow. Only used during detection in testing - returns the pandas df with the processed flow + Store the pandas df in self.flow """ try: # Convert the flow to a pandas dataframe - raw_flow = pd.DataFrame(flow_to_process, index=[0]) + raw_flow = pd.DataFrame(self.flow_dict, index=[0]) + # Process features dflow = self.process_features(raw_flow) # Update the flow to the processed version - return dflow + self.flow = dflow except Exception: # Stop the timer self.print("Error in process_flow()") self.print(traceback.format_exc(), 0, 1) - def detect(self, x_flow) -> Optional[numpy.ndarray]: + def detect(self): """ - Detects the given flow with the current model stored - and returns the predection array + Detect this flow with the current model stored """ try: - given_x_flow = x_flow - # clean the flow - fields_to_drop = [ - "label", - "module_labels", - "uid", - "history", - "dir_", - "dbytes", - "dpkts", - "endtime", - "bytes", - "flow_source", - ] - for field in fields_to_drop: - try: - x_flow = x_flow.drop(field, axis=1) - except (KeyError, ValueError): - pass + # Store the real label if there is one + y_flow = self.flow["label"] + # remove the real label column + self.flow = self.flow.drop("label", axis=1) + # remove the label predictions column of the other modules + X_flow = self.flow.drop("module_labels", axis=1) # Scale the flow - x_flow: numpy.ndarray = self.scaler.transform(x_flow) - pred: numpy.ndarray = self.clf.predict(x_flow) + X_flow = self.scaler.transform(X_flow) + pred = self.clf.predict(X_flow) return pred - except Exception as e: - self.print( - f"Error in detect() while processing " f"\n{given_x_flow}\n{e}" - ) + except Exception: + # Stop the timer + self.print("Error in detect() X_flow:") + self.print(X_flow) self.print(traceback.format_exc(), 0, 1) def store_model(self): @@ -354,10 +484,10 @@ def store_model(self): Store the trained model on disk """ self.print("Storing the trained model and scaler on disk.", 0, 2) - with open(self.model_path, "wb") as f: + with open("./modules/flowmldetection/model.bin", "wb") as f: data = pickle.dumps(self.clf) f.write(data) - with open(self.scaler_path, "wb") as g: + with open("./modules/flowmldetection/scaler.bin", "wb") as g: data = pickle.dumps(self.scaler) g.write(data) @@ -367,23 +497,20 @@ def read_model(self): """ try: self.print("Reading the trained model from disk.", 0, 2) - with open(self.model_path, "rb") as f: + with open("./modules/flowmldetection/model.bin", "rb") as f: self.clf = pickle.load(f) self.print("Reading the trained scaler from disk.", 0, 2) - with open(self.scaler_path, "rb") as g: + with open("./modules/flowmldetection/scaler.bin", "rb") as g: self.scaler = pickle.load(g) except FileNotFoundError: # If there is no model, create one empty - self.print( - "There was no model. " "Creating a new empty model.", 0, 2 - ) + self.print("There was no model. Creating a new empty model.", 0, 2) self.clf = SGDClassifier( warm_start=True, loss="hinge", penalty="l1" ) except EOFError: self.print( - "Error reading model from disk. " - "Creating a new empty model.", + "Error reading model from disk. Creating a new empty model.", 0, 2, ) @@ -391,40 +518,39 @@ def read_model(self): warm_start=True, loss="hinge", penalty="l1" ) - def set_evidence_malicious_flow(self, flow: dict, twid: str): + def set_evidence_malicious_flow( + self, + saddr: str, + sport: str, + daddr: str, + dport: str, + twid: str, + uid: str, + ): confidence: float = 0.1 + ip_identification = self.db.get_ip_identification(daddr) description = ( - f"Malicious flow by ML. Src IP" - f" {flow['saddr']}:{flow['sport']} to " - f"{flow['daddr']}:{flow['dport']}" + f"Malicious flow by ML. Src IP {saddr}:{sport} to " + f"{daddr}:{dport} {ip_identification}" ) timestamp = utils.convert_format( datetime.datetime.now(), utils.alerts_format ) - twid_number = int(twid.replace("timewindow", "")) + evidence: Evidence = Evidence( evidence_type=EvidenceType.MALICIOUS_FLOW, attacker=Attacker( - direction=Direction.SRC, - attacker_type=IoCType.IP, - value=flow["saddr"], - ), - victim=Victim( - direction=Direction.DST, - victim_type=IoCType.IP, - value=flow["daddr"], + direction=Direction.SRC, attacker_type=IoCType.IP, value=saddr ), threat_level=ThreatLevel.LOW, confidence=confidence, description=description, - profile=ProfileID(ip=flow["saddr"]), - timewindow=TimeWindow(twid_number), - uid=[flow["uid"]], + profile=ProfileID(ip=saddr), + timewindow=TimeWindow(number=int(twid.replace("timewindow", ""))), + uid=[uid], timestamp=timestamp, - method=Method.AI, - src_port=flow["sport"], - dst_port=flow["dport"], + category=IDEACategory.ANOMALY_TRAFFIC, ) self.db.set_evidence(evidence) @@ -441,22 +567,20 @@ def pre_main(self): def main(self): if msg := self.get_msg("new_flow"): - msg = json.loads(msg["data"]) - twid = msg["twid"] - self.flow = msg["flow"] - # these fields are expected in testing. update the original - # flow dict to have them - self.flow.update( - { - "allbytes": (self.flow["sbytes"] + self.flow["dbytes"]), - # the flow["state"] is the origstate, we dont need that here - # we need the interpreted state - "state": msg["interpreted_state"], - "pkts": self.flow["spkts"] + self.flow["dpkts"], - "label": msg["label"], - "module_labels": msg["module_labels"], - } - ) + data = msg["data"] + # Convert from json to dict + data = json.loads(data) + profileid = data["profileid"] + twid = data["twid"] + # Get flow that is now in json format + flow = data["flow"] + # Convert flow to a dict + flow = json.loads(flow) + # Convert the common fields to something that can + # be interpreted + # Get the uid which is the key + uid = next(iter(flow)) + self.flow_dict = json.loads(flow[uid]) if self.mode == "train": # We are training @@ -469,57 +593,51 @@ def main(self): sum_labeled_flows >= self.minimum_lables_to_retrain and sum_labeled_flows % self.minimum_lables_to_retrain == 1 ): - # We get here every 'self.minimum_lables_to_retrain' - # amount of labels - # So for example we retrain every 100 labels and only when - # we have at least 100 labels + # We get here every 'self.minimum_lables_to_retrain' amount of labels + # So for example we retrain every 100 labels and only when we have at least 100 labels self.print( - f"Training the model with the last group of " - f"flows and labels. Total flows: {sum_labeled_flows}." + f"Training the model with the last group of flows and labels. Total flows: {sum_labeled_flows}." ) - # Process all flows in the DB and make them ready - # for pandas + # Process all flows in the DB and make them ready for pandas self.process_flows() # Train an algorithm self.train() elif self.mode == "test": # We are testing, which means using the model to detect - processed_flow = self.process_flow(self.flow) + self.process_flow() - # After processing the flow, it may happen that we - # delete icmp/arp/etc so the dataframe can be empty - if processed_flow is not None and not processed_flow.empty: + # After processing the flow, it may happen that we delete icmp/arp/etc + # so the dataframe can be empty + if self.flow is not None and not self.flow.empty: # Predict - pred: numpy.ndarray = self.detect(processed_flow) - if not pred: - # an error occurred - return + pred = self.detect() + label = self.flow_dict["label"] - label = self.flow["label"] + # Report if label and label != "unknown" and label != pred[0]: - # If the user specified a label in test mode, - # and the label is diff from the prediction, - # print in debug mode + # If the user specified a label in test mode, and the label + # is diff from the prediction, print in debug mode self.print( - f"Report Prediction {pred[0]} for label" - f' {label} flow {self.flow["saddr"]}:' - f'{self.flow["sport"]} ->' - f' {self.flow["daddr"]}:' - f'{self.flow["dport"]}/' - f'{self.flow["proto"]}', + f'Report Prediction {pred[0]} for label {label} flow {self.flow_dict["saddr"]}:' + f'{self.flow_dict["sport"]} -> {self.flow_dict["daddr"]}:' + f'{self.flow_dict["dport"]}/{self.flow_dict["proto"]}', 0, 3, ) if pred[0] == "Malware": # Generate an alert - self.set_evidence_malicious_flow(self.flow, twid) + self.set_evidence_malicious_flow( + self.flow_dict["saddr"], + self.flow_dict["sport"], + self.flow_dict["daddr"], + self.flow_dict["dport"], + twid, + uid, + ) self.print( - f"Prediction {pred[0]} for label {label}" - f' flow {self.flow["saddr"]}:' - f'{self.flow["sport"]} -> ' - f'{self.flow["daddr"]}:' - f'{self.flow["dport"]}/' - f'{self.flow["proto"]}', + f'Prediction {pred[0]} for label {label} flow {self.flow_dict["saddr"]}:' + f'{self.flow_dict["sport"]} -> {self.flow_dict["daddr"]}:' + f'{self.flow_dict["dport"]}/{self.flow_dict["proto"]}', 0, 2, ) From 0e07e32ecc9922fb33f034bf05c3f8888b0938ab Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:35:21 +0300 Subject: [PATCH 285/455] state_handler: split get_final_state_from_flags() into smaller functions --- slips_files/common/state_handler.py | 67 ++++++++++++++++------------- 1 file changed, 38 insertions(+), 29 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index d0a05115bd..b671a09a28 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,7 +1,9 @@ from typing import Optional +import sys +import traceback -def interpret_suricata_states(state) -> Optional[str]: +def check_suricata_states(state) -> Optional[str]: """ There are different states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -16,7 +18,7 @@ def interpret_suricata_states(state) -> Optional[str]: return "Not Established" -def interpret_zeek_states(state) -> Optional[str]: +def check_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. # For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -25,13 +27,9 @@ def interpret_zeek_states(state) -> Optional[str]: return "Established" -def interpret_argus_states(state) -> Optional[str]: +def check_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - try: - suf = state.split("_")[1] - except IndexError: - return - + suf = state.split("_")[1] if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -88,7 +86,7 @@ def interpret_argus_states(state) -> Optional[str]: return "Not Established" -def interpret_tcp_states(state, pkts) -> Optional[str]: +def check_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -124,7 +122,7 @@ def interpret_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def interpret_udp_states(state) -> Optional[str]: +def check_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -136,7 +134,7 @@ def interpret_udp_states(state) -> Optional[str]: return "Not Established" -def interpret_icmp_states(state) -> Optional[str]: +def check_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -146,25 +144,36 @@ def interpret_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(state, pkts) -> str: +def get_final_state_from_flags(self, state, pkts) -> str: """ - Converts the original flags from the flow, to a state that slips - understands - Works with Argus, suricata, and Bro flags - We receive the packets to distinguish some Reset connections + Analyze the flags given and return a summary of the state. + Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections """ + try: + if state := check_suricata_states(state): + return state + if state := check_zeek_states(state): + return state + if state := check_argus_states(state): + return state + except IndexError: + # suf does not exist, which means that this is some ICMP or + # no response was sent for UDP or TCP + if state := check_icmp_states(state): + return state + if state := check_udp_states(state): + return state + if state := check_tcp_states(state, pkts): + return state - for interpreter in ( - interpret_suricata_states, - interpret_zeek_states, - interpret_argus_states, - interpret_icmp_states, - interpret_udp_states, - ): - if interpreted_state := interpreter(state): - return interpreted_state - - if interpreted_state := interpret_tcp_states(state, pkts): - return interpreted_state + return "Not Established" - return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() " f"line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) From 000e8926166c4c4f4af17b8cf157bf2d37472950 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:48:16 +0300 Subject: [PATCH 286/455] state_handler: refactor get_final_state_from_flags() --- slips_files/common/state_handler.py | 67 +++++++++++++---------------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index b671a09a28..d0a05115bd 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,9 +1,7 @@ from typing import Optional -import sys -import traceback -def check_suricata_states(state) -> Optional[str]: +def interpret_suricata_states(state) -> Optional[str]: """ There are different states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]: return "Not Established" -def check_zeek_states(state) -> Optional[str]: +def interpret_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. # For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]: return "Established" -def check_argus_states(state) -> Optional[str]: +def interpret_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - suf = state.split("_")[1] + try: + suf = state.split("_")[1] + except IndexError: + return + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]: return "Not Established" -def check_tcp_states(state, pkts) -> Optional[str]: +def interpret_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def check_udp_states(state) -> Optional[str]: +def interpret_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]: return "Not Established" -def check_icmp_states(state) -> Optional[str]: +def interpret_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(self, state, pkts) -> str: +def get_final_state_from_flags(state, pkts) -> str: """ - Analyze the flags given and return a summary of the state. - Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections + Converts the original flags from the flow, to a state that slips + understands + Works with Argus, suricata, and Bro flags + We receive the packets to distinguish some Reset connections """ - try: - if state := check_suricata_states(state): - return state - if state := check_zeek_states(state): - return state - if state := check_argus_states(state): - return state - except IndexError: - # suf does not exist, which means that this is some ICMP or - # no response was sent for UDP or TCP - if state := check_icmp_states(state): - return state - if state := check_udp_states(state): - return state - if state := check_tcp_states(state, pkts): - return state - return "Not Established" + for interpreter in ( + interpret_suricata_states, + interpret_zeek_states, + interpret_argus_states, + interpret_icmp_states, + interpret_udp_states, + ): + if interpreted_state := interpreter(state): + return interpreted_state + + if interpreted_state := interpret_tcp_states(state, pkts): + return interpreted_state - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() " f"line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) + return "Not Established" From 0955f66abeb7f5e0f97459abc63d276730ab6868 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 287/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 169 +++------------------ 1 file changed, 19 insertions(+), 150 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 124ec61f91..c57a7a3581 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -5,9 +5,13 @@ import json import datetime import traceback -import sys +import warnings + -from slips_files.common.imports import * +from slips_files.common.state_handler import get_final_state_from_flags +from slips_files.common.parsers.config_parser import ConfigParser +from slips_files.common.slips_utils import utils +from slips_files.common.abstracts.module import IModule from slips_files.core.evidence_structure.evidence import ( Evidence, ProfileID, @@ -29,8 +33,6 @@ def warn(*args, **kwargs): pass -import warnings - warnings.warn = warn @@ -113,141 +115,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -281,12 +148,17 @@ def process_features(self, dataset): except ValueError: pass - # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others # So transform here - #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) - dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) - - #dataset.state = new_state_column + dataset["state"] = dataset.apply( + lambda row: get_final_state_from_flags( + row["state"], row["pkts"] + ), + axis=1, + ) + # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -370,7 +242,7 @@ def process_flows(self): try: # We get all the flows so far # because this retraining happens in batches - flows = self.db.get_all_flows() + flows: list = self.db.get_all_flows() # Check how many different labels are in the DB # We need both normal and malware @@ -464,7 +336,7 @@ def detect(self): """ try: # Store the real label if there is one - y_flow = self.flow["label"] + # y_flow = self.flow["label"] # remove the real label column self.flow = self.flow.drop("label", axis=1) # remove the label predictions column of the other modules @@ -568,13 +440,10 @@ def pre_main(self): def main(self): if msg := self.get_msg("new_flow"): data = msg["data"] - # Convert from json to dict data = json.loads(data) - profileid = data["profileid"] + # profileid = data["profileid"] twid = data["twid"] - # Get flow that is now in json format flow = data["flow"] - # Convert flow to a dict flow = json.loads(flow) # Convert the common fields to something that can # be interpreted From 088d9270622d332b34eb39fe23d1e540257188b6 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:36:55 +0200 Subject: [PATCH 288/455] mlflow. Ignore UID column --- modules/flowmldetection/flowmldetection.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index c57a7a3581..e2aa1e0ee3 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -141,6 +141,7 @@ def process_features(self, dataset): "dpkts", "smac", "dmac", + "uid", ] for field in to_drop: try: From 51f5f2f76934d8add93b8ec09190317d421cdc93 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:23:29 +0100 Subject: [PATCH 289/455] Re add function that alya added --- slips_files/core/database/database_manager.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index 3a7f783ea7..0b805976df 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -613,6 +613,9 @@ def add_out_dns(self, *args, **kwargs): def add_port(self, *args, **kwargs): return self.rdb.add_port(*args, **kwargs) + def get_final_state_from_flags(self, *args, **kwargs): + return self.rdb.get_final_state_from_flags(*args, **kwargs) + def add_ips(self, *args, **kwargs): return self.rdb.add_ips(*args, **kwargs) From 38c5d55481cc57d81ccba540ffbb2d4811c39e6d Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:27:23 +0100 Subject: [PATCH 290/455] Delete file that was deleted from develop --- slips_files/common/state_handler.py | 170 ---------------------------- 1 file changed, 170 deletions(-) delete mode 100644 slips_files/common/state_handler.py diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py deleted file mode 100644 index d0a05115bd..0000000000 --- a/slips_files/common/state_handler.py +++ /dev/null @@ -1,170 +0,0 @@ -from typing import Optional - - -def interpret_suricata_states(state) -> Optional[str]: - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for - UDP. For TCP, - these are: New, Established and Closed,for UDP only new and - established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - -def interpret_zeek_states(state) -> Optional[str]: - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - -def interpret_argus_states(state) -> Optional[str]: - pre = state.split("_")[0] - try: - suf = state.split("_")[1] - except IndexError: - return - - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - - -def interpret_tcp_states(state, pkts) -> Optional[str]: - pre = state.split("_")[0] - if "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. - # Most connections are reseted when finished and therefore are - # established - # It can happen that is reseted being not established, but we - # can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is - # not established because the OS retries 3 times. - return "Not Established" if int(pkts) <= 3 else "Established" - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. - # Most connections are finished with FIN when finished and - # therefore are established - # It can happen that is finished being not established, but we - # can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is - # not established because the OS retries 3 times. - return "Not Established" if int(pkts) <= 3 else "Established" - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - - -def interpret_udp_states(state) -> Optional[str]: - pre = state.split("_")[0] - if "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also - # NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - - -def interpret_icmp_states(state) -> Optional[str]: - pre = state.split("_")[0] - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - - -def get_final_state_from_flags(state, pkts) -> str: - """ - Converts the original flags from the flow, to a state that slips - understands - Works with Argus, suricata, and Bro flags - We receive the packets to distinguish some Reset connections - """ - - for interpreter in ( - interpret_suricata_states, - interpret_zeek_states, - interpret_argus_states, - interpret_icmp_states, - interpret_udp_states, - ): - if interpreted_state := interpreter(state): - return interpreted_state - - if interpreted_state := interpret_tcp_states(state, pkts): - return interpreted_state - - return "Not Established" From c15b430c419997b224a9ef1b4d5a8cd99195d0b8 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:32:01 +0100 Subject: [PATCH 291/455] Flowmldetection. Fix missing db reference --- modules/flowmldetection/flowmldetection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index e2aa1e0ee3..9269b67012 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -154,7 +154,7 @@ def process_features(self, dataset): # 'Not Established', it is still 'S0' and others # So transform here dataset["state"] = dataset.apply( - lambda row: get_final_state_from_flags( + lambda row: self.db.get_final_state_from_flags( row["state"], row["pkts"] ), axis=1, From dc2ced3b23a3dac2e11b8d71a3d3bb236d7a7703 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Tue, 18 Mar 2025 12:08:08 +0100 Subject: [PATCH 292/455] Fix the training of flows with ML in new version --- modules/flowmldetection/flowmldetection.py | 378 +++++++++++---------- 1 file changed, 197 insertions(+), 181 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 9269b67012..e6ea0b5171 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -1,18 +1,20 @@ +# SPDX-FileCopyrightText: 2021 Sebastian Garcia +from typing import Optional + +# SPDX-License-Identifier: GPL-2.0-only +import numpy from sklearn.linear_model import SGDClassifier from sklearn.preprocessing import StandardScaler import pickle import pandas as pd import json -import datetime import traceback import warnings - -from slips_files.common.state_handler import get_final_state_from_flags from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils from slips_files.common.abstracts.module import IModule -from slips_files.core.evidence_structure.evidence import ( +from slips_files.core.structures.evidence import ( Evidence, ProfileID, TimeWindow, @@ -21,7 +23,8 @@ EvidenceType, IoCType, Direction, - IDEACategory, + Victim, + Method, ) # Only for debbuging @@ -52,36 +55,41 @@ def init(self): # Set the output queue of our database instance # Read the configuration self.read_configuration() - # Minum amount of new lables needed to trigger the train - self.minimum_lables_to_retrain = 50 + # Minum amount of new labels needed to start the train + self.minimum_labels_to_start_train = 50 + # Minum amount of new labels needed to retrain + self.minimum_labels_to_retrain = 50 + # The number of flows when last trained + self.last_number_of_flows_when_trained = 0 # To plot the scores of training # self.scores = [] # The scaler trained during training and to use during testing self.scaler = StandardScaler() + self.model_path = "./modules/flowmldetection/model.bin" + self.scaler_path = "./modules/flowmldetection/scaler.bin" def read_configuration(self): conf = ConfigParser() self.mode = conf.get_ml_mode() + self.label = conf.label() def train(self): """ Train a model based on the flows we receive and the labels """ try: - # Process the labels to have only Normal and Malware - self.flows.label = self.flows.label.str.replace( - r"(^.*ormal.*$)", "Normal", regex=True - ) - self.flows.label = self.flows.label.str.replace( - r"(^.*alware.*$)", "Malware", regex=True - ) - self.flows.label = self.flows.label.str.replace( - r"(^.*alicious.*$)", "Malware", regex=True - ) + # Get the flows from the DB + # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid) + # Convert to pandas df + # self.flows = pd.DataFrame(self.flows) + # Process the features + # X_flow = self.process_features(self.flows) - # Separate - y_flow = self.flows["label"] + # Create X_flow with the current flows minus the label X_flow = self.flows.drop("label", axis=1) + # Create y_flow with the label + y_flow = numpy.full(X_flow.shape[0], self.label) + # Drop the module_labels X_flow = X_flow.drop("module_labels", axis=1) # Normalize this batch of data so far. This can get progressivle slow @@ -90,7 +98,7 @@ def train(self): # Train try: self.clf.partial_fit( - X_flow, y_flow, classes=["Malware", "Normal"] + X_flow, y_flow, classes=["Malicious", "Benign"] ) except Exception: self.print("Error while calling clf.train()") @@ -113,7 +121,7 @@ def train(self): self.store_model() except Exception: - self.print("Error in train()", 0, 1) + self.print("Error in train().", 0, 1) self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): @@ -123,7 +131,7 @@ def process_features(self, dataset): """ try: # Discard some type of flows that dont have ports - to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp"] + to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""] for proto in to_discard: dataset = dataset[dataset.proto != proto] @@ -132,21 +140,20 @@ def process_features(self, dataset): "appproto", "daddr", "saddr", - "ts", - "origstate", + "starttime", "type_", - "dir_", - "history", - "dbytes", - "dpkts", "smac", "dmac", + "history", "uid", + "dir_", + "endtime", + "flow_source", ] for field in to_drop: try: dataset = dataset.drop(field, axis=1) - except ValueError: + except (ValueError, KeyError): pass # When flows are read from Slips sqlite, @@ -155,11 +162,10 @@ def process_features(self, dataset): # So transform here dataset["state"] = dataset.apply( lambda row: self.db.get_final_state_from_flags( - row["state"], row["pkts"] + row["state"], (row["spkts"] + row["dpkts"]) ), axis=1, ) - # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -193,58 +199,42 @@ def process_features(self, dataset): dataset.proto = dataset.proto.str.replace( r"(^.*arp.*$)", "4", regex=True ) - dataset.proto = dataset.proto.astype("float64") - try: - # Convert dport to float - dataset.dport = dataset.dport.astype("float") - except ValueError: - pass - try: - # Convert sport to float - dataset.sport = dataset.sport.astype("float") - except ValueError: - pass - try: - # Convert Dur to float - dataset.dur = dataset.dur.astype("float") - except ValueError: - pass - try: - # Convert TotPkts to float - dataset.pkts = dataset.pkts.astype("float") - except ValueError: - pass - try: - # Convert SrcPkts to float - dataset.spkts = dataset.spkts.astype("float") - except ValueError: - pass - try: - # Convert TotBytes to float - dataset.allbytes = dataset.allbytes.astype("float") - except ValueError: - pass - try: - # Convert SrcBytes to float - dataset.sbytes = dataset.sbytes.astype("float") - except ValueError: - pass + + dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"] + dataset["pkts"] = dataset["spkts"] + dataset["dpkts"] + + fields_to_convert_to_float = [ + dataset.proto, + dataset.dport, + dataset.sport, + dataset.dur, + dataset.pkts, + dataset.spkts, + dataset.allbytes, + dataset.sbytes, + dataset.state, + ] + for field in fields_to_convert_to_float: + try: + field = field.astype("float64") + except (ValueError, AttributeError): + pass + return dataset except Exception: # Stop the timer self.print("Error in process_features()") self.print(traceback.format_exc(), 0, 1) - def process_flows(self): + def process_training_flows(self): """ - Process all the flwos in the DB + Process all the flows in the DB Store the pandas df in self.flows """ try: # We get all the flows so far # because this retraining happens in batches - flows: list = self.db.get_all_flows() - + flows = self.db.get_all_flows() # Check how many different labels are in the DB # We need both normal and malware labels = self.db.get_labels() @@ -254,48 +244,48 @@ def process_flows(self): # that are fake but representative of a normal and malware flow # they are only for the training process # At least 1 flow of each label is required - # self.print(f'Amount of labeled flows: {labels}', 0, 1) + + # These flows should be in the same format as the ones in the DB. + # Which means the satate is still SF, S0, etc. flows.append( { - "ts": 1594417039.029793, + "starttime": 1594417039.029793, "dur": "1.9424750804901123", "saddr": "10.7.10.101", "sport": "49733", "daddr": "40.70.224.145", "dport": "443", "proto": "tcp", - "origstate": "SRPA_SPA", - "state": "Established", - "pkts": 84, - "allbytes": 42764, - "spkts": 37, + "state": "SF", + "spkts": 17, + "dpkts": 27, "sbytes": 25517, + "dbytes": 17247, "appproto": "ssl", - "label": "Malware", + "label": "Malicious", "module_labels": { - "flowalerts-long-connection": "Malware" + "flowalerts-long-connection": "Malicious" }, } ) flows.append( { - "ts": 1382355032.706468, + "starttime": 1382355032.706468, "dur": "10.896695", "saddr": "147.32.83.52", "sport": "47956", "daddr": "80.242.138.72", "dport": "80", "proto": "tcp", - "origstate": "SRPA_SPA", - "state": "Established", - "pkts": 67, - "allbytes": 67696, + "state": "SF", "spkts": 1, + "dpkts": 0, "sbytes": 100, + "dbytes": 67596, "appproto": "http", - "label": "Normal", + "label": "Benign", "module_labels": { - "flowalerts-long-connection": "Normal" + "flowalerts-long-connection": "Benign" }, } ) @@ -314,42 +304,51 @@ def process_flows(self): self.print("Error in process_flows()") self.print(traceback.format_exc(), 0, 1) - def process_flow(self): + def process_flow(self, flow_to_process: dict): """ Process one flow. Only used during detection in testing - Store the pandas df in self.flow + returns the pandas df with the processed flow """ try: # Convert the flow to a pandas dataframe - raw_flow = pd.DataFrame(self.flow_dict, index=[0]) - # Process features + raw_flow = pd.DataFrame(flow_to_process, index=[0]) dflow = self.process_features(raw_flow) # Update the flow to the processed version - self.flow = dflow + return dflow except Exception: # Stop the timer self.print("Error in process_flow()") self.print(traceback.format_exc(), 0, 1) - def detect(self): + def detect(self, x_flow) -> Optional[numpy.ndarray]: """ - Detect this flow with the current model stored + Detects the given flow with the current model stored + and returns the predection array """ try: - # Store the real label if there is one - # y_flow = self.flow["label"] - # remove the real label column - self.flow = self.flow.drop("label", axis=1) - # remove the label predictions column of the other modules - X_flow = self.flow.drop("module_labels", axis=1) + # clean the flow + fields_to_drop = [ + "label", + "module_labels", + "uid", + "history", + "dir_", + "endtime", + "flow_source", + ] + for field in fields_to_drop: + try: + x_flow = x_flow.drop(field, axis=1) + except (KeyError, ValueError): + pass # Scale the flow - X_flow = self.scaler.transform(X_flow) - pred = self.clf.predict(X_flow) + x_flow: numpy.ndarray = self.scaler.transform(x_flow) + pred: numpy.ndarray = self.clf.predict(x_flow) return pred - except Exception: - # Stop the timer - self.print("Error in detect() X_flow:") - self.print(X_flow) + except Exception as e: + self.print( + f"Error in detect() while processing " f"\n{x_flow}\n{e}" + ) self.print(traceback.format_exc(), 0, 1) def store_model(self): @@ -357,10 +356,10 @@ def store_model(self): Store the trained model on disk """ self.print("Storing the trained model and scaler on disk.", 0, 2) - with open("./modules/flowmldetection/model.bin", "wb") as f: + with open(self.model_path, "wb") as f: data = pickle.dumps(self.clf) f.write(data) - with open("./modules/flowmldetection/scaler.bin", "wb") as g: + with open(self.scaler_path, "wb") as g: data = pickle.dumps(self.scaler) g.write(data) @@ -370,20 +369,23 @@ def read_model(self): """ try: self.print("Reading the trained model from disk.", 0, 2) - with open("./modules/flowmldetection/model.bin", "rb") as f: + with open(self.model_path, "rb") as f: self.clf = pickle.load(f) self.print("Reading the trained scaler from disk.", 0, 2) - with open("./modules/flowmldetection/scaler.bin", "rb") as g: + with open(self.scaler_path, "rb") as g: self.scaler = pickle.load(g) except FileNotFoundError: # If there is no model, create one empty - self.print("There was no model. Creating a new empty model.", 0, 2) + self.print( + "There was no model. " "Creating a new empty model.", 0, 2 + ) self.clf = SGDClassifier( warm_start=True, loss="hinge", penalty="l1" ) except EOFError: self.print( - "Error reading model from disk. Creating a new empty model.", + "Error reading model from disk. " + "Creating a new empty model.", 0, 2, ) @@ -391,39 +393,36 @@ def read_model(self): warm_start=True, loss="hinge", penalty="l1" ) - def set_evidence_malicious_flow( - self, - saddr: str, - sport: str, - daddr: str, - dport: str, - twid: str, - uid: str, - ): + def set_evidence_malicious_flow(self, flow: dict, twid: str): confidence: float = 0.1 - ip_identification = self.db.get_ip_identification(daddr) description = ( - f"Malicious flow by ML. Src IP {saddr}:{sport} to " - f"{daddr}:{dport} {ip_identification}" - ) - - timestamp = utils.convert_format( - datetime.datetime.now(), utils.alerts_format + f"Flow with malicious characteristics by ML. Src IP" + f" {flow['saddr']}:{flow['sport']} to " + f"{flow['daddr']}:{flow['dport']}" ) - + twid_number = int(twid.replace("timewindow", "")) evidence: Evidence = Evidence( evidence_type=EvidenceType.MALICIOUS_FLOW, attacker=Attacker( - direction=Direction.SRC, attacker_type=IoCType.IP, value=saddr + direction=Direction.SRC, + ioc_type=IoCType.IP, + value=flow["saddr"], + ), + victim=Victim( + direction=Direction.DST, + ioc_type=IoCType.IP, + value=flow["daddr"], ), threat_level=ThreatLevel.LOW, confidence=confidence, description=description, - profile=ProfileID(ip=saddr), - timewindow=TimeWindow(number=int(twid.replace("timewindow", ""))), - uid=[uid], - timestamp=timestamp, - category=IDEACategory.ANOMALY_TRAFFIC, + profile=ProfileID(ip=flow["saddr"]), + timewindow=TimeWindow(twid_number), + uid=[flow["uid"]], + timestamp=flow["starttime"], + method=Method.AI, + src_port=flow["sport"], + dst_port=flow["dport"], ) self.db.set_evidence(evidence) @@ -440,17 +439,20 @@ def pre_main(self): def main(self): if msg := self.get_msg("new_flow"): - data = msg["data"] - data = json.loads(data) - # profileid = data["profileid"] - twid = data["twid"] - flow = data["flow"] - flow = json.loads(flow) - # Convert the common fields to something that can - # be interpreted - # Get the uid which is the key - uid = next(iter(flow)) - self.flow_dict = json.loads(flow[uid]) + # When a new flow arrives + msg = json.loads(msg["data"]) + self.twid = msg["twid"] + self.profileid = msg["profileid"] + self.flow = msg["flow"] + # These following extra fields are expected in testing. update the original + # flow dict to have them + self.flow.update( + { + "state": msg["interpreted_state"], + "label": msg["label"], + "module_labels": msg["module_labels"], + } + ) if self.mode == "train": # We are training @@ -459,55 +461,69 @@ def main(self): # Use labeled flows labels = self.db.get_labels() sum_labeled_flows = sum(i[1] for i in labels) + + # The min labels to retrain is the min number of flows + # we should have seen so far in this capture to start training + # This is so we dont _start_ training with only 1 flow + + # Once we are over the start minimum, the second condition is + # to force to retrain every a minimum_labels_to_retrain number + # of flows. So we dont retrain every 1 flow. if ( - sum_labeled_flows >= self.minimum_lables_to_retrain - and sum_labeled_flows % self.minimum_lables_to_retrain == 1 + sum_labeled_flows >= self.minimum_labels_to_start_train ): - # We get here every 'self.minimum_lables_to_retrain' amount of labels - # So for example we retrain every 100 labels and only when we have at least 100 labels - self.print( - f"Training the model with the last group of flows and labels. Total flows: {sum_labeled_flows}." - ) - # Process all flows in the DB and make them ready for pandas - self.process_flows() - # Train an algorithm - self.train() + if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain): + # So for example we retrain every 50 labels and only when + # we have at least 50 labels + self.print( + f"Training the model with the last group of " + f"flows and labels. Total flows: {sum_labeled_flows}." + ) + # Process all flows in the DB and make them ready + # for pandas + self.process_training_flows() + # Train an algorithm + self.train() + self.last_number_of_flows_when_trained = sum_labeled_flows + elif self.mode == "test": # We are testing, which means using the model to detect - self.process_flow() + processed_flow = self.process_flow(self.flow) - # After processing the flow, it may happen that we delete icmp/arp/etc - # so the dataframe can be empty - if self.flow is not None and not self.flow.empty: + # After processing the flow, it may happen that we + # delete icmp/arp/etc so the dataframe can be empty + if processed_flow is not None and not processed_flow.empty: # Predict - pred = self.detect() - label = self.flow_dict["label"] + pred: numpy.ndarray = self.detect(processed_flow) + if not pred: + # an error occurred + return - # Report + label = self.flow["label"] if label and label != "unknown" and label != pred[0]: - # If the user specified a label in test mode, and the label - # is diff from the prediction, print in debug mode + # If the user specified a label in test mode, + # and the label is diff from the prediction, + # print in debug mode self.print( - f'Report Prediction {pred[0]} for label {label} flow {self.flow_dict["saddr"]}:' - f'{self.flow_dict["sport"]} -> {self.flow_dict["daddr"]}:' - f'{self.flow_dict["dport"]}/{self.flow_dict["proto"]}', + f"Predicted {pred[0]} for ground-truth label" + f' {label}. Flow {self.flow["saddr"]}:' + f'{self.flow["sport"]} ->' + f' {self.flow["daddr"]}:' + f'{self.flow["dport"]}/' + f'{self.flow["proto"]}', 0, 3, ) - if pred[0] == "Malware": + if pred[0] == "Malicious": # Generate an alert - self.set_evidence_malicious_flow( - self.flow_dict["saddr"], - self.flow_dict["sport"], - self.flow_dict["daddr"], - self.flow_dict["dport"], - twid, - uid, - ) + self.set_evidence_malicious_flow(self.flow, self.twid) self.print( - f'Prediction {pred[0]} for label {label} flow {self.flow_dict["saddr"]}:' - f'{self.flow_dict["sport"]} -> {self.flow_dict["daddr"]}:' - f'{self.flow_dict["dport"]}/{self.flow_dict["proto"]}', + f"Prediction {pred[0]} for label {label}" + f' flow {self.flow["saddr"]}:' + f'{self.flow["sport"]} -> ' + f'{self.flow["daddr"]}:' + f'{self.flow["dport"]}/' + f'{self.flow["proto"]}', 0, 2, ) From 76ae27f6a3389245e3fd6365f6176415ae1d7b61 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:22:38 +0100 Subject: [PATCH 293/455] flowml. If the dataset has one flow and that is deleted, then return empty fast. --- modules/flowmldetection/flowmldetection.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index e6ea0b5171..0fa1e4d767 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -135,6 +135,11 @@ def process_features(self, dataset): for proto in to_discard: dataset = dataset[dataset.proto != proto] + # If te proto is in the list to delete and there is only one flow, then the dataset will be empty + if dataset.empty: + # DataFrame is empty now, so return empty + return dataset + # For now, discard these to_drop = [ "appproto", From e216d5bce7de6261f5b9f4cf99d5a6212d79338d Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:23:05 +0100 Subject: [PATCH 294/455] flowml. If the datasert is empty. Return none --- modules/flowmldetection/flowmldetection.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 0fa1e4d767..5c5f9943f1 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -318,6 +318,8 @@ def process_flow(self, flow_to_process: dict): # Convert the flow to a pandas dataframe raw_flow = pd.DataFrame(flow_to_process, index=[0]) dflow = self.process_features(raw_flow) + if dflow.empty: + return None # Update the flow to the processed version return dflow except Exception: From 90e2344f104ac3bc43ad17e6c18151b7939764e2 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:27:16 +0100 Subject: [PATCH 295/455] First new version of the model and scaler. Not good yet, but working. --- modules/flowmldetection/model.bin | Bin 1124 -> 1090 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index a6648cf72179520975b0e9ad1164f7d574e87140..7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f 100644 GIT binary patch delta 130 zcmV-|0Db@D2*L;ifCQB{u>>gtTsw(FScJ$7xocP4|5 zn$$WZz^$}67u&zYKfDP4vYKo~KL^fCQCUu>>gtlhXnvSy-jcOc_je6TteGguV+y7r>O2=ii6N`@aE8 z4q_05T(v)g91VHfmFeIMvRKFpJJ~89v lBES;fQwQSX>_3x<11kt29Z*;7CnUg=yaQDN?(vfo1TSe6JFWl# diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin index 17115724b9536f6093f9d72f3b58a5c22c562a9a..bfba4d107224e5e6e5a1e8c8f4d463b48131d111 100644 GIT binary patch delta 290 zcmV+-0p0%k2KolDvjGBX0h6@>Pgl=^Tj5WVAwXGOhP~3C<3D)c8d<%4*gy{6w?@|U z%s@r+gj&Fu5J2_kY{!Qn^*`?T5Tx7$dq73=gj&E@c0d))6KVlt!ao`IN`KC-h(JOu zU`*oR6_Z;6CRfo>BRz3ectB?@J_D&+&OhD+nN>A{w?PryCGcfmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r-wJEWTtnm`Ac zf^|*Gc0elJHxDoXo Date: Thu, 20 Mar 2025 13:16:06 +0100 Subject: [PATCH 296/455] model and scaler with 1 malicious and 1 benign --- modules/flowmldetection/model.bin | Bin 1090 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f..0fac693b39f8e2f0e826471e72a52010709a2a4a 100644 GIT binary patch delta 132 zcmX@a@q~k=fn{psMix!x$(NZ_BO`RCb>5jSYK$N!g2|oC+8oRs(l>gtTsw(FScJ$7xocP4|5 zn$$WZz^$}67u&zYKfDP4vYKo~KfmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r!tJEWTtnm`Ac zf^|*Gc0elJHxDoXoPgl=^Tj5WVAwXGOhP~3C<3D)c8d<%4*gy{6w?@|U z%s@r+gj&Fu5J2_kY{!Qn^*`?T5Tx7$dq73=gj&E@c0d))6KVlt!ao`IN`KC-h(JOu zU`*oR6_Z;6CRfo>BRz3ectB?@J_D&+&OhD+nN>A{w?PryCGc Date: Thu, 20 Mar 2025 13:16:27 +0100 Subject: [PATCH 297/455] cleaner jupyter --- modules/flowmldetection/flowmldetection.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 5c5f9943f1..fe950ed4bb 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -343,6 +343,23 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "endtime", "flow_source", ] + # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. + # Error + ''' [Flow ML Detection] Error in detect() while processing + dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes + 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 + The feature names should match those that were passed during fit. + Feature names unseen at fit time: + - bytes + ''' + + # IF we delete here the filed bytes the error is + # [Flow ML Detection] Error in detect() while processing + # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes + # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 + # The feature names should match those that were passed during fit. + # Feature names must be in the same order as they were in fit. + for field in fields_to_drop: try: x_flow = x_flow.drop(field, axis=1) From b57b591133d2579418191ead001227c27d258432 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Thu, 20 Mar 2025 22:26:27 +0100 Subject: [PATCH 298/455] New models after 3rd ttrain --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 0fac693b39f8e2f0e826471e72a52010709a2a4a..5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8 100644 GIT binary patch delta 99 zcmaFD@q}YTFtfkevYaFSxkd-_3;rCozh!)2lYmpEbEdGvA?N`Qv**dmWkY#6EnbNU>V$pfHc5?(6z~90v14#>uEKZXcoY2J@~Q; zm@vT916v-#h9y9THMMtpw*Eia3Y9($*ABq#7l@Rev)(_(4lO-NtuB+I1CayN#ekDG F1TN2UERg^J diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin index 758909b289238ff282b2e056a9b3e83768b8472a..821344a0c69d116622b02e2a0daa1554cb5d308e 100644 GIT binary patch delta 43 zcmV+`0M!5b2KolDfdU!c4zU>a=p#W80?-ZtN@qVmt(3Za=p#W80?-ZtN@qVmt(3Z Date: Wed, 26 Mar 2025 00:08:50 +0100 Subject: [PATCH 299/455] Models after 4th train --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8..3ab5a240bb45f88d026d1d9d1959cfa384e2473b 100644 GIT binary patch delta 120 zcmV-;0EhqN2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv* zMA;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P as{<PghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9 ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}DxjML6cVlBO2|D9RL6T delta 290 zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRg4Lu^9H~BS8=X&<+7gXFor!l)9lJXF)RTPqWf%b3r!tJEWTtnm`Ac zf^|*Gc0elJHxDoXoty4FCWD From 259169c206001f6495880b8fcc942fd7b87878e9 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 26 Mar 2025 08:28:59 +0100 Subject: [PATCH 300/455] Models of ml flow with the first good performance in small tests --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 3ab5a240bb45f88d026d1d9d1959cfa384e2473b..a6648cf72179520975b0e9ad1164f7d574e87140 100644 GIT binary patch delta 121 zcmV-<0EYkM2;>N`Qv+C~&P*9hb`!w*mV~|wLl?l5mFM4w$NRqlOAcXmpyfaAG1(); zYA3+ulRv$a;~zkIU#E>ocI-ba#L|>%Hv~ZN`Y?b6^limYW1McwvlQsk{8#y@u delta 121 zcmV-<0EYkM2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv* zMA;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P bs{<fmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r-wJEWTtnm`Ac zf^|*Gc0elJHxDoXoPghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9 ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}DxjML6cVlBO2|D9RL6T From 0789af56c5c7b8d00382002ef30f5b5d30e9a92f Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:49:23 +0000 Subject: [PATCH 301/455] Add plot for flowml train scores --- modules/flowmldetection/plot_train_score.py | 56 +++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 modules/flowmldetection/plot_train_score.py diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py new file mode 100644 index 0000000000..0b5b5b72ba --- /dev/null +++ b/modules/flowmldetection/plot_train_score.py @@ -0,0 +1,56 @@ +import pandas as pd +import matplotlib.pyplot as plt +import re +import sys + +def plot_log_data(file_path): + # Read the log data from the file + with open(file_path, 'r') as file: + log_data = file.read() + + # Define regex pattern to extract relevant data from each line + pattern = r"Background: (\d+). Benign: (\d+). Malicious: (\d+). Total labels: (\d+\.\d+). Score: (\d+\.\d+)" + + # Parse the log file + data = re.findall(pattern, log_data) + + # Convert data to a DataFrame + df = pd.DataFrame(data, columns=["Background", "Benign", "Malicious", "Total labels", "Score"]) + df = df.astype({ + "Background": int, + "Benign": int, + "Malicious": int, + "Total labels": float, + "Score": float + }) + + # Plotting the values + fig, ax1 = plt.subplots(figsize=(10, 6)) + + # Plotting Score on the left y-axis + ax1.plot(df.index, df["Score"], label="Score", color='tab:blue') + ax1.set_xlabel('Index') + ax1.set_ylabel('Score', color='tab:blue') + ax1.tick_params(axis='y', labelcolor='tab:blue') + + # Create the second y-axis for the Total labels + ax2 = ax1.twinx() + ax2.plot(df.index, df["Total labels"], label="Total labels", color='tab:red') + ax2.set_ylabel('Total labels', color='tab:red') + ax2.tick_params(axis='y', labelcolor='tab:red') + + # Adding title and legend + plt.title('Log Data Visualization') + fig.tight_layout() + + # Save plot to a PNG file + plt.savefig('log_data_plot_with_two_scales.png') + + # Display the plot + plt.show() + +# Make sure the file path is passed as an argument +if len(sys.argv) < 2: + print("Please provide the path to the log file as a parameter.") +else: + plot_log_data(sys.argv[1]) From 6c4e7f16e84bc7d501031d7209fc3975087ef1c3 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:50:04 +0000 Subject: [PATCH 302/455] Add a log file to store the training data output --- modules/flowmldetection/flowmldetection.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index fe950ed4bb..60217ada28 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -68,12 +68,29 @@ def init(self): self.model_path = "./modules/flowmldetection/model.bin" self.scaler_path = "./modules/flowmldetection/scaler.bin" + # Initialize the training log file + self.training_log_path = "./modules/flowmldetection/training.log" + with open(self.training_log_path, "w") as log_file: + log_file.write("Training Log Initialized\n") + def read_configuration(self): conf = ConfigParser() self.mode = conf.get_ml_mode() + # This is the global label in the configuration, + # in case the flows do not have a label themselves self.label = conf.label() - def train(self): + def write_to_training_log(self, message: str): + """ + Write a message to the training log file. + """ + try: + with open(self.training_log_path, "a") as log_file: + log_file.write(message + "\n") + except Exception as e: + self.print(f"Error writing to training log: {e}", 0, 1) + + def train(self, sum_labeled_flows): """ Train a model based on the flows we receive and the labels """ From d1f4f4873e56c4a5ffea27e384d75a244c3dc717 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:50:32 +0000 Subject: [PATCH 303/455] Store data in the log file of training --- modules/flowmldetection/flowmldetection.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 60217ada28..6f732da636 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -137,9 +137,13 @@ def train(self, sum_labeled_flows): # Store the models on disk self.store_model() + # Log training information + self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}") + #self.write_to_training_log(f"Model parameters: {self.clf.coef_}") except Exception: self.print("Error in train().", 0, 1) self.print(traceback.format_exc(), 0, 1) + self.write_to_training_log("Error occurred during training.") def process_features(self, dataset): """ From 38347dcbcd0a5bd2f8f0313160d26aadb4d460aa Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:50:53 +0000 Subject: [PATCH 304/455] better comments --- modules/flowmldetection/flowmldetection.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 6f732da636..ed3aecf1b0 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -59,10 +59,9 @@ def init(self): self.minimum_labels_to_start_train = 50 # Minum amount of new labels needed to retrain self.minimum_labels_to_retrain = 50 - # The number of flows when last trained + # The number of flows when last trained. Used internally only to know + # when to retrain self.last_number_of_flows_when_trained = 0 - # To plot the scores of training - # self.scores = [] # The scaler trained during training and to use during testing self.scaler = StandardScaler() self.model_path = "./modules/flowmldetection/model.bin" From b9ff8e3090942b37c032fb535a31d6518b22fae7 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:51:30 +0000 Subject: [PATCH 305/455] Fix issue not dropping detailed labels --- modules/flowmldetection/flowmldetection.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index ed3aecf1b0..25b30cf515 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -94,23 +94,19 @@ def train(self, sum_labeled_flows): Train a model based on the flows we receive and the labels """ try: - # Get the flows from the DB - # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid) - # Convert to pandas df - # self.flows = pd.DataFrame(self.flows) - # Process the features - # X_flow = self.process_features(self.flows) - # Create X_flow with the current flows minus the label - X_flow = self.flows.drop("label", axis=1) - # Create y_flow with the label - y_flow = numpy.full(X_flow.shape[0], self.label) + X_flow = self.flows.drop("ground_truth_label", axis=1) + # Drop the detailed labels + X_flow = X_flow.drop("detailed_ground_truth_label", axis=1) # Drop the module_labels X_flow = X_flow.drop("module_labels", axis=1) + # Create y_flow with the label + y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label) # Normalize this batch of data so far. This can get progressivle slow X_flow = self.scaler.fit_transform(X_flow) + # Train try: self.clf.partial_fit( From 8da38939309e7bc3cb878b4c4c20ae2dd8bb56e1 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:51:53 +0000 Subject: [PATCH 306/455] Fix issue that not all labels sere given to the partial fit --- modules/flowmldetection/flowmldetection.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 25b30cf515..b2d0db5e51 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -109,8 +109,9 @@ def train(self, sum_labeled_flows): # Train try: + # Online incremental learning self.clf.partial_fit( - X_flow, y_flow, classes=["Malicious", "Benign"] + X_flow, y_flow, classes=["Background", "Malicious", "Benign"] ) except Exception: self.print("Error while calling clf.train()") From f1b5b683153abe35d4b28dbc03152bebfa4cb8a2 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:52:08 +0000 Subject: [PATCH 307/455] count partial labels in this epoch --- modules/flowmldetection/flowmldetection.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index b2d0db5e51..1146091a92 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -106,6 +106,12 @@ def train(self, sum_labeled_flows): # Normalize this batch of data so far. This can get progressivle slow X_flow = self.scaler.fit_transform(X_flow) + # Count the number of labels of each type in this epoc + epoch_label_counts = { + "Background": (y_flow == "Background").sum(), + "Malicious": (y_flow == "Malicious").sum(), + "Benign": (y_flow == "Benign").sum(), + } # Train try: From 84480185bdbd1eb9887b86fcc75a889e43f57964 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:55:09 +0000 Subject: [PATCH 308/455] Dont print training in screen --- modules/flowmldetection/flowmldetection.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 1146091a92..4bb2ad7dbf 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -126,15 +126,8 @@ def train(self, sum_labeled_flows): # See score so far in training score = self.clf.score(X_flow, y_flow) - # To debug the training score - # self.scores.append(score) - - self.print(f" Training Score: {score}", 0, 1) - # self.print(f' Model Parameters: {self.clf.coef_}') - - # Debug code to store a plot in a png of the scores - # plt.plot(self.scores) - # plt.savefig('train-scores.png') + #self.print(f" Training Score: {score}", 1, 0) + #self.print(f' Model Parameters: {self.clf.coef_}', 1, 0) # Store the models on disk self.store_model() From 7c2b383edbda7283716ebc5b894fd5d8fc62f7da Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:55:28 +0000 Subject: [PATCH 309/455] Add function to write to train log --- modules/flowmldetection/flowmldetection.py | 28 +++++++++++----------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 4bb2ad7dbf..d4b2762f5f 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -247,28 +247,28 @@ def process_features(self, dataset): self.print("Error in process_features()") self.print(traceback.format_exc(), 0, 1) - def process_training_flows(self): + def process_training_flows(self, last_number_of_flows_when_trained): """ - Process all the flows in the DB + Process only the new flows in the DB since the last training. Store the pandas df in self.flows """ try: + # Ensure the index is an integer + if last_number_of_flows_when_trained is None: + last_number_of_flows_when_trained = 0 + else: + last_number_of_flows_when_trained = int(last_number_of_flows_when_trained) + # We get all the flows so far - # because this retraining happens in batches flows = self.db.get_all_flows() - # Check how many different labels are in the DB - # We need both normal and malware + # Only process new flows since last training + new_flows = flows[last_number_of_flows_when_trained:] + + # Check how many **different** labels are in the DB labels = self.db.get_labels() if len(labels) == 1: - # Only 1 label has flows - # There are not enough different labels, so insert two flows - # that are fake but representative of a normal and malware flow - # they are only for the training process - # At least 1 flow of each label is required - - # These flows should be in the same format as the ones in the DB. - # Which means the satate is still SF, S0, etc. - flows.append( + # Insert fake flows for both classes if needed + new_flows.append( { "starttime": 1594417039.029793, "dur": "1.9424750804901123", From ad07f7c245eea515e4395b1216f3c564068067ae Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:57:27 +0000 Subject: [PATCH 310/455] Fix label in dummy flow --- modules/flowmldetection/flowmldetection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index d4b2762f5f..6a44422cc2 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -283,13 +283,13 @@ def process_training_flows(self, last_number_of_flows_when_trained): "sbytes": 25517, "dbytes": 17247, "appproto": "ssl", - "label": "Malicious", + "ground_truth_label": "Malicious", "module_labels": { "flowalerts-long-connection": "Malicious" }, } ) - flows.append( + new_flows.append( { "starttime": 1382355032.706468, "dur": "10.896695", From d3736905508aa9dbcfbd7044532d0aed3501db5f Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:57:39 +0000 Subject: [PATCH 311/455] Fix dummy flow --- modules/flowmldetection/flowmldetection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 6a44422cc2..20f1f8ca89 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -304,7 +304,7 @@ def process_training_flows(self, last_number_of_flows_when_trained): "sbytes": 100, "dbytes": 67596, "appproto": "http", - "label": "Benign", + "ground_truth_label": "Benign", "module_labels": { "flowalerts-long-connection": "Benign" }, From 867da84a20fb4c6b695906f94c9ba1b7b967d38d Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:58:28 +0000 Subject: [PATCH 312/455] Rename variable --- modules/flowmldetection/flowmldetection.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 20f1f8ca89..59064d61a5 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -310,10 +310,9 @@ def process_training_flows(self, last_number_of_flows_when_trained): }, } ) - # If there are enough flows, we dont insert them anymore # Convert to pandas df - df_flows = pd.DataFrame(flows) + df_flows = pd.DataFrame(new_flows) # Process features df_flows = self.process_features(df_flows) @@ -321,7 +320,6 @@ def process_training_flows(self, last_number_of_flows_when_trained): # Update the flow to the processed version self.flows = df_flows except Exception: - # Stop the timer self.print("Error in process_flows()") self.print(traceback.format_exc(), 0, 1) From aeebcbc24872621b69dd030456ccea86053e2948 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:00:32 +0000 Subject: [PATCH 313/455] Fix dummy flow label --- modules/flowmldetection/flowmldetection.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 59064d61a5..6b41b40298 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -356,6 +356,8 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "dir_", "endtime", "flow_source", + "ground_truth_label", + "detailed_ground_truth_label", ] # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. # Error From 5fef371864f1faa6d45f5ad54813dd4b5354171f Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:00:47 +0000 Subject: [PATCH 314/455] Pass values to train function --- modules/flowmldetection/flowmldetection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 6b41b40298..4d66aab855 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -521,9 +521,9 @@ def main(self): ) # Process all flows in the DB and make them ready # for pandas - self.process_training_flows() + self.process_training_flows(self.last_number_of_flows_when_trained) # Train an algorithm - self.train() + self.train(sum_labeled_flows) self.last_number_of_flows_when_trained = sum_labeled_flows elif self.mode == "test": From 3d8f125ec27114c35e5c552cbbf7c1c5d3baadb4 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:01:47 +0000 Subject: [PATCH 315/455] import os --- modules/flowmldetection/flowmldetection.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 4d66aab855..766178e127 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -10,6 +10,7 @@ import json import traceback import warnings +import os from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils From 260d6845ce3775c84f93cc6a79f04812c9ca50be Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:02:15 +0000 Subject: [PATCH 316/455] Get issue of total flows zero --- slips_files/core/database/database_manager.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index 0b805976df..b32c004a32 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -661,7 +661,8 @@ def add_software_to_profile(self, *args, **kwargs): return self.rdb.add_software_to_profile(*args, **kwargs) def get_total_flows(self, *args, **kwargs): - return int(self.rdb.get_total_flows(*args, **kwargs)) + total_flows = self.rdb.get_total_flows(*args, **kwargs) + return int(total_flows) if total_flows is not None else 0 def increment_processed_flows(self, *args, **kwargs): return self.rdb.increment_processed_flows(*args, **kwargs) From c65e8f15d3e641afe585428f2526c6f50117c791 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:02:32 +0000 Subject: [PATCH 317/455] Add comments --- slips_files/core/database/database_manager.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index b32c004a32..1d339685f8 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -888,7 +888,10 @@ def get_flow(self, *args, **kwargs): """returns the raw flow as read from the log file""" return self.sqlite.get_flow(*args, **kwargs) - def add_flow(self, flow, profileid: str, twid: str, label="benign"): + def add_flow(self, flow, profileid: str, twid: str, label="Benign"): + """ + Just in case, by default if there are no labels in the flow, we consider it Benign + """ # stores it in the db self.sqlite.add_flow(flow, profileid, twid, label=label) # handles the channels and labels etc. From 8ae122121f8d9ccca31942e3d7b7f64cd48c8bad Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:02:51 +0000 Subject: [PATCH 318/455] Rename var name to be more clear --- slips_files/core/profiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slips_files/core/profiler.py b/slips_files/core/profiler.py index 0d9b11bd27..d22069d9e6 100644 --- a/slips_files/core/profiler.py +++ b/slips_files/core/profiler.py @@ -119,7 +119,7 @@ def read_configuration(self): self.local_whitelist_path = conf.local_whitelist_path() self.timeformat = conf.ts_format() self.analysis_direction = conf.analysis_direction() - self.label = conf.label() + self.configuration_label = conf.label() self.width = conf.get_tw_width_as_float() self.client_ips: List[ Union[IPv4Network, IPv6Network, IPv4Address, IPv6Address] From 5fbe43ad6bb445795cb8a7c2317cf6b91acecfd0 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:03:10 +0000 Subject: [PATCH 319/455] Rename var name --- slips_files/core/profiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slips_files/core/profiler.py b/slips_files/core/profiler.py index d22069d9e6..e8fdf5cc56 100644 --- a/slips_files/core/profiler.py +++ b/slips_files/core/profiler.py @@ -377,7 +377,7 @@ def store_features_going_in(self, profileid: str, twid: str, flow): flow, profileid=profileid, twid=twid, - label=self.label, + label=self.configuration_label, ) self.db.mark_profile_tw_as_modified(profileid, twid, "") From 85ac73dca750a6467e9e345b2daa42ebe4dded90 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:03:31 +0000 Subject: [PATCH 320/455] Fix processeed flows being zero --- slips/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slips/main.py b/slips/main.py index b00cc8f3db..3f661c8843 100644 --- a/slips/main.py +++ b/slips/main.py @@ -414,7 +414,7 @@ def get_analyzed_flows_percentage(self) -> str: self.total_flows = self.db.get_total_flows() flows_percentage = int( - (self.db.get_processed_flows_so_far() / self.total_flows) * 100 + (self.db.get_processed_flows_so_far() / self.total_flows) * 100 if self.total_flows != 0 else 0 ) return f"Analyzed Flows: {green(flows_percentage)}{green('%')}. " From 058b603df40e65aa6dad514fbed6aaa1c9362bcb Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:03:53 +0000 Subject: [PATCH 321/455] Delete old comments --- modules/flowmldetection/flowmldetection.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 766178e127..6c3bfc1275 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -28,10 +28,6 @@ Method, ) -# Only for debbuging -# from matplotlib import pyplot as plt - - # This horrible hack is only to stop sklearn from printing those warnings def warn(*args, **kwargs): pass From ff9eff155b4989bdecf1b60d34e97f739a5510f7 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:13:22 +0000 Subject: [PATCH 322/455] Fix plots --- modules/flowmldetection/plot_train_score.py | 48 ++++++++++++++++----- 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py index 0b5b5b72ba..359df04eff 100644 --- a/modules/flowmldetection/plot_train_score.py +++ b/modules/flowmldetection/plot_train_score.py @@ -2,6 +2,8 @@ import matplotlib.pyplot as plt import re import sys +import argparse +import os def plot_log_data(file_path): # Read the log data from the file @@ -24,33 +26,59 @@ def plot_log_data(file_path): "Score": float }) + # Get the directory of the log file to store the plot in the same folder + dir_name = os.path.dirname(file_path) + plot_file = os.path.join(dir_name, 'log_data_plot_with_two_scales.png') + # Plotting the values fig, ax1 = plt.subplots(figsize=(10, 6)) - # Plotting Score on the left y-axis + # Plotting Score on the left y-axis (with proper scaling from 0 to 1) ax1.plot(df.index, df["Score"], label="Score", color='tab:blue') ax1.set_xlabel('Index') ax1.set_ylabel('Score', color='tab:blue') + ax1.set_ylim(0, 1) # Set y-axis for Score from 0 to 1 ax1.tick_params(axis='y', labelcolor='tab:blue') - # Create the second y-axis for the Total labels + # Create the second y-axis for the Background, Benign, Malicious, Total labels ax2 = ax1.twinx() + ax2.plot(df.index, df["Background"], label="Background", color='tab:green', linestyle='--') + ax2.plot(df.index, df["Benign"], label="Benign", color='tab:orange', linestyle='--') + ax2.plot(df.index, df["Malicious"], label="Malicious", color='tab:pink', linestyle='--') ax2.plot(df.index, df["Total labels"], label="Total labels", color='tab:red') - ax2.set_ylabel('Total labels', color='tab:red') + ax2.set_ylabel('Background, Benign, Malicious, Total labels', color='tab:red') + + # Set appropriate scale for right y-axis based on the data + ax2.set_ylim(0, df[["Background", "Benign", "Malicious", "Total labels"]].max().max()) ax2.tick_params(axis='y', labelcolor='tab:red') # Adding title and legend plt.title('Log Data Visualization') fig.tight_layout() - # Save plot to a PNG file - plt.savefig('log_data_plot_with_two_scales.png') + # Adding the legend with increased space for readability + ax1.legend(loc='upper left', bbox_to_anchor=(1, 1), fontsize='small') + ax2.legend(loc='upper left', bbox_to_anchor=(1, 0.7), fontsize='small') + + # Increase right margin for better readability of legend + plt.subplots_adjust(right=0.75) + + # Save plot to the same folder as the log file + plt.savefig(plot_file) # Display the plot plt.show() -# Make sure the file path is passed as an argument -if len(sys.argv) < 2: - print("Please provide the path to the log file as a parameter.") -else: - plot_log_data(sys.argv[1]) +def main(): + # Parse command-line arguments + parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.") + parser.add_argument('log_file', metavar='log_file', type=str, help="Path to the log file") + + # Handle -h / --help + args = parser.parse_args() + + # Call the function to process the log file + plot_log_data(args.log_file) + +if __name__ == "__main__": + main() From e55edf8709ac90ca8e30de4d3bf1d3d381c7ff3b Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:14:58 +0000 Subject: [PATCH 323/455] Fix plot --- modules/flowmldetection/plot_train_score.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py index 359df04eff..c7f374a7fe 100644 --- a/modules/flowmldetection/plot_train_score.py +++ b/modules/flowmldetection/plot_train_score.py @@ -40,18 +40,21 @@ def plot_log_data(file_path): ax1.set_ylim(0, 1) # Set y-axis for Score from 0 to 1 ax1.tick_params(axis='y', labelcolor='tab:blue') - # Create the second y-axis for the Background, Benign, Malicious, Total labels + # Create the second y-axis for the Background, Benign, Malicious ax2 = ax1.twinx() ax2.plot(df.index, df["Background"], label="Background", color='tab:green', linestyle='--') ax2.plot(df.index, df["Benign"], label="Benign", color='tab:orange', linestyle='--') ax2.plot(df.index, df["Malicious"], label="Malicious", color='tab:pink', linestyle='--') - ax2.plot(df.index, df["Total labels"], label="Total labels", color='tab:red') - ax2.set_ylabel('Background, Benign, Malicious, Total labels', color='tab:red') + ax2.set_ylabel('Background, Benign, Malicious', color='tab:red') # Set appropriate scale for right y-axis based on the data - ax2.set_ylim(0, df[["Background", "Benign", "Malicious", "Total labels"]].max().max()) + ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max()) ax2.tick_params(axis='y', labelcolor='tab:red') + # Annotating Total labels as text on the plot + for i, value in enumerate(df["Total labels"]): + ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom') + # Adding title and legend plt.title('Log Data Visualization') fig.tight_layout() From 5fbff61521b897f5cc047040bbe9adc54eeee126 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:16:23 +0000 Subject: [PATCH 324/455] Fix plot --- modules/flowmldetection/plot_train_score.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py index c7f374a7fe..4099c47c1e 100644 --- a/modules/flowmldetection/plot_train_score.py +++ b/modules/flowmldetection/plot_train_score.py @@ -42,10 +42,10 @@ def plot_log_data(file_path): # Create the second y-axis for the Background, Benign, Malicious ax2 = ax1.twinx() - ax2.plot(df.index, df["Background"], label="Background", color='tab:green', linestyle='--') - ax2.plot(df.index, df["Benign"], label="Benign", color='tab:orange', linestyle='--') - ax2.plot(df.index, df["Malicious"], label="Malicious", color='tab:pink', linestyle='--') - ax2.set_ylabel('Background, Benign, Malicious', color='tab:red') + ax2.plot(df.index, df["Background"], label="Background Labels", color='tab:green', linestyle='--') + ax2.plot(df.index, df["Benign"], label="Benign Labels", color='tab:orange', linestyle='--') + ax2.plot(df.index, df["Malicious"], label="Malicious Labels", color='tab:pink', linestyle='--') + ax2.set_ylabel('Background, Benign, Malicious Labels', color='tab:red') # Set appropriate scale for right y-axis based on the data ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max()) @@ -56,7 +56,7 @@ def plot_log_data(file_path): ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom') # Adding title and legend - plt.title('Log Data Visualization') + plt.title('Training performance') fig.tight_layout() # Adding the legend with increased space for readability From ff987fc2450326739b4635275f24648799f32659 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:24:43 +0000 Subject: [PATCH 325/455] Fix plot --- modules/flowmldetection/plot_train_score.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py index 4099c47c1e..8437e968ac 100644 --- a/modules/flowmldetection/plot_train_score.py +++ b/modules/flowmldetection/plot_train_score.py @@ -59,12 +59,12 @@ def plot_log_data(file_path): plt.title('Training performance') fig.tight_layout() - # Adding the legend with increased space for readability - ax1.legend(loc='upper left', bbox_to_anchor=(1, 1), fontsize='small') - ax2.legend(loc='upper left', bbox_to_anchor=(1, 0.7), fontsize='small') + # Move both legends further to the right + ax1.legend(loc='upper right', bbox_to_anchor=(1.26, 1), fontsize='small', ncol=1) + ax2.legend(loc='upper right', bbox_to_anchor=(1.4, 0.95), fontsize='small', ncol=1) # Increase right margin for better readability of legend - plt.subplots_adjust(right=0.75) + plt.subplots_adjust(right=0.7) # Save plot to the same folder as the log file plt.savefig(plot_file) From bf9d7200d01b9f941612b2f0a83e308225396ab0 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:02:34 +0000 Subject: [PATCH 326/455] Plot testing performance from a log --- .../plot_testing_performance.py | 89 +++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 modules/flowmldetection/plot_testing_performance.py diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py new file mode 100644 index 0000000000..a38c7f0598 --- /dev/null +++ b/modules/flowmldetection/plot_testing_performance.py @@ -0,0 +1,89 @@ +import matplotlib.pyplot as plt +import sys +import numpy as np + +def process_file(file_path): + # Initialize the counters for the values + FPR_values = [] + FNR_values = [] + TNR_values = [] + TPR_values = [] + F1_values = [] + accuracy_values = [] + precision_values = [] + MCC_values = [] + recall_values = [] + + # Read the file and extract the data + with open(file_path, 'r') as file: + for line in file: + if "TP:" in line: + # Extract the values from the line + parts = line.split(',') + TP = int(parts[0].split(':')[1].strip()) + TN = int(parts[1].split(':')[1].strip()) + FP = int(parts[2].split(':')[1].strip()) + FN = int(parts[3].split(':')[1].strip()) + + # Calculate metrics + FPR = FP / (FP + TN) if (FP + TN) != 0 else 0 + FNR = FN / (FN + TP) if (FN + TP) != 0 else 0 + TNR = TN / (TN + FP) if (TN + FP) != 0 else 0 + TPR = TP / (TP + FN) if (TP + FN) != 0 else 0 + Precision = TP / (TP + FP) if (TP + FP) != 0 else 0 + Recall = TPR # Recall is the same as TPR + F1 = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) != 0 else 0 + Accuracy = (TP + TN) / (TP + TN + FP + FN) + MCC = ((TP * TN) - (FP * FN)) / np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) if ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) != 0 else 0 + + # Append the values to the respective lists + FPR_values.append(FPR) + FNR_values.append(FNR) + TNR_values.append(TNR) + TPR_values.append(TPR) + F1_values.append(F1) + accuracy_values.append(Accuracy) + precision_values.append(Precision) + MCC_values.append(MCC) + recall_values.append(Recall) + + return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values + +def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values): + # Create the plot + plt.figure(figsize=(12, 8)) + + # Plot each metric + plt.plot(FPR_values, label='False Positive Rate (FPR)', marker='o') + plt.plot(FNR_values, label='False Negative Rate (FNR)', marker='o') + plt.plot(TNR_values, label='True Negative Rate (TNR)', marker='o') + plt.plot(TPR_values, label='True Positive Rate (TPR)', marker='o') + plt.plot(F1_values, label='F1 Score', marker='o') + plt.plot(accuracy_values, label='Accuracy', marker='o') + plt.plot(precision_values, label='Precision', marker='o') + plt.plot(MCC_values, label='Matthews Correlation Coefficient (MCC)', marker='o') + plt.plot(recall_values, label='Recall (TPR)', marker='o') + + # Add labels and title + plt.xlabel('Index') + plt.ylabel('Metric Value') + plt.title('Evaluation Metrics Over Time') + + # Add a legend + plt.legend() + + # Save the plot as a PNG file + plt.savefig('metrics_plot.png') + plt.close() + +def main(): + if len(sys.argv) != 2: + print("Usage: python script.py ") + sys.exit(1) + + file_path = sys.argv[1] + FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path) + plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values) + +if __name__ == "__main__": + main() From f146fbf84544323511db94d721e971b6da33ad0f Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:04:32 +0000 Subject: [PATCH 327/455] Fix the plot --- modules/flowmldetection/plot_testing_performance.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index a38c7f0598..fac0acd64a 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -64,16 +64,19 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu plt.plot(MCC_values, label='Matthews Correlation Coefficient (MCC)', marker='o') plt.plot(recall_values, label='Recall (TPR)', marker='o') + # Set logarithmic scale on the y-axis + plt.yscale('log') + # Add labels and title plt.xlabel('Index') - plt.ylabel('Metric Value') - plt.title('Evaluation Metrics Over Time') + plt.ylabel('Metric Value (Log Scale)') + plt.title('Evaluation Metrics Over Time (Log Scale)') # Add a legend plt.legend() # Save the plot as a PNG file - plt.savefig('metrics_plot.png') + plt.savefig('metrics_plot_log_scale.png') plt.close() def main(): From 37bf4f6a0c187b76a443c3f1f855f0278da65065 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:12:40 +0000 Subject: [PATCH 328/455] Fix the plots --- .../plot_testing_performance.py | 76 ++++++++++++++----- 1 file changed, 55 insertions(+), 21 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index fac0acd64a..5581c72cd4 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -50,33 +50,66 @@ def process_file(file_path): return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values): - # Create the plot - plt.figure(figsize=(12, 8)) + # Separate the values into two groups based on their proximity to 0 or 1 + close_to_0 = { + 'FPR': [], 'FNR': [] + } + close_to_1 = { + 'TNR': [], 'TPR': [], 'F1': [], 'accuracy': [], 'precision': [], 'MCC': [], 'recall': [] + } - # Plot each metric - plt.plot(FPR_values, label='False Positive Rate (FPR)', marker='o') - plt.plot(FNR_values, label='False Negative Rate (FNR)', marker='o') - plt.plot(TNR_values, label='True Negative Rate (TNR)', marker='o') - plt.plot(TPR_values, label='True Positive Rate (TPR)', marker='o') - plt.plot(F1_values, label='F1 Score', marker='o') - plt.plot(accuracy_values, label='Accuracy', marker='o') - plt.plot(precision_values, label='Precision', marker='o') - plt.plot(MCC_values, label='Matthews Correlation Coefficient (MCC)', marker='o') - plt.plot(recall_values, label='Recall (TPR)', marker='o') + # Categorize the metrics into two groups + for i in range(len(FPR_values)): + close_to_0['FPR'].append(FPR_values[i]) + close_to_0['FNR'].append(FNR_values[i]) + + close_to_1['TNR'].append(TNR_values[i]) + close_to_1['TPR'].append(TPR_values[i]) + close_to_1['F1'].append(F1_values[i]) + close_to_1['accuracy'].append(accuracy_values[i]) + close_to_1['precision'].append(precision_values[i]) + close_to_1['MCC'].append(MCC_values[i]) + close_to_1['recall'].append(recall_values[i]) + + # Plot metrics for values close to 0 + plot_single_group(close_to_0, 'metrics_plot_close_to_0.png') - # Set logarithmic scale on the y-axis - plt.yscale('log') + # Plot metrics for values close to 1 + plot_single_group(close_to_1, 'metrics_plot_close_to_1.png') + +def plot_single_group(metrics_dict, output_filename): + plt.figure(figsize=(12, 8)) - # Add labels and title + # Only plot the metrics that exist in the dictionary + if 'FPR' in metrics_dict: + plt.plot(metrics_dict['FPR'], label='False Positive Rate (FPR)', marker='o') + if 'FNR' in metrics_dict: + plt.plot(metrics_dict['FNR'], label='False Negative Rate (FNR)', marker='o') + if 'TNR' in metrics_dict: + plt.plot(metrics_dict['TNR'], label='True Negative Rate (TNR)', marker='o') + if 'TPR' in metrics_dict: + plt.plot(metrics_dict['TPR'], label='True Positive Rate (TPR)', marker='o') + if 'F1' in metrics_dict: + plt.plot(metrics_dict['F1'], label='F1 Score', marker='o') + if 'accuracy' in metrics_dict: + plt.plot(metrics_dict['accuracy'], label='Accuracy', marker='o') + if 'precision' in metrics_dict: + plt.plot(metrics_dict['precision'], label='Precision', marker='o') + if 'MCC' in metrics_dict: + plt.plot(metrics_dict['MCC'], label='Matthews Correlation Coefficient (MCC)', marker='o') + if 'recall' in metrics_dict: + plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o') + + # Apply log scale by default + plt.yscale('log') + plt.xlabel('Index') - plt.ylabel('Metric Value (Log Scale)') - plt.title('Evaluation Metrics Over Time (Log Scale)') - - # Add a legend + plt.ylabel('Metric Value') + plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})') plt.legend() - # Save the plot as a PNG file - plt.savefig('metrics_plot_log_scale.png') + # Save the plot + plt.savefig(output_filename) plt.close() def main(): @@ -85,6 +118,7 @@ def main(): sys.exit(1) file_path = sys.argv[1] + FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path) plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values) From 5936fc882ebfb7a8e82c4b8696891d6ead982194 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:16:50 +0000 Subject: [PATCH 329/455] Fix plot --- .../plot_testing_performance.py | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index 5581c72cd4..8f9e12cd86 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -72,12 +72,24 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu close_to_1['recall'].append(recall_values[i]) # Plot metrics for values close to 0 - plot_single_group(close_to_0, 'metrics_plot_close_to_0.png') + plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True) # Plot metrics for values close to 1 plot_single_group(close_to_1, 'metrics_plot_close_to_1.png') -def plot_single_group(metrics_dict, output_filename): + # Print the final values + print("\nFinal Metric Values:") + print(f"Final FPR: {FPR_values[-1]:.4f}") + print(f"Final FNR: {FNR_values[-1]:.4f}") + print(f"Final TNR: {TNR_values[-1]:.4f}") + print(f"Final TPR: {TPR_values[-1]:.4f}") + print(f"Final F1 Score: {F1_values[-1]:.4f}") + print(f"Final Accuracy: {accuracy_values[-1]:.4f}") + print(f"Final Precision: {precision_values[-1]:.4f}") + print(f"Final MCC: {MCC_values[-1]:.4f}") + print(f"Final Recall: {recall_values[-1]:.4f}") + +def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): plt.figure(figsize=(12, 8)) # Only plot the metrics that exist in the dictionary @@ -103,6 +115,12 @@ def plot_single_group(metrics_dict, output_filename): # Apply log scale by default plt.yscale('log') + # If the plot is close to 0, set custom ticks + if is_close_to_0: + # Manually set more Y-ticks for better visibility + plt.ylim(0.0001, 1) # Set Y-axis limits between 0.0001 and 1 + plt.yticks([0.0001, 0.001, 0.01, 0.1, 1], ['0.0001', '0.001', '0.01', '0.1', '1']) # Adjust Y-ticks + plt.xlabel('Index') plt.ylabel('Metric Value') plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})') From bfc10bea2cf0ec9e6ce3f2a66484cd023f58e4ad Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:20:22 +0000 Subject: [PATCH 330/455] Fix plots --- modules/flowmldetection/flowmldetection.py | 709 +++++---------------- 1 file changed, 143 insertions(+), 566 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 6c3bfc1275..37f0761109 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -1,566 +1,143 @@ -# SPDX-FileCopyrightText: 2021 Sebastian Garcia -from typing import Optional - -# SPDX-License-Identifier: GPL-2.0-only -import numpy -from sklearn.linear_model import SGDClassifier -from sklearn.preprocessing import StandardScaler -import pickle -import pandas as pd -import json -import traceback -import warnings -import os - -from slips_files.common.parsers.config_parser import ConfigParser -from slips_files.common.slips_utils import utils -from slips_files.common.abstracts.module import IModule -from slips_files.core.structures.evidence import ( - Evidence, - ProfileID, - TimeWindow, - Attacker, - ThreatLevel, - EvidenceType, - IoCType, - Direction, - Victim, - Method, -) - -# This horrible hack is only to stop sklearn from printing those warnings -def warn(*args, **kwargs): - pass - - -warnings.warn = warn - - -class FlowMLDetection(IModule): - # Name: short name of the module. Do not use spaces - name = "Flow ML Detection" - description = ( - "Train or test a Machine Learning model to detect malicious flows" - ) - authors = ["Sebastian Garcia"] - - def init(self): - # Subscribe to the channel - self.c1 = self.db.subscribe("new_flow") - self.channels = {"new_flow": self.c1} - self.fieldseparator = self.db.get_field_separator() - # Set the output queue of our database instance - # Read the configuration - self.read_configuration() - # Minum amount of new labels needed to start the train - self.minimum_labels_to_start_train = 50 - # Minum amount of new labels needed to retrain - self.minimum_labels_to_retrain = 50 - # The number of flows when last trained. Used internally only to know - # when to retrain - self.last_number_of_flows_when_trained = 0 - # The scaler trained during training and to use during testing - self.scaler = StandardScaler() - self.model_path = "./modules/flowmldetection/model.bin" - self.scaler_path = "./modules/flowmldetection/scaler.bin" - - # Initialize the training log file - self.training_log_path = "./modules/flowmldetection/training.log" - with open(self.training_log_path, "w") as log_file: - log_file.write("Training Log Initialized\n") - - def read_configuration(self): - conf = ConfigParser() - self.mode = conf.get_ml_mode() - # This is the global label in the configuration, - # in case the flows do not have a label themselves - self.label = conf.label() - - def write_to_training_log(self, message: str): - """ - Write a message to the training log file. - """ - try: - with open(self.training_log_path, "a") as log_file: - log_file.write(message + "\n") - except Exception as e: - self.print(f"Error writing to training log: {e}", 0, 1) - - def train(self, sum_labeled_flows): - """ - Train a model based on the flows we receive and the labels - """ - try: - # Create X_flow with the current flows minus the label - X_flow = self.flows.drop("ground_truth_label", axis=1) - # Drop the detailed labels - X_flow = X_flow.drop("detailed_ground_truth_label", axis=1) - # Drop the module_labels - X_flow = X_flow.drop("module_labels", axis=1) - # Create y_flow with the label - y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label) - - # Normalize this batch of data so far. This can get progressivle slow - X_flow = self.scaler.fit_transform(X_flow) - - # Count the number of labels of each type in this epoc - epoch_label_counts = { - "Background": (y_flow == "Background").sum(), - "Malicious": (y_flow == "Malicious").sum(), - "Benign": (y_flow == "Benign").sum(), - } - - # Train - try: - # Online incremental learning - self.clf.partial_fit( - X_flow, y_flow, classes=["Background", "Malicious", "Benign"] - ) - except Exception: - self.print("Error while calling clf.train()") - self.print(traceback.format_exc(), 0, 1) - - # See score so far in training - score = self.clf.score(X_flow, y_flow) - - #self.print(f" Training Score: {score}", 1, 0) - #self.print(f' Model Parameters: {self.clf.coef_}', 1, 0) - - # Store the models on disk - self.store_model() - - # Log training information - self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}") - #self.write_to_training_log(f"Model parameters: {self.clf.coef_}") - except Exception: - self.print("Error in train().", 0, 1) - self.print(traceback.format_exc(), 0, 1) - self.write_to_training_log("Error occurred during training.") - - def process_features(self, dataset): - """ - Discards some features of the dataset and can create new. - Clean the dataset - """ - try: - # Discard some type of flows that dont have ports - to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""] - for proto in to_discard: - dataset = dataset[dataset.proto != proto] - - # If te proto is in the list to delete and there is only one flow, then the dataset will be empty - if dataset.empty: - # DataFrame is empty now, so return empty - return dataset - - # For now, discard these - to_drop = [ - "appproto", - "daddr", - "saddr", - "starttime", - "type_", - "smac", - "dmac", - "history", - "uid", - "dir_", - "endtime", - "flow_source", - ] - for field in to_drop: - try: - dataset = dataset.drop(field, axis=1) - except (ValueError, KeyError): - pass - - # When flows are read from Slips sqlite, - # the state is not transformed to 'Established' or - # 'Not Established', it is still 'S0' and others - # So transform here - dataset["state"] = dataset.apply( - lambda row: self.db.get_final_state_from_flags( - row["state"], (row["spkts"] + row["dpkts"]) - ), - axis=1, - ) - - # Convert state to categorical - dataset.state = dataset.state.str.replace( - r"(^.*Not Established.*$)", "0", regex=True - ) - dataset.state = dataset.state.str.replace( - r"(^.*Established.*$)", "1", regex=True - ) - - # Convert categories to floats - dataset.state = dataset.state.astype("float64") - - # Convert proto to categorical. For now we only have few states, so we can hardcode... - # We dont use the data to create categories because in testing mode - # we dont see all the protocols - # Also we dont store the Categorizer because the user can retrain - # with its own data. - dataset.proto = dataset.proto.str.lower() - dataset.proto = dataset.proto.str.replace( - r"(^.*tcp.*$)", "0", regex=True - ) - dataset.proto = dataset.proto.str.replace( - r"(^.*udp.*$)", "1", regex=True - ) - dataset.proto = dataset.proto.str.replace( - r"(^.*icmp.*$)", "2", regex=True - ) - dataset.proto = dataset.proto.str.replace( - r"(^.*icmp-ipv6.*$)", "3", regex=True - ) - dataset.proto = dataset.proto.str.replace( - r"(^.*arp.*$)", "4", regex=True - ) - - dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"] - dataset["pkts"] = dataset["spkts"] + dataset["dpkts"] - - fields_to_convert_to_float = [ - dataset.proto, - dataset.dport, - dataset.sport, - dataset.dur, - dataset.pkts, - dataset.spkts, - dataset.allbytes, - dataset.sbytes, - dataset.state, - ] - for field in fields_to_convert_to_float: - try: - field = field.astype("float64") - except (ValueError, AttributeError): - pass - - return dataset - except Exception: - # Stop the timer - self.print("Error in process_features()") - self.print(traceback.format_exc(), 0, 1) - - def process_training_flows(self, last_number_of_flows_when_trained): - """ - Process only the new flows in the DB since the last training. - Store the pandas df in self.flows - """ - try: - # Ensure the index is an integer - if last_number_of_flows_when_trained is None: - last_number_of_flows_when_trained = 0 - else: - last_number_of_flows_when_trained = int(last_number_of_flows_when_trained) - - # We get all the flows so far - flows = self.db.get_all_flows() - # Only process new flows since last training - new_flows = flows[last_number_of_flows_when_trained:] - - # Check how many **different** labels are in the DB - labels = self.db.get_labels() - if len(labels) == 1: - # Insert fake flows for both classes if needed - new_flows.append( - { - "starttime": 1594417039.029793, - "dur": "1.9424750804901123", - "saddr": "10.7.10.101", - "sport": "49733", - "daddr": "40.70.224.145", - "dport": "443", - "proto": "tcp", - "state": "SF", - "spkts": 17, - "dpkts": 27, - "sbytes": 25517, - "dbytes": 17247, - "appproto": "ssl", - "ground_truth_label": "Malicious", - "module_labels": { - "flowalerts-long-connection": "Malicious" - }, - } - ) - new_flows.append( - { - "starttime": 1382355032.706468, - "dur": "10.896695", - "saddr": "147.32.83.52", - "sport": "47956", - "daddr": "80.242.138.72", - "dport": "80", - "proto": "tcp", - "state": "SF", - "spkts": 1, - "dpkts": 0, - "sbytes": 100, - "dbytes": 67596, - "appproto": "http", - "ground_truth_label": "Benign", - "module_labels": { - "flowalerts-long-connection": "Benign" - }, - } - ) - - # Convert to pandas df - df_flows = pd.DataFrame(new_flows) - - # Process features - df_flows = self.process_features(df_flows) - - # Update the flow to the processed version - self.flows = df_flows - except Exception: - self.print("Error in process_flows()") - self.print(traceback.format_exc(), 0, 1) - - def process_flow(self, flow_to_process: dict): - """ - Process one flow. Only used during detection in testing - returns the pandas df with the processed flow - """ - try: - # Convert the flow to a pandas dataframe - raw_flow = pd.DataFrame(flow_to_process, index=[0]) - dflow = self.process_features(raw_flow) - if dflow.empty: - return None - # Update the flow to the processed version - return dflow - except Exception: - # Stop the timer - self.print("Error in process_flow()") - self.print(traceback.format_exc(), 0, 1) - - def detect(self, x_flow) -> Optional[numpy.ndarray]: - """ - Detects the given flow with the current model stored - and returns the predection array - """ - try: - # clean the flow - fields_to_drop = [ - "label", - "module_labels", - "uid", - "history", - "dir_", - "endtime", - "flow_source", - "ground_truth_label", - "detailed_ground_truth_label", - ] - # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. - # Error - ''' [Flow ML Detection] Error in detect() while processing - dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes - 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 - The feature names should match those that were passed during fit. - Feature names unseen at fit time: - - bytes - ''' - - # IF we delete here the filed bytes the error is - # [Flow ML Detection] Error in detect() while processing - # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes - # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 - # The feature names should match those that were passed during fit. - # Feature names must be in the same order as they were in fit. - - for field in fields_to_drop: - try: - x_flow = x_flow.drop(field, axis=1) - except (KeyError, ValueError): - pass - # Scale the flow - x_flow: numpy.ndarray = self.scaler.transform(x_flow) - pred: numpy.ndarray = self.clf.predict(x_flow) - return pred - except Exception as e: - self.print( - f"Error in detect() while processing " f"\n{x_flow}\n{e}" - ) - self.print(traceback.format_exc(), 0, 1) - - def store_model(self): - """ - Store the trained model on disk - """ - self.print("Storing the trained model and scaler on disk.", 0, 2) - with open(self.model_path, "wb") as f: - data = pickle.dumps(self.clf) - f.write(data) - with open(self.scaler_path, "wb") as g: - data = pickle.dumps(self.scaler) - g.write(data) - - def read_model(self): - """ - Read the trained model from disk - """ - try: - self.print("Reading the trained model from disk.", 0, 2) - with open(self.model_path, "rb") as f: - self.clf = pickle.load(f) - self.print("Reading the trained scaler from disk.", 0, 2) - with open(self.scaler_path, "rb") as g: - self.scaler = pickle.load(g) - except FileNotFoundError: - # If there is no model, create one empty - self.print( - "There was no model. " "Creating a new empty model.", 0, 2 - ) - self.clf = SGDClassifier( - warm_start=True, loss="hinge", penalty="l1" - ) - except EOFError: - self.print( - "Error reading model from disk. " - "Creating a new empty model.", - 0, - 2, - ) - self.clf = SGDClassifier( - warm_start=True, loss="hinge", penalty="l1" - ) - - def set_evidence_malicious_flow(self, flow: dict, twid: str): - confidence: float = 0.1 - description = ( - f"Flow with malicious characteristics by ML. Src IP" - f" {flow['saddr']}:{flow['sport']} to " - f"{flow['daddr']}:{flow['dport']}" - ) - twid_number = int(twid.replace("timewindow", "")) - evidence: Evidence = Evidence( - evidence_type=EvidenceType.MALICIOUS_FLOW, - attacker=Attacker( - direction=Direction.SRC, - ioc_type=IoCType.IP, - value=flow["saddr"], - ), - victim=Victim( - direction=Direction.DST, - ioc_type=IoCType.IP, - value=flow["daddr"], - ), - threat_level=ThreatLevel.LOW, - confidence=confidence, - description=description, - profile=ProfileID(ip=flow["saddr"]), - timewindow=TimeWindow(twid_number), - uid=[flow["uid"]], - timestamp=flow["starttime"], - method=Method.AI, - src_port=flow["sport"], - dst_port=flow["dport"], - ) - - self.db.set_evidence(evidence) - - def shutdown_gracefully(self): - # Confirm that the module is done processing - if self.mode == "train": - self.store_model() - - def pre_main(self): - utils.drop_root_privs() - # Load the model - self.read_model() - - def main(self): - if msg := self.get_msg("new_flow"): - # When a new flow arrives - msg = json.loads(msg["data"]) - self.twid = msg["twid"] - self.profileid = msg["profileid"] - self.flow = msg["flow"] - # These following extra fields are expected in testing. update the original - # flow dict to have them - self.flow.update( - { - "state": msg["interpreted_state"], - "label": msg["label"], - "module_labels": msg["module_labels"], - } - ) - - if self.mode == "train": - # We are training - - # Is the amount in the DB of labels enough to retrain? - # Use labeled flows - labels = self.db.get_labels() - sum_labeled_flows = sum(i[1] for i in labels) - - # The min labels to retrain is the min number of flows - # we should have seen so far in this capture to start training - # This is so we dont _start_ training with only 1 flow - - # Once we are over the start minimum, the second condition is - # to force to retrain every a minimum_labels_to_retrain number - # of flows. So we dont retrain every 1 flow. - if ( - sum_labeled_flows >= self.minimum_labels_to_start_train - ): - if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain): - # So for example we retrain every 50 labels and only when - # we have at least 50 labels - self.print( - f"Training the model with the last group of " - f"flows and labels. Total flows: {sum_labeled_flows}." - ) - # Process all flows in the DB and make them ready - # for pandas - self.process_training_flows(self.last_number_of_flows_when_trained) - # Train an algorithm - self.train(sum_labeled_flows) - self.last_number_of_flows_when_trained = sum_labeled_flows - - elif self.mode == "test": - # We are testing, which means using the model to detect - processed_flow = self.process_flow(self.flow) - - # After processing the flow, it may happen that we - # delete icmp/arp/etc so the dataframe can be empty - if processed_flow is not None and not processed_flow.empty: - # Predict - pred: numpy.ndarray = self.detect(processed_flow) - if not pred: - # an error occurred - return - - label = self.flow["label"] - if label and label != "unknown" and label != pred[0]: - # If the user specified a label in test mode, - # and the label is diff from the prediction, - # print in debug mode - self.print( - f"Predicted {pred[0]} for ground-truth label" - f' {label}. Flow {self.flow["saddr"]}:' - f'{self.flow["sport"]} ->' - f' {self.flow["daddr"]}:' - f'{self.flow["dport"]}/' - f'{self.flow["proto"]}', - 0, - 3, - ) - if pred[0] == "Malicious": - # Generate an alert - self.set_evidence_malicious_flow(self.flow, self.twid) - self.print( - f"Prediction {pred[0]} for label {label}" - f' flow {self.flow["saddr"]}:' - f'{self.flow["sport"]} -> ' - f'{self.flow["daddr"]}:' - f'{self.flow["dport"]}/' - f'{self.flow["proto"]}', - 0, - 2, - ) +import matplotlib.pyplot as plt +import sys +import numpy as np + +def process_file(file_path): + # Initialize the counters for the values + FPR_values = [] + FNR_values = [] + TNR_values = [] + TPR_values = [] + F1_values = [] + accuracy_values = [] + precision_values = [] + MCC_values = [] + recall_values = [] + + # Read the file and extract the data + with open(file_path, 'r') as file: + for line in file: + if "TP:" in line: + # Extract the values from the line + parts = line.split(',') + TP = int(parts[0].split(':')[1].strip()) + TN = int(parts[1].split(':')[1].strip()) + FP = int(parts[2].split(':')[1].strip()) + FN = int(parts[3].split(':')[1].strip()) + + # Calculate metrics + FPR = FP / (FP + TN) if (FP + TN) != 0 else 0 + FNR = FN / (FN + TP) if (FN + TP) != 0 else 0 + TNR = TN / (TN + FP) if (TN + FP) != 0 else 0 + TPR = TP / (TP + FN) if (TP + FN) != 0 else 0 + Precision = TP / (TP + FP) if (TP + FP) != 0 else 0 + Recall = TPR # Recall is the same as TPR + F1 = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) != 0 else 0 + Accuracy = (TP + TN) / (TP + TN + FP + FN) + MCC = ((TP * TN) - (FP * FN)) / np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) if ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) != 0 else 0 + + # Append the values to the respective lists + FPR_values.append(FPR) + FNR_values.append(FNR) + TNR_values.append(TNR) + TPR_values.append(TPR) + F1_values.append(F1) + accuracy_values.append(Accuracy) + precision_values.append(Precision) + MCC_values.append(MCC) + recall_values.append(Recall) + + return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values + +def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values): + # Separate the values into two groups based on their proximity to 0 or 1 + close_to_0 = { + 'FPR': [], 'FNR': [] + } + close_to_1 = { + 'TNR': [], 'TPR': [], 'F1': [], 'accuracy': [], 'precision': [], 'MCC': [], 'recall': [] + } + + # Categorize the metrics into two groups + for i in range(len(FPR_values)): + close_to_0['FPR'].append(FPR_values[i]) + close_to_0['FNR'].append(FNR_values[i]) + + close_to_1['TNR'].append(TNR_values[i]) + close_to_1['TPR'].append(TPR_values[i]) + close_to_1['F1'].append(F1_values[i]) + close_to_1['accuracy'].append(accuracy_values[i]) + close_to_1['precision'].append(precision_values[i]) + close_to_1['MCC'].append(MCC_values[i]) + close_to_1['recall'].append(recall_values[i]) + + # Plot metrics for values close to 0 + plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True) + + # Plot metrics for values close to 1 + plot_single_group(close_to_1, 'metrics_plot_close_to_1.png') + + # Print the final values + print("\nFinal Metric Values:") + print(f"Final FPR: {FPR_values[-1]:.4f}") + print(f"Final FNR: {FNR_values[-1]:.4f}") + print(f"Final TNR: {TNR_values[-1]:.4f}") + print(f"Final TPR: {TPR_values[-1]:.4f}") + print(f"Final F1 Score: {F1_values[-1]:.4f}") + print(f"Final Accuracy: {accuracy_values[-1]:.4f}") + print(f"Final Precision: {precision_values[-1]:.4f}") + print(f"Final MCC: {MCC_values[-1]:.4f}") + print(f"Final Recall: {recall_values[-1]:.4f}") + +def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): + plt.figure(figsize=(12, 8)) + + # Only plot the metrics that exist in the dictionary + if 'FPR' in metrics_dict: + plt.plot(metrics_dict['FPR'], label='False Positive Rate (FPR)', marker='o') + if 'FNR' in metrics_dict: + plt.plot(metrics_dict['FNR'], label='False Negative Rate (FNR)', marker='o') + if 'TNR' in metrics_dict: + plt.plot(metrics_dict['TNR'], label='True Negative Rate (TNR)', marker='o') + if 'TPR' in metrics_dict: + plt.plot(metrics_dict['TPR'], label='True Positive Rate (TPR)', marker='o') + if 'F1' in metrics_dict: + plt.plot(metrics_dict['F1'], label='F1 Score', marker='o') + if 'accuracy' in metrics_dict: + plt.plot(metrics_dict['accuracy'], label='Accuracy', marker='o') + if 'precision' in metrics_dict: + plt.plot(metrics_dict['precision'], label='Precision', marker='o') + if 'MCC' in metrics_dict: + plt.plot(metrics_dict['MCC'], label='Matthews Correlation Coefficient (MCC)', marker='o') + if 'recall' in metrics_dict: + plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o') + + # Apply log scale by default + plt.yscale('log') + + # If the plot is close to 0, set custom ticks + if is_close_to_0: + # Add more ticks between 0 and 1 (using a logarithmic scale) + plt.yticks([0.01, 0.1, 1, 10, 100], ['0.01', '0.1', '1', '10', '100']) + + plt.xlabel('Index') + plt.ylabel('Metric Value') + plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})') + plt.legend() + + # Save the plot + plt.savefig(output_filename) + plt.close() + +def main(): + if len(sys.argv) != 2: + print("Usage: python script.py ") + sys.exit(1) + + file_path = sys.argv[1] + + FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path) + plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values) + +if __name__ == "__main__": + main() From 672a109958264697c25f80d7a25881c93752ce2e Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:20:52 +0000 Subject: [PATCH 331/455] Fix plots --- .../plot_testing_performance.py | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index 8f9e12cd86..69b8c96a8c 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -71,11 +71,11 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu close_to_1['MCC'].append(MCC_values[i]) close_to_1['recall'].append(recall_values[i]) - # Plot metrics for values close to 0 + # Plot metrics for values close to 0 (linear scale) plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True) - # Plot metrics for values close to 1 - plot_single_group(close_to_1, 'metrics_plot_close_to_1.png') + # Plot metrics for values close to 1 (log scale) + plot_single_group(close_to_1, 'metrics_plot_close_to_1.png', is_close_to_0=False) # Print the final values print("\nFinal Metric Values:") @@ -112,14 +112,21 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): if 'recall' in metrics_dict: plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o') - # Apply log scale by default - plt.yscale('log') + # If the plot is close to 1, apply log scale + if not is_close_to_0: + plt.yscale('log') - # If the plot is close to 0, set custom ticks + # If the plot is close to 0, set dynamic Y-ticks based on the min/max values of the series if is_close_to_0: - # Manually set more Y-ticks for better visibility - plt.ylim(0.0001, 1) # Set Y-axis limits between 0.0001 and 1 - plt.yticks([0.0001, 0.001, 0.01, 0.1, 1], ['0.0001', '0.001', '0.01', '0.1', '1']) # Adjust Y-ticks + min_val = min(min(metrics_dict['FPR']), min(metrics_dict['FNR'])) + max_val = max(max(metrics_dict['FPR']), max(metrics_dict['FNR'])) + + # Avoid log(0), so set the minimum limit a little higher than zero + if min_val == 0: + min_val = 1e-4 # Avoid zero values on the logarithmic scale + + plt.ylim(min_val, max_val) # Set Y-axis limits based on the data range + plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6)) # Set ticks logarithmically plt.xlabel('Index') plt.ylabel('Metric Value') From aa87ed17add17251345579b8963bda7230043c6b Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:23:02 +0000 Subject: [PATCH 332/455] Fix plots --- modules/flowmldetection/plot_testing_performance.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index 69b8c96a8c..de4ada38b3 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -123,10 +123,10 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): # Avoid log(0), so set the minimum limit a little higher than zero if min_val == 0: - min_val = 1e-4 # Avoid zero values on the logarithmic scale + min_val = 1e-8 # Avoid zero values on the logarithmic scale plt.ylim(min_val, max_val) # Set Y-axis limits based on the data range - plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6)) # Set ticks logarithmically + plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=60)) # Set ticks logarithmically plt.xlabel('Index') plt.ylabel('Metric Value') From 148181f2d4f0d08df508dc85b545c5a18f2a6c3b Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:25:58 +0000 Subject: [PATCH 333/455] Change plot names --- modules/flowmldetection/plot_testing_performance.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index de4ada38b3..1b4152c6eb 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -72,10 +72,10 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu close_to_1['recall'].append(recall_values[i]) # Plot metrics for values close to 0 (linear scale) - plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True) + plot_single_group(close_to_0, 'performance_metrics_testing_close_to_0.png', is_close_to_0=True) # Plot metrics for values close to 1 (log scale) - plot_single_group(close_to_1, 'metrics_plot_close_to_1.png', is_close_to_0=False) + plot_single_group(close_to_1, 'performnace_metrics_teting_close_to_1.png', is_close_to_0=False) # Print the final values print("\nFinal Metric Values:") From 057beb3ae401f31c605fe6845957090faec1e195 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:26:09 +0000 Subject: [PATCH 334/455] Rename file --- .../{plot_train_score.py => plot_train_performance.py} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename modules/flowmldetection/{plot_train_score.py => plot_train_performance.py} (97%) diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_performance.py similarity index 97% rename from modules/flowmldetection/plot_train_score.py rename to modules/flowmldetection/plot_train_performance.py index 8437e968ac..80e13e9515 100644 --- a/modules/flowmldetection/plot_train_score.py +++ b/modules/flowmldetection/plot_train_performance.py @@ -28,7 +28,7 @@ def plot_log_data(file_path): # Get the directory of the log file to store the plot in the same folder dir_name = os.path.dirname(file_path) - plot_file = os.path.join(dir_name, 'log_data_plot_with_two_scales.png') + plot_file = os.path.join(dir_name, 'performance_metrics_training.png') # Plotting the values fig, ax1 = plt.subplots(figsize=(10, 6)) From f8aa2eb76ccca709d051497f7ca76b8316de4a47 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:31:32 +0000 Subject: [PATCH 335/455] Recover good flowmldetection deleted by mistake --- modules/flowmldetection/flowmldetection.py | 709 ++++++++++++++++----- 1 file changed, 566 insertions(+), 143 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 37f0761109..5e4e9aa462 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -1,143 +1,566 @@ -import matplotlib.pyplot as plt -import sys -import numpy as np - -def process_file(file_path): - # Initialize the counters for the values - FPR_values = [] - FNR_values = [] - TNR_values = [] - TPR_values = [] - F1_values = [] - accuracy_values = [] - precision_values = [] - MCC_values = [] - recall_values = [] - - # Read the file and extract the data - with open(file_path, 'r') as file: - for line in file: - if "TP:" in line: - # Extract the values from the line - parts = line.split(',') - TP = int(parts[0].split(':')[1].strip()) - TN = int(parts[1].split(':')[1].strip()) - FP = int(parts[2].split(':')[1].strip()) - FN = int(parts[3].split(':')[1].strip()) - - # Calculate metrics - FPR = FP / (FP + TN) if (FP + TN) != 0 else 0 - FNR = FN / (FN + TP) if (FN + TP) != 0 else 0 - TNR = TN / (TN + FP) if (TN + FP) != 0 else 0 - TPR = TP / (TP + FN) if (TP + FN) != 0 else 0 - Precision = TP / (TP + FP) if (TP + FP) != 0 else 0 - Recall = TPR # Recall is the same as TPR - F1 = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) != 0 else 0 - Accuracy = (TP + TN) / (TP + TN + FP + FN) - MCC = ((TP * TN) - (FP * FN)) / np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) if ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) != 0 else 0 - - # Append the values to the respective lists - FPR_values.append(FPR) - FNR_values.append(FNR) - TNR_values.append(TNR) - TPR_values.append(TPR) - F1_values.append(F1) - accuracy_values.append(Accuracy) - precision_values.append(Precision) - MCC_values.append(MCC) - recall_values.append(Recall) - - return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values - -def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values): - # Separate the values into two groups based on their proximity to 0 or 1 - close_to_0 = { - 'FPR': [], 'FNR': [] - } - close_to_1 = { - 'TNR': [], 'TPR': [], 'F1': [], 'accuracy': [], 'precision': [], 'MCC': [], 'recall': [] - } - - # Categorize the metrics into two groups - for i in range(len(FPR_values)): - close_to_0['FPR'].append(FPR_values[i]) - close_to_0['FNR'].append(FNR_values[i]) - - close_to_1['TNR'].append(TNR_values[i]) - close_to_1['TPR'].append(TPR_values[i]) - close_to_1['F1'].append(F1_values[i]) - close_to_1['accuracy'].append(accuracy_values[i]) - close_to_1['precision'].append(precision_values[i]) - close_to_1['MCC'].append(MCC_values[i]) - close_to_1['recall'].append(recall_values[i]) - - # Plot metrics for values close to 0 - plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True) - - # Plot metrics for values close to 1 - plot_single_group(close_to_1, 'metrics_plot_close_to_1.png') - - # Print the final values - print("\nFinal Metric Values:") - print(f"Final FPR: {FPR_values[-1]:.4f}") - print(f"Final FNR: {FNR_values[-1]:.4f}") - print(f"Final TNR: {TNR_values[-1]:.4f}") - print(f"Final TPR: {TPR_values[-1]:.4f}") - print(f"Final F1 Score: {F1_values[-1]:.4f}") - print(f"Final Accuracy: {accuracy_values[-1]:.4f}") - print(f"Final Precision: {precision_values[-1]:.4f}") - print(f"Final MCC: {MCC_values[-1]:.4f}") - print(f"Final Recall: {recall_values[-1]:.4f}") - -def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): - plt.figure(figsize=(12, 8)) - - # Only plot the metrics that exist in the dictionary - if 'FPR' in metrics_dict: - plt.plot(metrics_dict['FPR'], label='False Positive Rate (FPR)', marker='o') - if 'FNR' in metrics_dict: - plt.plot(metrics_dict['FNR'], label='False Negative Rate (FNR)', marker='o') - if 'TNR' in metrics_dict: - plt.plot(metrics_dict['TNR'], label='True Negative Rate (TNR)', marker='o') - if 'TPR' in metrics_dict: - plt.plot(metrics_dict['TPR'], label='True Positive Rate (TPR)', marker='o') - if 'F1' in metrics_dict: - plt.plot(metrics_dict['F1'], label='F1 Score', marker='o') - if 'accuracy' in metrics_dict: - plt.plot(metrics_dict['accuracy'], label='Accuracy', marker='o') - if 'precision' in metrics_dict: - plt.plot(metrics_dict['precision'], label='Precision', marker='o') - if 'MCC' in metrics_dict: - plt.plot(metrics_dict['MCC'], label='Matthews Correlation Coefficient (MCC)', marker='o') - if 'recall' in metrics_dict: - plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o') - - # Apply log scale by default - plt.yscale('log') - - # If the plot is close to 0, set custom ticks - if is_close_to_0: - # Add more ticks between 0 and 1 (using a logarithmic scale) - plt.yticks([0.01, 0.1, 1, 10, 100], ['0.01', '0.1', '1', '10', '100']) - - plt.xlabel('Index') - plt.ylabel('Metric Value') - plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})') - plt.legend() - - # Save the plot - plt.savefig(output_filename) - plt.close() - -def main(): - if len(sys.argv) != 2: - print("Usage: python script.py ") - sys.exit(1) - - file_path = sys.argv[1] - - FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path) - plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values) - -if __name__ == "__main__": - main() +# SPDX-FileCopyrightText: 2021 Sebastian Garcia +from typing import Optional + +# SPDX-License-Identifier: GPL-2.0-only +import numpy +from sklearn.linear_model import SGDClassifier +from sklearn.preprocessing import StandardScaler +import pickle +import pandas as pd +import json +import traceback +import warnings +import os + +from slips_files.common.parsers.config_parser import ConfigParser +from slips_files.common.slips_utils import utils +from slips_files.common.abstracts.module import IModule +from slips_files.core.structures.evidence import ( + Evidence, + ProfileID, + TimeWindow, + Attacker, + ThreatLevel, + EvidenceType, + IoCType, + Direction, + Victim, + Method, +) + +# This horrible hack is only to stop sklearn from printing those warnings +def warn(*args, **kwargs): + pass + + +warnings.warn = warn + + +class FlowMLDetection(IModule): + # Name: short name of the module. Do not use spaces + name = "Flow ML Detection" + description = ( + "Train or test a Machine Learning model to detect malicious flows" + ) + authors = ["Sebastian Garcia"] + + def init(self): + # Subscribe to the channel + self.c1 = self.db.subscribe("new_flow") + self.channels = {"new_flow": self.c1} + self.fieldseparator = self.db.get_field_separator() + # Set the output queue of our database instance + # Read the configuration + self.read_configuration() + # Minum amount of new labels needed to start the train + self.minimum_labels_to_start_train = 50 + # Minum amount of new labels needed to retrain + self.minimum_labels_to_retrain = 50 + # The number of flows when last trained. Used internally only to know + # when to retrain + self.last_number_of_flows_when_trained = 0 + # The scaler trained during training and to use during testing + self.scaler = StandardScaler() + self.model_path = "./modules/flowmldetection/model.bin" + self.scaler_path = "./modules/flowmldetection/scaler.bin" + + # Initialize the training log file + self.training_log_path = "./modules/flowmldetection/training.log" + with open(self.training_log_path, "w") as log_file: + log_file.write("Training Log Initialized\n") + + def read_configuration(self): + conf = ConfigParser() + self.mode = conf.get_ml_mode() + # This is the global label in the configuration, + # in case the flows do not have a label themselves + self.label = conf.label() + + def write_to_training_log(self, message: str): + """ + Write a message to the training log file. + """ + try: + with open(self.training_log_path, "a") as log_file: + log_file.write(message + "\n") + except Exception as e: + self.print(f"Error writing to training log: {e}", 0, 1) + + def train(self, sum_labeled_flows): + """ + Train a model based on the flows we receive and the labels + """ + try: + # Create X_flow with the current flows minus the label + X_flow = self.flows.drop("ground_truth_label", axis=1) + # Drop the detailed labels + X_flow = X_flow.drop("detailed_ground_truth_label", axis=1) + # Drop the module_labels + X_flow = X_flow.drop("module_labels", axis=1) + # Create y_flow with the label + y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label) + + # Normalize this batch of data so far. This can get progressivle slow + X_flow = self.scaler.fit_transform(X_flow) + + # Count the number of labels of each type in this epoc + epoch_label_counts = { + "Background": (y_flow == "Background").sum(), + "Malicious": (y_flow == "Malicious").sum(), + "Benign": (y_flow == "Benign").sum(), + } + + # Train + try: + # Online incremental learning + self.clf.partial_fit( + X_flow, y_flow, classes=["Background", "Malicious", "Benign"] + ) + except Exception: + self.print("Error while calling clf.train()") + self.print(traceback.format_exc(), 0, 1) + + # See score so far in training + score = self.clf.score(X_flow, y_flow) + + #self.print(f" Training Score: {score}", 1, 0) + #self.print(f' Model Parameters: {self.clf.coef_}', 1, 0) + + # Store the models on disk + self.store_model() + + # Log training information + self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}") + #self.write_to_training_log(f"Model parameters: {self.clf.coef_}") + except Exception: + self.print("Error in train().", 0, 1) + self.print(traceback.format_exc(), 0, 1) + self.write_to_training_log("Error occurred during training.") + + def process_features(self, dataset): + """ + Discards some features of the dataset and can create new. + Clean the dataset + """ + try: + # Discard some type of flows that dont have ports + to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""] + for proto in to_discard: + dataset = dataset[dataset.proto != proto] + + # If te proto is in the list to delete and there is only one flow, then the dataset will be empty + if dataset.empty: + # DataFrame is empty now, so return empty + return dataset + + # For now, discard these + to_drop = [ + "appproto", + "daddr", + "saddr", + "starttime", + "type_", + "smac", + "dmac", + "history", + "uid", + "dir_", + "endtime", + "flow_source", + ] + for field in to_drop: + try: + dataset = dataset.drop(field, axis=1) + except (ValueError, KeyError): + pass + + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others + # So transform here + dataset["state"] = dataset.apply( + lambda row: self.db.get_final_state_from_flags( + row["state"], (row["spkts"] + row["dpkts"]) + ), + axis=1, + ) + + # Convert state to categorical + dataset.state = dataset.state.str.replace( + r"(^.*Not Established.*$)", "0", regex=True + ) + dataset.state = dataset.state.str.replace( + r"(^.*Established.*$)", "1", regex=True + ) + + # Convert categories to floats + dataset.state = dataset.state.astype("float64") + + # Convert proto to categorical. For now we only have few states, so we can hardcode... + # We dont use the data to create categories because in testing mode + # we dont see all the protocols + # Also we dont store the Categorizer because the user can retrain + # with its own data. + dataset.proto = dataset.proto.str.lower() + dataset.proto = dataset.proto.str.replace( + r"(^.*tcp.*$)", "0", regex=True + ) + dataset.proto = dataset.proto.str.replace( + r"(^.*udp.*$)", "1", regex=True + ) + dataset.proto = dataset.proto.str.replace( + r"(^.*icmp.*$)", "2", regex=True + ) + dataset.proto = dataset.proto.str.replace( + r"(^.*icmp-ipv6.*$)", "3", regex=True + ) + dataset.proto = dataset.proto.str.replace( + r"(^.*arp.*$)", "4", regex=True + ) + + dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"] + dataset["pkts"] = dataset["spkts"] + dataset["dpkts"] + + fields_to_convert_to_float = [ + dataset.proto, + dataset.dport, + dataset.sport, + dataset.dur, + dataset.pkts, + dataset.spkts, + dataset.allbytes, + dataset.sbytes, + dataset.state, + ] + for field in fields_to_convert_to_float: + try: + field = field.astype("float64") + except (ValueError, AttributeError): + pass + + return dataset + except Exception: + # Stop the timer + self.print("Error in process_features()") + self.print(traceback.format_exc(), 0, 1) + + def process_training_flows(self, last_number_of_flows_when_trained): + """ + Process only the new flows in the DB since the last training. + Store the pandas df in self.flows + """ + try: + # Ensure the index is an integer + if last_number_of_flows_when_trained is None: + last_number_of_flows_when_trained = 0 + else: + last_number_of_flows_when_trained = int(last_number_of_flows_when_trained) + + # We get all the flows so far + flows = self.db.get_all_flows() + # Only process new flows since last training + new_flows = flows[last_number_of_flows_when_trained:] + + # Check how many **different** labels are in the DB + labels = self.db.get_labels() + if len(labels) == 1: + # Insert fake flows for both classes if needed + new_flows.append( + { + "starttime": 1594417039.029793, + "dur": "1.9424750804901123", + "saddr": "10.7.10.101", + "sport": "49733", + "daddr": "40.70.224.145", + "dport": "443", + "proto": "tcp", + "state": "SF", + "spkts": 17, + "dpkts": 27, + "sbytes": 25517, + "dbytes": 17247, + "appproto": "ssl", + "ground_truth_label": "Malicious", + "module_labels": { + "flowalerts-long-connection": "Malicious" + }, + } + ) + new_flows.append( + { + "starttime": 1382355032.706468, + "dur": "10.896695", + "saddr": "147.32.83.52", + "sport": "47956", + "daddr": "80.242.138.72", + "dport": "80", + "proto": "tcp", + "state": "SF", + "spkts": 1, + "dpkts": 0, + "sbytes": 100, + "dbytes": 67596, + "appproto": "http", + "ground_truth_label": "Benign", + "module_labels": { + "flowalerts-long-connection": "Benign" + }, + } + ) + + # Convert to pandas df + df_flows = pd.DataFrame(new_flows) + + # Process features + df_flows = self.process_features(df_flows) + + # Update the flow to the processed version + self.flows = df_flows + except Exception: + self.print("Error in process_flows()") + self.print(traceback.format_exc(), 0, 1) + + def process_flow(self, flow_to_process: dict): + """ + Process one flow. Only used during detection in testing + returns the pandas df with the processed flow + """ + try: + # Convert the flow to a pandas dataframe + raw_flow = pd.DataFrame(flow_to_process, index=[0]) + dflow = self.process_features(raw_flow) + if dflow.empty: + return None + # Update the flow to the processed version + return dflow + except Exception: + # Stop the timer + self.print("Error in process_flow()") + self.print(traceback.format_exc(), 0, 1) + + def detect(self, x_flow) -> Optional[numpy.ndarray]: + """ + Detects the given flow with the current model stored + and returns the predection array + """ + try: + # clean the flow + fields_to_drop = [ + "label", + "module_labels", + "uid", + "history", + "dir_", + "endtime", + "flow_source", + "ground_truth_label", + "detailed_ground_truth_label", + ] + # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. + # Error + ''' [Flow ML Detection] Error in detect() while processing + dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes + 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 + The feature names should match those that were passed during fit. + Feature names unseen at fit time: + - bytes + ''' + + # IF we delete here the filed bytes the error is + # [Flow ML Detection] Error in detect() while processing + # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes + # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 + # The feature names should match those that were passed during fit. + # Feature names must be in the same order as they were in fit. + + for field in fields_to_drop: + try: + x_flow = x_flow.drop(field, axis=1) + except (KeyError, ValueError): + pass + # Scale the flow + x_flow: numpy.ndarray = self.scaler.transform(x_flow) + pred: numpy.ndarray = self.clf.predict(x_flow) + return pred + except Exception as e: + self.print( + f"Error in detect() while processing " f"\n{x_flow}\n{e}" + ) + self.print(traceback.format_exc(), 0, 1) + + def store_model(self): + """ + Store the trained model on disk + """ + self.print("Storing the trained model and scaler on disk.", 0, 2) + with open(self.model_path, "wb") as f: + data = pickle.dumps(self.clf) + f.write(data) + with open(self.scaler_path, "wb") as g: + data = pickle.dumps(self.scaler) + g.write(data) + + def read_model(self): + """ + Read the trained model from disk + """ + try: + self.print("Reading the trained model from disk.", 0, 2) + with open(self.model_path, "rb") as f: + self.clf = pickle.load(f) + self.print("Reading the trained scaler from disk.", 0, 2) + with open(self.scaler_path, "rb") as g: + self.scaler = pickle.load(g) + except FileNotFoundError: + # If there is no model, create one empty + self.print( + "There was no model. " "Creating a new empty model.", 0, 2 + ) + self.clf = SGDClassifier( + warm_start=True, loss="hinge", penalty="l1" + ) + except EOFError: + self.print( + "Error reading model from disk. " + "Creating a new empty model.", + 0, + 2, + ) + self.clf = SGDClassifier( + warm_start=True, loss="hinge", penalty="l1" + ) + + def set_evidence_malicious_flow(self, flow: dict, twid: str): + confidence: float = 0.1 + description = ( + f"Flow with malicious characteristics by ML. Src IP" + f" {flow['saddr']}:{flow['sport']} to " + f"{flow['daddr']}:{flow['dport']}" + ) + twid_number = int(twid.replace("timewindow", "")) + evidence: Evidence = Evidence( + evidence_type=EvidenceType.MALICIOUS_FLOW, + attacker=Attacker( + direction=Direction.SRC, + ioc_type=IoCType.IP, + value=flow["saddr"], + ), + victim=Victim( + direction=Direction.DST, + ioc_type=IoCType.IP, + value=flow["daddr"], + ), + threat_level=ThreatLevel.LOW, + confidence=confidence, + description=description, + profile=ProfileID(ip=flow["saddr"]), + timewindow=TimeWindow(twid_number), + uid=[flow["uid"]], + timestamp=flow["starttime"], + method=Method.AI, + src_port=flow["sport"], + dst_port=flow["dport"], + ) + + self.db.set_evidence(evidence) + + def shutdown_gracefully(self): + # Confirm that the module is done processing + if self.mode == "train": + self.store_model() + + def pre_main(self): + utils.drop_root_privs() + # Load the model + self.read_model() + + def main(self): + if msg := self.get_msg("new_flow"): + # When a new flow arrives + msg = json.loads(msg["data"]) + self.twid = msg["twid"] + self.profileid = msg["profileid"] + self.flow = msg["flow"] + # These following extra fields are expected in testing. update the original + # flow dict to have them + self.flow.update( + { + "state": msg["interpreted_state"], + "label": msg["label"], + "module_labels": msg["module_labels"], + } + ) + + if self.mode == "train": + # We are training + + # Is the amount in the DB of labels enough to retrain? + # Use labeled flows + labels = self.db.get_labels() + sum_labeled_flows = sum(i[1] for i in labels) + + # The min labels to retrain is the min number of flows + # we should have seen so far in this capture to start training + # This is so we dont _start_ training with only 1 flow + + # Once we are over the start minimum, the second condition is + # to force to retrain every a minimum_labels_to_retrain number + # of flows. So we dont retrain every 1 flow. + if ( + sum_labeled_flows >= self.minimum_labels_to_start_train + ): + if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain): + # So for example we retrain every 50 labels and only when + # we have at least 50 labels + self.print( + f"Training the model with the last group of " + f"flows and labels. Total flows: {sum_labeled_flows}." + ) + # Process all flows in the DB and make them ready + # for pandas + self.process_training_flows(self.last_number_of_flows_when_trained) + # Train an algorithm + self.train(sum_labeled_flows) + self.last_number_of_flows_when_trained = sum_labeled_flows + + elif self.mode == "test": + # We are testing, which means using the model to detect + processed_flow = self.process_flow(self.flow) + + # After processing the flow, it may happen that we + # delete icmp/arp/etc so the dataframe can be empty + if processed_flow is not None and not processed_flow.empty: + # Predict + pred: numpy.ndarray = self.detect(processed_flow) + if not pred: + # an error occurred + return + + label = self.flow["label"] + if label and label != "unknown" and label != pred[0]: + # If the user specified a label in test mode, + # and the label is diff from the prediction, + # print in debug mode + self.print( + f"Predicted {pred[0]} for ground-truth label" + f' {label}. Flow {self.flow["saddr"]}:' + f'{self.flow["sport"]} ->' + f' {self.flow["daddr"]}:' + f'{self.flow["dport"]}/' + f'{self.flow["proto"]}', + 0, + 3, + ) + if pred[0] == "Malicious": + # Generate an alert + self.set_evidence_malicious_flow(self.flow, self.twid) + self.print( + f"Prediction {pred[0]} for label {label}" + f' flow {self.flow["saddr"]}:' + f'{self.flow["sport"]} -> ' + f'{self.flow["daddr"]}:' + f'{self.flow["dport"]}/' + f'{self.flow["proto"]}', + 0, + 2, + ) \ No newline at end of file From f53d7e6c8528af2bf011039e37324b1249bfbaa8 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:31:43 +0000 Subject: [PATCH 336/455] Fix plot test --- modules/flowmldetection/plot_testing_performance.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index 1b4152c6eb..977a68b2d5 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -75,7 +75,7 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu plot_single_group(close_to_0, 'performance_metrics_testing_close_to_0.png', is_close_to_0=True) # Plot metrics for values close to 1 (log scale) - plot_single_group(close_to_1, 'performnace_metrics_teting_close_to_1.png', is_close_to_0=False) + plot_single_group(close_to_1, 'performance_metrics_testing_close_to_1.png', is_close_to_0=False) # Print the final values print("\nFinal Metric Values:") @@ -123,10 +123,10 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): # Avoid log(0), so set the minimum limit a little higher than zero if min_val == 0: - min_val = 1e-8 # Avoid zero values on the logarithmic scale + min_val = 1e-4 # Avoid zero values on the logarithmic scale plt.ylim(min_val, max_val) # Set Y-axis limits based on the data range - plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=60)) # Set ticks logarithmically + plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6)) # Set ticks logarithmically plt.xlabel('Index') plt.ylabel('Metric Value') From 6a2c1379d07b8d65b1e9fbbd3c6c64061723f8b7 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:50:33 +0000 Subject: [PATCH 337/455] Add testing code to evaluate performance. It is optional with a varible --- modules/flowmldetection/flowmldetection.py | 60 +++++++++++++++------- 1 file changed, 42 insertions(+), 18 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 5e4e9aa462..b17a1baaf0 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -526,36 +526,21 @@ def main(self): elif self.mode == "test": # We are testing, which means using the model to detect processed_flow = self.process_flow(self.flow) - # After processing the flow, it may happen that we # delete icmp/arp/etc so the dataframe can be empty if processed_flow is not None and not processed_flow.empty: + original_label = processed_flow["ground_truth_label"].iloc[0] # Predict pred: numpy.ndarray = self.detect(processed_flow) if not pred: # an error occurred return - label = self.flow["label"] - if label and label != "unknown" and label != pred[0]: - # If the user specified a label in test mode, - # and the label is diff from the prediction, - # print in debug mode - self.print( - f"Predicted {pred[0]} for ground-truth label" - f' {label}. Flow {self.flow["saddr"]}:' - f'{self.flow["sport"]} ->' - f' {self.flow["daddr"]}:' - f'{self.flow["dport"]}/' - f'{self.flow["proto"]}', - 0, - 3, - ) if pred[0] == "Malicious": # Generate an alert self.set_evidence_malicious_flow(self.flow, self.twid) self.print( - f"Prediction {pred[0]} for label {label}" + f"Prediction {pred[0]} for label {original_label}" f' flow {self.flow["saddr"]}:' f'{self.flow["sport"]} -> ' f'{self.flow["daddr"]}:' @@ -563,4 +548,43 @@ def main(self): f'{self.flow["proto"]}', 0, 2, - ) \ No newline at end of file + ) + + # So you can disable this code easily. Since it is used only for evaluating a testing + log_testing_data = True + if log_testing_data: + # Initialize counters if not already done + if not hasattr(self, 'tp'): + self.tp = 0 + if not hasattr(self, 'tn'): + self.tn = 0 + if not hasattr(self, 'fp'): + self.fp = 0 + if not hasattr(self, 'fn'): + self.fn = 0 + + + # Update counters based on predictions and labels + if pred[0] == "Malicious" and original_label == "Malicious": + self.tp += 1 + elif pred[0] == "Benign" and original_label == "Benign": + self.tn += 1 + elif pred[0] == "Malicious" and original_label == "Benign": + self.fp += 1 + elif pred[0] == "Benign" and original_label == "Malicious": + self.fn += 1 + + testing_log_path = "./modules/flowmldetection/testing_performance.log" + try: + with open(testing_log_path, "a") as log_file: + log_file.write("Testing Performance Log Initialized\n") + # Log the testing performance metrics + log_file.write(f"TP: {self.tp}, TN: {self.tn}, FP: {self.fp}, FN: {self.fn}\n") + + # Log the original flow for false positives and false negatives + if pred[0] == "Malicious" and original_label == "Benign": + log_file.write(f"False Positive Flow: {self.flow}\n") + elif pred[0] == "Benign" and original_label == "Malicious": + log_file.write(f"False Negative Flow: {self.flow}\n") + except Exception as e: + self.print(f"Error initializing testing performance log: {e}", 0, 1) \ No newline at end of file From 9fd5cff376977d9a4d970033c5e824d80fed51a6 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 19:04:00 +0000 Subject: [PATCH 338/455] Fix plots --- .../plot_testing_performance.py | 30 +++++++++++-------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index 977a68b2d5..6865415cdf 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -1,6 +1,7 @@ import matplotlib.pyplot as plt import sys import numpy as np +import argparse def process_file(file_path): # Initialize the counters for the values @@ -49,7 +50,7 @@ def process_file(file_path): return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values -def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values): +def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, experiment_number): # Separate the values into two groups based on their proximity to 0 or 1 close_to_0 = { 'FPR': [], 'FNR': [] @@ -72,13 +73,13 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu close_to_1['recall'].append(recall_values[i]) # Plot metrics for values close to 0 (linear scale) - plot_single_group(close_to_0, 'performance_metrics_testing_close_to_0.png', is_close_to_0=True) + plot_single_group(close_to_0, f'performance_metrics_testing_close_to_0_experiment_{experiment_number}.png', experiment_number, is_close_to_0=True) # Plot metrics for values close to 1 (log scale) - plot_single_group(close_to_1, 'performance_metrics_testing_close_to_1.png', is_close_to_0=False) + plot_single_group(close_to_1, f'performance_metrics_testing_close_to_1_experiment_{experiment_number}.png', experiment_number, is_close_to_0=False) # Print the final values - print("\nFinal Metric Values:") + print("\nFinal Metric Values for Experiment", experiment_number) print(f"Final FPR: {FPR_values[-1]:.4f}") print(f"Final FNR: {FNR_values[-1]:.4f}") print(f"Final TNR: {TNR_values[-1]:.4f}") @@ -89,7 +90,7 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu print(f"Final MCC: {MCC_values[-1]:.4f}") print(f"Final Recall: {recall_values[-1]:.4f}") -def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): +def plot_single_group(metrics_dict, output_filename, experiment_number, is_close_to_0=False): plt.figure(figsize=(12, 8)) # Only plot the metrics that exist in the dictionary @@ -126,11 +127,12 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): min_val = 1e-4 # Avoid zero values on the logarithmic scale plt.ylim(min_val, max_val) # Set Y-axis limits based on the data range - plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6)) # Set ticks logarithmically + plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=60)) # Set ticks logarithmically + # Add the experiment number to the plot title plt.xlabel('Index') plt.ylabel('Metric Value') - plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})') + plt.title(f'Experiment {experiment_number} - Evaluation Metrics Over Time') plt.legend() # Save the plot @@ -138,14 +140,18 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): plt.close() def main(): - if len(sys.argv) != 2: - print("Usage: python script.py ") - sys.exit(1) + # Set up argument parsing + parser = argparse.ArgumentParser(description='Plot testing performance metrics.') + parser.add_argument('-f', '--file', type=str, required=True, help='Path to the testing performance log file') + parser.add_argument('-e', '--experiment', type=str, required=True, help='Experiment number') + + args = parser.parse_args() - file_path = sys.argv[1] + file_path = args.file + experiment_number = args.experiment FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path) - plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values) + plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, experiment_number) if __name__ == "__main__": main() From 3b88f410d4eebf2c8bc5cc7fc8056756d18d5e73 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 19:14:51 +0000 Subject: [PATCH 339/455] Fix train plot --- .../flowmldetection/plot_train_performance.py | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/modules/flowmldetection/plot_train_performance.py b/modules/flowmldetection/plot_train_performance.py index 80e13e9515..244df13d28 100644 --- a/modules/flowmldetection/plot_train_performance.py +++ b/modules/flowmldetection/plot_train_performance.py @@ -5,7 +5,7 @@ import argparse import os -def plot_log_data(file_path): +def plot_log_data(file_path, experiment_number): # Read the log data from the file with open(file_path, 'r') as file: log_data = file.read() @@ -28,7 +28,8 @@ def plot_log_data(file_path): # Get the directory of the log file to store the plot in the same folder dir_name = os.path.dirname(file_path) - plot_file = os.path.join(dir_name, 'performance_metrics_training.png') + # Append experiment number to the filename + plot_file = os.path.join(dir_name, f'performance_metrics_training_{experiment_number}.png') # Plotting the values fig, ax1 = plt.subplots(figsize=(10, 6)) @@ -55,18 +56,18 @@ def plot_log_data(file_path): for i, value in enumerate(df["Total labels"]): ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom') - # Adding title and legend - plt.title('Training performance') + # Adding title and legend with experiment number in title + plt.title(f'Training performance - Experiment {experiment_number}') fig.tight_layout() # Move both legends further to the right - ax1.legend(loc='upper right', bbox_to_anchor=(1.26, 1), fontsize='small', ncol=1) - ax2.legend(loc='upper right', bbox_to_anchor=(1.4, 0.95), fontsize='small', ncol=1) + ax1.legend(loc='upper right', bbox_to_anchor=(1.3, 1), fontsize='small', ncol=1) + ax2.legend(loc='upper right', bbox_to_anchor=(1.3, 0.85), fontsize='small', ncol=1) # Increase right margin for better readability of legend - plt.subplots_adjust(right=0.7) + plt.subplots_adjust(right=0.75) - # Save plot to the same folder as the log file + # Save plot to the same folder as the log file with experiment number in filename plt.savefig(plot_file) # Display the plot @@ -75,13 +76,14 @@ def plot_log_data(file_path): def main(): # Parse command-line arguments parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.") - parser.add_argument('log_file', metavar='log_file', type=str, help="Path to the log file") + parser.add_argument('-f', '--file', metavar='log_file', type=str, required=True, help="Path to the log file") + parser.add_argument('-e', '--experiment', metavar='experiment_number', type=str, required=True, help="Experiment number to add to the filename") # Handle -h / --help args = parser.parse_args() # Call the function to process the log file - plot_log_data(args.log_file) + plot_log_data(args.file, args.experiment) if __name__ == "__main__": main() From 9e683fa5a09f6e25d7bc4cd09a382c999400e85b Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 21:14:48 +0000 Subject: [PATCH 340/455] Fix plots --- .../flowmldetection/plot_train_performance.py | 122 ++++++++++-------- 1 file changed, 71 insertions(+), 51 deletions(-) diff --git a/modules/flowmldetection/plot_train_performance.py b/modules/flowmldetection/plot_train_performance.py index 244df13d28..5212dfeeaf 100644 --- a/modules/flowmldetection/plot_train_performance.py +++ b/modules/flowmldetection/plot_train_performance.py @@ -4,85 +4,105 @@ import sys import argparse import os +import matplotlib.ticker as ticker def plot_log_data(file_path, experiment_number): # Read the log data from the file with open(file_path, 'r') as file: log_data = file.read() - # Define regex pattern to extract relevant data from each line - pattern = r"Background: (\d+). Benign: (\d+). Malicious: (\d+). Total labels: (\d+\.\d+). Score: (\d+\.\d+)" + # Regex pattern for the new log format + pattern = ( + r"Total labels: ([\d\.]+), Background: (\d+). Benign: (\d+). Malicious: (\d+). Metrics: " + r"FPR=([\d\.]+), TNR=([\d\.]+), TPR=([\d\.]+), FNR=([\d\.]+), " + r"F1=([\d\.]+), Precision=([\d\.]+), Accuracy=([\d\.]+), MCC=([\d\.]+), Recall=([\d\.]+)\." + ) # Parse the log file data = re.findall(pattern, log_data) # Convert data to a DataFrame - df = pd.DataFrame(data, columns=["Background", "Benign", "Malicious", "Total labels", "Score"]) + columns = [ + "Total labels", "Background", "Benign", "Malicious", + "FPR", "TNR", "TPR", "FNR", "F1", "Precision", "Accuracy", "MCC", "Recall" + ] + df = pd.DataFrame(data, columns=columns) df = df.astype({ + "Total labels": float, "Background": int, "Benign": int, "Malicious": int, - "Total labels": float, - "Score": float + "FPR": float, + "TNR": float, + "TPR": float, + "FNR": float, + "F1": float, + "Precision": float, + "Accuracy": float, + "MCC": float, + "Recall": float, }) - # Get the directory of the log file to store the plot in the same folder dir_name = os.path.dirname(file_path) - # Append experiment number to the filename - plot_file = os.path.join(dir_name, f'performance_metrics_training_{experiment_number}.png') - - # Plotting the values - fig, ax1 = plt.subplots(figsize=(10, 6)) - # Plotting Score on the left y-axis (with proper scaling from 0 to 1) - ax1.plot(df.index, df["Score"], label="Score", color='tab:blue') + # --- Plot 1: Number of labels (linear scale, no total labels) --- + fig1, ax1 = plt.subplots(figsize=(10, 6)) + ax1.plot(df.index, df["Background"], label="Background", color='black') + ax1.plot(df.index, df["Benign"], label="Benign", color='cyan') + ax1.plot(df.index, df["Malicious"], label="Malicious", color='magenta') ax1.set_xlabel('Index') - ax1.set_ylabel('Score', color='tab:blue') - ax1.set_ylim(0, 1) # Set y-axis for Score from 0 to 1 - ax1.tick_params(axis='y', labelcolor='tab:blue') - - # Create the second y-axis for the Background, Benign, Malicious - ax2 = ax1.twinx() - ax2.plot(df.index, df["Background"], label="Background Labels", color='tab:green', linestyle='--') - ax2.plot(df.index, df["Benign"], label="Benign Labels", color='tab:orange', linestyle='--') - ax2.plot(df.index, df["Malicious"], label="Malicious Labels", color='tab:pink', linestyle='--') - ax2.set_ylabel('Background, Benign, Malicious Labels', color='tab:red') - - # Set appropriate scale for right y-axis based on the data - ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max()) - ax2.tick_params(axis='y', labelcolor='tab:red') - - # Annotating Total labels as text on the plot - for i, value in enumerate(df["Total labels"]): - ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom') - - # Adding title and legend with experiment number in title - plt.title(f'Training performance - Experiment {experiment_number}') - fig.tight_layout() + ax1.set_ylabel('Label Counts') + # No log scale here + ax1.set_title(f'Label Counts - Experiment {experiment_number}') + ax1.legend() + ax1.yaxis.set_major_locator(ticker.MaxNLocator(70)) + plt.tight_layout() + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_labels.png')) + + # --- Plot 2: FNR and FPR (log scale) --- + fig2, ax2 = plt.subplots(figsize=(10, 6)) + ax2.plot(df.index, df["FNR"], label="FNR", color='red') + ax2.plot(df.index, df["FPR"], label="FPR", color='blue') + ax2.set_xlabel('Index') + ax2.set_ylabel('Rate') + ax2.set_yscale('log') + ax2.set_title(f'FNR and FPR - Experiment {experiment_number}') + ax2.legend() + ax2.yaxis.set_major_locator(ticker.MaxNLocator(100)) + plt.tight_layout() + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_fnr_fpr.png')) + + # --- Plot 3: Other metrics (log scale) --- + fig3, ax3 = plt.subplots(figsize=(12, 7)) + metrics_rest = ["TNR", "TPR", "F1", "Precision", "Accuracy", "MCC", "Recall"] + colors_rest = [ + 'tab:blue', 'tab:green', 'tab:purple', 'tab:brown', + 'tab:gray', 'tab:pink', 'tab:olive' + ] + for metric, color in zip(metrics_rest, colors_rest): + ax3.plot(df.index, df[metric], label=metric, color=color) + ax3.set_xlabel('Index') + ax3.set_ylabel('Metric Value') + ax3.set_yscale('log') + ax3.set_title(f'Performance Metrics (except FNR/FPR) - Experiment {experiment_number}') + ax3.legend() + ax3.yaxis.set_major_locator(ticker.MaxNLocator(50)) + plt.tight_layout() + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_other_metrics.png')) - # Move both legends further to the right - ax1.legend(loc='upper right', bbox_to_anchor=(1.3, 1), fontsize='small', ncol=1) - ax2.legend(loc='upper right', bbox_to_anchor=(1.3, 0.85), fontsize='small', ncol=1) - - # Increase right margin for better readability of legend - plt.subplots_adjust(right=0.75) - - # Save plot to the same folder as the log file with experiment number in filename - plt.savefig(plot_file) - - # Display the plot plt.show() + # --- Print final values in terminal --- + print("\nFinal values at last training step:") + for col in ["Total labels", "Background", "Benign", "Malicious", + "FPR", "TNR", "TPR", "FNR", "F1", "Precision", "Accuracy", "MCC", "Recall"]: + print(f"{col}: {df[col].iloc[-1]}") + def main(): - # Parse command-line arguments parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.") parser.add_argument('-f', '--file', metavar='log_file', type=str, required=True, help="Path to the log file") parser.add_argument('-e', '--experiment', metavar='experiment_number', type=str, required=True, help="Experiment number to add to the filename") - - # Handle -h / --help args = parser.parse_args() - - # Call the function to process the log file plot_log_data(args.file, args.experiment) if __name__ == "__main__": From 632ddbcd650375a5b6a41d0bb724c20fd3766e4f Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 21:16:01 +0000 Subject: [PATCH 341/455] Add performance metrics to the training evaluation --- modules/flowmldetection/flowmldetection.py | 58 +++++++++++++++++----- 1 file changed, 46 insertions(+), 12 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index b17a1baaf0..2c60cd4034 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -10,7 +10,16 @@ import json import traceback import warnings -import os +from sklearn.metrics import classification_report, confusion_matrix +from sklearn.metrics import ( + confusion_matrix, + f1_score, + precision_score, + accuracy_score, + matthews_corrcoef, + recall_score, +) + from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils @@ -86,21 +95,21 @@ def write_to_training_log(self, message: str): except Exception as e: self.print(f"Error writing to training log: {e}", 0, 1) - def train(self, sum_labeled_flows): + def train(self, sum_labeled_flows, last_number_of_flows_when_trained): """ Train a model based on the flows we receive and the labels """ try: + # Create y_flow with the label + y_flow = numpy.full(self.flows.shape[0], self.flows.ground_truth_label) # Create X_flow with the current flows minus the label X_flow = self.flows.drop("ground_truth_label", axis=1) # Drop the detailed labels X_flow = X_flow.drop("detailed_ground_truth_label", axis=1) # Drop the module_labels X_flow = X_flow.drop("module_labels", axis=1) - # Create y_flow with the label - y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label) - # Normalize this batch of data so far. This can get progressivle slow + # Normalize this batch of data so far. This can get progressively slow X_flow = self.scaler.fit_transform(X_flow) # Count the number of labels of each type in this epoc @@ -120,18 +129,43 @@ def train(self, sum_labeled_flows): self.print("Error while calling clf.train()") self.print(traceback.format_exc(), 0, 1) - # See score so far in training - score = self.clf.score(X_flow, y_flow) + # Predict on the training data + y_pred = self.clf.predict(X_flow) - #self.print(f" Training Score: {score}", 1, 0) - #self.print(f' Model Parameters: {self.clf.coef_}', 1, 0) + # For metrics, let's focus on Malicious vs Benign (ignore Background) + mask = (y_flow == "Malicious") | (y_flow == "Benign") + y_true_bin = y_flow[mask] + y_pred_bin = y_pred[mask] + + # Map to binary: Malicious=1, Benign=0 + y_true_bin = numpy.where(y_true_bin == "Malicious", 1, 0) + y_pred_bin = numpy.where(y_pred_bin == "Malicious", 1, 0) + + # Compute confusion matrix: tn, fp, fn, tp + tn, fp, fn, tp = confusion_matrix(y_true_bin, y_pred_bin, labels=[0,1]).ravel() if len(set(y_true_bin)) > 1 else (0,0,0,0) + + # Compute metrics + FPR = fp / (fp + tn) if (fp + tn) > 0 else 0 + TNR = tn / (tn + fp) if (tn + fp) > 0 else 0 + TPR = tp / (tp + fn) if (tp + fn) > 0 else 0 + FNR = fn / (fn + tp) if (fn + tp) > 0 else 0 + F1 = f1_score(y_true_bin, y_pred_bin, zero_division=0) + PREC = precision_score(y_true_bin, y_pred_bin, zero_division=0) + ACCU = accuracy_score(y_true_bin, y_pred_bin) + MCC = matthews_corrcoef(y_true_bin, y_pred_bin) if len(set(y_true_bin)) > 1 else 0 + RECALL = recall_score(y_true_bin, y_pred_bin, zero_division=0) # Store the models on disk self.store_model() # Log training information - self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}") - #self.write_to_training_log(f"Model parameters: {self.clf.coef_}") + self.write_to_training_log( + f"Total labels: {sum_labeled_flows}, " + f"Background: {epoch_label_counts['Background']}. " + f"Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. " + f"Metrics: FPR={FPR:.4f}, TNR={TNR:.4f}, TPR={TPR:.4f}, FNR={FNR:.4f}, " + f"F1={F1:.4f}, Precision={PREC:.4f}, Accuracy={ACCU:.4f}, MCC={MCC:.4f}, Recall={RECALL:.4f}." + ) except Exception: self.print("Error in train().", 0, 1) self.print(traceback.format_exc(), 0, 1) @@ -520,7 +554,7 @@ def main(self): # for pandas self.process_training_flows(self.last_number_of_flows_when_trained) # Train an algorithm - self.train(sum_labeled_flows) + self.train(sum_labeled_flows, self.last_number_of_flows_when_trained) self.last_number_of_flows_when_trained = sum_labeled_flows elif self.mode == "test": From 1d3346dbeb3653238427b291b9b8d90e01a2f578 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sun, 4 May 2025 12:50:46 +0000 Subject: [PATCH 342/455] Fix experiment names --- modules/flowmldetection/plot_train_performance.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/modules/flowmldetection/plot_train_performance.py b/modules/flowmldetection/plot_train_performance.py index 5212dfeeaf..304f0f4ead 100644 --- a/modules/flowmldetection/plot_train_performance.py +++ b/modules/flowmldetection/plot_train_performance.py @@ -52,12 +52,12 @@ def plot_log_data(file_path, experiment_number): ax1.plot(df.index, df["Malicious"], label="Malicious", color='magenta') ax1.set_xlabel('Index') ax1.set_ylabel('Label Counts') - # No log scale here ax1.set_title(f'Label Counts - Experiment {experiment_number}') ax1.legend() ax1.yaxis.set_major_locator(ticker.MaxNLocator(70)) + ax1.xaxis.set_major_locator(ticker.MaxNLocator(50)) plt.tight_layout() - plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_labels.png')) + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_labels.png')) # --- Plot 2: FNR and FPR (log scale) --- fig2, ax2 = plt.subplots(figsize=(10, 6)) @@ -69,8 +69,9 @@ def plot_log_data(file_path, experiment_number): ax2.set_title(f'FNR and FPR - Experiment {experiment_number}') ax2.legend() ax2.yaxis.set_major_locator(ticker.MaxNLocator(100)) + ax2.xaxis.set_major_locator(ticker.MaxNLocator(50)) plt.tight_layout() - plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_fnr_fpr.png')) + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_fnr_fpr.png')) # --- Plot 3: Other metrics (log scale) --- fig3, ax3 = plt.subplots(figsize=(12, 7)) @@ -87,8 +88,9 @@ def plot_log_data(file_path, experiment_number): ax3.set_title(f'Performance Metrics (except FNR/FPR) - Experiment {experiment_number}') ax3.legend() ax3.yaxis.set_major_locator(ticker.MaxNLocator(50)) + ax3.xaxis.set_major_locator(ticker.MaxNLocator(50)) plt.tight_layout() - plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_other_metrics.png')) + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_other_metrics.png')) plt.show() From 36129e51da4879ee590f2c76ad502372fb6954e7 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 343/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 480 +++++++++++---------- 1 file changed, 254 insertions(+), 226 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 2c60cd4034..16b67e9038 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -10,16 +10,7 @@ import json import traceback import warnings -from sklearn.metrics import classification_report, confusion_matrix -from sklearn.metrics import ( - confusion_matrix, - f1_score, - precision_score, - accuracy_score, - matthews_corrcoef, - recall_score, -) - +import sys from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils @@ -37,6 +28,10 @@ Method, ) +# Only for debbuging +# from matplotlib import pyplot as plt + + # This horrible hack is only to stop sklearn from printing those warnings def warn(*args, **kwargs): pass @@ -61,115 +56,206 @@ def init(self): # Set the output queue of our database instance # Read the configuration self.read_configuration() - # Minum amount of new labels needed to start the train - self.minimum_labels_to_start_train = 50 - # Minum amount of new labels needed to retrain - self.minimum_labels_to_retrain = 50 - # The number of flows when last trained. Used internally only to know - # when to retrain - self.last_number_of_flows_when_trained = 0 + # Minum amount of new lables needed to trigger the train + self.minimum_lables_to_retrain = 50 + # To plot the scores of training + # self.scores = [] # The scaler trained during training and to use during testing self.scaler = StandardScaler() self.model_path = "./modules/flowmldetection/model.bin" self.scaler_path = "./modules/flowmldetection/scaler.bin" - # Initialize the training log file - self.training_log_path = "./modules/flowmldetection/training.log" - with open(self.training_log_path, "w") as log_file: - log_file.write("Training Log Initialized\n") - def read_configuration(self): conf = ConfigParser() self.mode = conf.get_ml_mode() - # This is the global label in the configuration, - # in case the flows do not have a label themselves - self.label = conf.label() - - def write_to_training_log(self, message: str): - """ - Write a message to the training log file. - """ - try: - with open(self.training_log_path, "a") as log_file: - log_file.write(message + "\n") - except Exception as e: - self.print(f"Error writing to training log: {e}", 0, 1) - def train(self, sum_labeled_flows, last_number_of_flows_when_trained): + def train(self): """ Train a model based on the flows we receive and the labels """ try: - # Create y_flow with the label - y_flow = numpy.full(self.flows.shape[0], self.flows.ground_truth_label) - # Create X_flow with the current flows minus the label - X_flow = self.flows.drop("ground_truth_label", axis=1) - # Drop the detailed labels - X_flow = X_flow.drop("detailed_ground_truth_label", axis=1) - # Drop the module_labels + # Process the labels to have only Normal and Malware + self.flows.label = self.flows.label.str.replace( + r"(^.*ormal.*$)", "Normal", regex=True + ) + self.flows.label = self.flows.label.str.replace( + r"(^.*alware.*$)", "Malware", regex=True + ) + self.flows.label = self.flows.label.str.replace( + r"(^.*alicious.*$)", "Malware", regex=True + ) + + # Separate + y_flow = self.flows["label"] + X_flow = self.flows.drop("label", axis=1) X_flow = X_flow.drop("module_labels", axis=1) - # Normalize this batch of data so far. This can get progressively slow + # Normalize this batch of data so far. This can get progressivle slow X_flow = self.scaler.fit_transform(X_flow) - # Count the number of labels of each type in this epoc - epoch_label_counts = { - "Background": (y_flow == "Background").sum(), - "Malicious": (y_flow == "Malicious").sum(), - "Benign": (y_flow == "Benign").sum(), - } - # Train try: - # Online incremental learning self.clf.partial_fit( - X_flow, y_flow, classes=["Background", "Malicious", "Benign"] + X_flow, y_flow, classes=["Malware", "Normal"] ) except Exception: self.print("Error while calling clf.train()") self.print(traceback.format_exc(), 0, 1) - # Predict on the training data - y_pred = self.clf.predict(X_flow) + # See score so far in training + score = self.clf.score(X_flow, y_flow) - # For metrics, let's focus on Malicious vs Benign (ignore Background) - mask = (y_flow == "Malicious") | (y_flow == "Benign") - y_true_bin = y_flow[mask] - y_pred_bin = y_pred[mask] + # To debug the training score + # self.scores.append(score) - # Map to binary: Malicious=1, Benign=0 - y_true_bin = numpy.where(y_true_bin == "Malicious", 1, 0) - y_pred_bin = numpy.where(y_pred_bin == "Malicious", 1, 0) + self.print(f" Training Score: {score}", 0, 1) + # self.print(f' Model Parameters: {self.clf.coef_}') - # Compute confusion matrix: tn, fp, fn, tp - tn, fp, fn, tp = confusion_matrix(y_true_bin, y_pred_bin, labels=[0,1]).ravel() if len(set(y_true_bin)) > 1 else (0,0,0,0) - - # Compute metrics - FPR = fp / (fp + tn) if (fp + tn) > 0 else 0 - TNR = tn / (tn + fp) if (tn + fp) > 0 else 0 - TPR = tp / (tp + fn) if (tp + fn) > 0 else 0 - FNR = fn / (fn + tp) if (fn + tp) > 0 else 0 - F1 = f1_score(y_true_bin, y_pred_bin, zero_division=0) - PREC = precision_score(y_true_bin, y_pred_bin, zero_division=0) - ACCU = accuracy_score(y_true_bin, y_pred_bin) - MCC = matthews_corrcoef(y_true_bin, y_pred_bin) if len(set(y_true_bin)) > 1 else 0 - RECALL = recall_score(y_true_bin, y_pred_bin, zero_division=0) + # Debug code to store a plot in a png of the scores + # plt.plot(self.scores) + # plt.savefig('train-scores.png') # Store the models on disk self.store_model() - # Log training information - self.write_to_training_log( - f"Total labels: {sum_labeled_flows}, " - f"Background: {epoch_label_counts['Background']}. " - f"Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. " - f"Metrics: FPR={FPR:.4f}, TNR={TNR:.4f}, TPR={TPR:.4f}, FNR={FNR:.4f}, " - f"F1={F1:.4f}, Precision={PREC:.4f}, Accuracy={ACCU:.4f}, MCC={MCC:.4f}, Recall={RECALL:.4f}." - ) except Exception: - self.print("Error in train().", 0, 1) + self.print("Error in train()", 0, 1) + self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) self.print(traceback.format_exc(), 0, 1) - self.write_to_training_log("Error occurred during training.") def process_features(self, dataset): """ @@ -182,11 +268,6 @@ def process_features(self, dataset): for proto in to_discard: dataset = dataset[dataset.proto != proto] - # If te proto is in the list to delete and there is only one flow, then the dataset will be empty - if dataset.empty: - # DataFrame is empty now, so return empty - return dataset - # For now, discard these to_drop = [ "appproto", @@ -199,7 +280,9 @@ def process_features(self, dataset): "history", "uid", "dir_", + "dbytes", "endtime", + "bytes", "flow_source", ] for field in to_drop: @@ -208,16 +291,12 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, - # the state is not transformed to 'Established' or - # 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others # So transform here - dataset["state"] = dataset.apply( - lambda row: self.db.get_final_state_from_flags( - row["state"], (row["spkts"] + row["dpkts"]) - ), - axis=1, - ) + #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) + dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) + + #dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -251,11 +330,7 @@ def process_features(self, dataset): dataset.proto = dataset.proto.str.replace( r"(^.*arp.*$)", "4", regex=True ) - - dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"] - dataset["pkts"] = dataset["spkts"] + dataset["dpkts"] - - fields_to_convert_to_float = [ + fields_to_convert_to_flow = [ dataset.proto, dataset.dport, dataset.sport, @@ -266,10 +341,10 @@ def process_features(self, dataset): dataset.sbytes, dataset.state, ] - for field in fields_to_convert_to_float: + for field in fields_to_convert_to_flow: try: field = field.astype("float64") - except (ValueError, AttributeError): + except ValueError: pass return dataset @@ -278,72 +353,69 @@ def process_features(self, dataset): self.print("Error in process_features()") self.print(traceback.format_exc(), 0, 1) - def process_training_flows(self, last_number_of_flows_when_trained): + def process_flows(self): """ - Process only the new flows in the DB since the last training. + Process all the flwos in the DB Store the pandas df in self.flows """ try: - # Ensure the index is an integer - if last_number_of_flows_when_trained is None: - last_number_of_flows_when_trained = 0 - else: - last_number_of_flows_when_trained = int(last_number_of_flows_when_trained) - # We get all the flows so far + # because this retraining happens in batches flows = self.db.get_all_flows() - # Only process new flows since last training - new_flows = flows[last_number_of_flows_when_trained:] - - # Check how many **different** labels are in the DB + # Check how many different labels are in the DB + # We need both normal and malware labels = self.db.get_labels() if len(labels) == 1: - # Insert fake flows for both classes if needed - new_flows.append( + # Only 1 label has flows + # There are not enough different labels, so insert two flows + # that are fake but representative of a normal and malware flow + # they are only for the training process + # At least 1 flow of each label is required + # self.print(f'Amount of labeled flows: {labels}', 0, 1) + flows.append( { - "starttime": 1594417039.029793, + "ts": 1594417039.029793, "dur": "1.9424750804901123", "saddr": "10.7.10.101", "sport": "49733", "daddr": "40.70.224.145", "dport": "443", "proto": "tcp", - "state": "SF", - "spkts": 17, - "dpkts": 27, + "state": "Established", + "allbytes": 42764, + "spkts": 37, "sbytes": 25517, - "dbytes": 17247, "appproto": "ssl", - "ground_truth_label": "Malicious", + "label": "Malware", "module_labels": { - "flowalerts-long-connection": "Malicious" + "flowalerts-long-connection": "Malware" }, } ) - new_flows.append( + flows.append( { - "starttime": 1382355032.706468, + "ts": 1382355032.706468, "dur": "10.896695", "saddr": "147.32.83.52", "sport": "47956", "daddr": "80.242.138.72", "dport": "80", "proto": "tcp", - "state": "SF", + "state": "Established", + "allbytes": 67696, "spkts": 1, - "dpkts": 0, "sbytes": 100, - "dbytes": 67596, "appproto": "http", - "ground_truth_label": "Benign", + "label": "Normal", "module_labels": { - "flowalerts-long-connection": "Benign" + "flowalerts-long-connection": "Normal" }, } ) + # If there are enough flows, we dont insert them anymore # Convert to pandas df - df_flows = pd.DataFrame(new_flows) + df_flows = pd.DataFrame(flows) # Process features df_flows = self.process_features(df_flows) @@ -351,6 +423,7 @@ def process_training_flows(self, last_number_of_flows_when_trained): # Update the flow to the processed version self.flows = df_flows except Exception: + # Stop the timer self.print("Error in process_flows()") self.print(traceback.format_exc(), 0, 1) @@ -363,8 +436,6 @@ def process_flow(self, flow_to_process: dict): # Convert the flow to a pandas dataframe raw_flow = pd.DataFrame(flow_to_process, index=[0]) dflow = self.process_features(raw_flow) - if dflow.empty: - return None # Update the flow to the processed version return dflow except Exception: @@ -378,6 +449,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: and returns the predection array """ try: + given_x_flow = x_flow # clean the flow fields_to_drop = [ "label", @@ -385,28 +457,14 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "uid", "history", "dir_", + "dbytes", + "dpkts", "endtime", + "bytes", "flow_source", - "ground_truth_label", + "ground_truth_label", # todo now we can use them "detailed_ground_truth_label", ] - # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. - # Error - ''' [Flow ML Detection] Error in detect() while processing - dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes - 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 - The feature names should match those that were passed during fit. - Feature names unseen at fit time: - - bytes - ''' - - # IF we delete here the filed bytes the error is - # [Flow ML Detection] Error in detect() while processing - # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes - # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 - # The feature names should match those that were passed during fit. - # Feature names must be in the same order as they were in fit. - for field in fields_to_drop: try: x_flow = x_flow.drop(field, axis=1) @@ -418,7 +476,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: return pred except Exception as e: self.print( - f"Error in detect() while processing " f"\n{x_flow}\n{e}" + f"Error in detect() while processing " f"\n{given_x_flow}\n{e}" ) self.print(traceback.format_exc(), 0, 1) @@ -510,16 +568,18 @@ def pre_main(self): def main(self): if msg := self.get_msg("new_flow"): - # When a new flow arrives msg = json.loads(msg["data"]) - self.twid = msg["twid"] - self.profileid = msg["profileid"] + twid = msg["twid"] self.flow = msg["flow"] - # These following extra fields are expected in testing. update the original + # these fields are expected in testing. update the original # flow dict to have them self.flow.update( { + "allbytes": (self.flow["sbytes"] + self.flow["dbytes"]), + # the flow["state"] is the origstate, we dont need that here + # we need the interpreted state "state": msg["interpreted_state"], + "pkts": self.flow["spkts"] + self.flow["dpkts"], "label": msg["label"], "module_labels": msg["module_labels"], } @@ -532,49 +592,56 @@ def main(self): # Use labeled flows labels = self.db.get_labels() sum_labeled_flows = sum(i[1] for i in labels) - - # The min labels to retrain is the min number of flows - # we should have seen so far in this capture to start training - # This is so we dont _start_ training with only 1 flow - - # Once we are over the start minimum, the second condition is - # to force to retrain every a minimum_labels_to_retrain number - # of flows. So we dont retrain every 1 flow. if ( - sum_labeled_flows >= self.minimum_labels_to_start_train + sum_labeled_flows >= self.minimum_lables_to_retrain + and sum_labeled_flows % self.minimum_lables_to_retrain == 1 ): - if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain): - # So for example we retrain every 50 labels and only when - # we have at least 50 labels - self.print( - f"Training the model with the last group of " - f"flows and labels. Total flows: {sum_labeled_flows}." - ) - # Process all flows in the DB and make them ready - # for pandas - self.process_training_flows(self.last_number_of_flows_when_trained) - # Train an algorithm - self.train(sum_labeled_flows, self.last_number_of_flows_when_trained) - self.last_number_of_flows_when_trained = sum_labeled_flows - + # We get here every 'self.minimum_lables_to_retrain' + # amount of labels + # So for example we retrain every 100 labels and only when + # we have at least 100 labels + self.print( + f"Training the model with the last group of " + f"flows and labels. Total flows: {sum_labeled_flows}." + ) + # Process all flows in the DB and make them ready + # for pandas + self.process_flows() + # Train an algorithm + self.train() elif self.mode == "test": # We are testing, which means using the model to detect processed_flow = self.process_flow(self.flow) + # After processing the flow, it may happen that we # delete icmp/arp/etc so the dataframe can be empty if processed_flow is not None and not processed_flow.empty: - original_label = processed_flow["ground_truth_label"].iloc[0] # Predict pred: numpy.ndarray = self.detect(processed_flow) if not pred: # an error occurred return - if pred[0] == "Malicious": + label = self.flow["label"] + if label and label != "unknown" and label != pred[0]: + # If the user specified a label in test mode, + # and the label is diff from the prediction, + # print in debug mode + self.print( + f"Report Prediction {pred[0]} for label" + f' {label} flow {self.flow["saddr"]}:' + f'{self.flow["sport"]} ->' + f' {self.flow["daddr"]}:' + f'{self.flow["dport"]}/' + f'{self.flow["proto"]}', + 0, + 3, + ) + if pred[0] == "Malware": # Generate an alert - self.set_evidence_malicious_flow(self.flow, self.twid) + self.set_evidence_malicious_flow(self.flow, twid) self.print( - f"Prediction {pred[0]} for label {original_label}" + f"Prediction {pred[0]} for label {label}" f' flow {self.flow["saddr"]}:' f'{self.flow["sport"]} -> ' f'{self.flow["daddr"]}:' @@ -583,42 +650,3 @@ def main(self): 0, 2, ) - - # So you can disable this code easily. Since it is used only for evaluating a testing - log_testing_data = True - if log_testing_data: - # Initialize counters if not already done - if not hasattr(self, 'tp'): - self.tp = 0 - if not hasattr(self, 'tn'): - self.tn = 0 - if not hasattr(self, 'fp'): - self.fp = 0 - if not hasattr(self, 'fn'): - self.fn = 0 - - - # Update counters based on predictions and labels - if pred[0] == "Malicious" and original_label == "Malicious": - self.tp += 1 - elif pred[0] == "Benign" and original_label == "Benign": - self.tn += 1 - elif pred[0] == "Malicious" and original_label == "Benign": - self.fp += 1 - elif pred[0] == "Benign" and original_label == "Malicious": - self.fn += 1 - - testing_log_path = "./modules/flowmldetection/testing_performance.log" - try: - with open(testing_log_path, "a") as log_file: - log_file.write("Testing Performance Log Initialized\n") - # Log the testing performance metrics - log_file.write(f"TP: {self.tp}, TN: {self.tn}, FP: {self.fp}, FN: {self.fn}\n") - - # Log the original flow for false positives and false negatives - if pred[0] == "Malicious" and original_label == "Benign": - log_file.write(f"False Positive Flow: {self.flow}\n") - elif pred[0] == "Benign" and original_label == "Malicious": - log_file.write(f"False Negative Flow: {self.flow}\n") - except Exception as e: - self.print(f"Error initializing testing performance log: {e}", 0, 1) \ No newline at end of file From a9a38be1d23ebb45330d8bc616c9701c5181db61 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:35:21 +0300 Subject: [PATCH 344/455] state_handler: split get_final_state_from_flags() into smaller functions --- slips_files/common/state_handler.py | 179 ++++++++++++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 slips_files/common/state_handler.py diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py new file mode 100644 index 0000000000..b671a09a28 --- /dev/null +++ b/slips_files/common/state_handler.py @@ -0,0 +1,179 @@ +from typing import Optional +import sys +import traceback + + +def check_suricata_states(state) -> Optional[str]: + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for + UDP. For TCP, + these are: New, Established and Closed,for UDP only new and + established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + +def check_zeek_states(state) -> Optional[str]: + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + +def check_argus_states(state) -> Optional[str]: + pre = state.split("_")[0] + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + + +def check_tcp_states(state, pkts) -> Optional[str]: + pre = state.split("_")[0] + if "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. + # Most connections are reseted when finished and therefore are + # established + # It can happen that is reseted being not established, but we + # can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is + # not established because the OS retries 3 times. + return "Not Established" if int(pkts) <= 3 else "Established" + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. + # Most connections are finished with FIN when finished and + # therefore are established + # It can happen that is finished being not established, but we + # can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is + # not established because the OS retries 3 times. + return "Not Established" if int(pkts) <= 3 else "Established" + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + + +def check_udp_states(state) -> Optional[str]: + pre = state.split("_")[0] + if "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also + # NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + + +def check_icmp_states(state) -> Optional[str]: + pre = state.split("_")[0] + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + + +def get_final_state_from_flags(self, state, pkts) -> str: + """ + Analyze the flags given and return a summary of the state. + Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + if state := check_suricata_states(state): + return state + if state := check_zeek_states(state): + return state + if state := check_argus_states(state): + return state + except IndexError: + # suf does not exist, which means that this is some ICMP or + # no response was sent for UDP or TCP + if state := check_icmp_states(state): + return state + if state := check_udp_states(state): + return state + if state := check_tcp_states(state, pkts): + return state + + return "Not Established" + + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() " f"line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) From 96e0e65f772b4d7542b762fb500b73aff90b262b Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:48:16 +0300 Subject: [PATCH 345/455] state_handler: refactor get_final_state_from_flags() --- slips_files/common/state_handler.py | 67 +++++++++++++---------------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index b671a09a28..d0a05115bd 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,9 +1,7 @@ from typing import Optional -import sys -import traceback -def check_suricata_states(state) -> Optional[str]: +def interpret_suricata_states(state) -> Optional[str]: """ There are different states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]: return "Not Established" -def check_zeek_states(state) -> Optional[str]: +def interpret_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. # For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]: return "Established" -def check_argus_states(state) -> Optional[str]: +def interpret_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - suf = state.split("_")[1] + try: + suf = state.split("_")[1] + except IndexError: + return + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]: return "Not Established" -def check_tcp_states(state, pkts) -> Optional[str]: +def interpret_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def check_udp_states(state) -> Optional[str]: +def interpret_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]: return "Not Established" -def check_icmp_states(state) -> Optional[str]: +def interpret_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(self, state, pkts) -> str: +def get_final_state_from_flags(state, pkts) -> str: """ - Analyze the flags given and return a summary of the state. - Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections + Converts the original flags from the flow, to a state that slips + understands + Works with Argus, suricata, and Bro flags + We receive the packets to distinguish some Reset connections """ - try: - if state := check_suricata_states(state): - return state - if state := check_zeek_states(state): - return state - if state := check_argus_states(state): - return state - except IndexError: - # suf does not exist, which means that this is some ICMP or - # no response was sent for UDP or TCP - if state := check_icmp_states(state): - return state - if state := check_udp_states(state): - return state - if state := check_tcp_states(state, pkts): - return state - return "Not Established" + for interpreter in ( + interpret_suricata_states, + interpret_zeek_states, + interpret_argus_states, + interpret_icmp_states, + interpret_udp_states, + ): + if interpreted_state := interpreter(state): + return interpreted_state + + if interpreted_state := interpret_tcp_states(state, pkts): + return interpreted_state - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() " f"line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) + return "Not Established" From 5d655d2d2d16440bc9bf6eb07262cbbba7bddb3d Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 346/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 150 ++---------------- slips_files/core/database/database_manager.py | 3 - 2 files changed, 10 insertions(+), 143 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 16b67e9038..3379f5077f 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -291,12 +156,17 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others # So transform here - #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) - dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) - - #dataset.state = new_state_column + dataset["state"] = dataset.apply( + lambda row: get_final_state_from_flags( + row["state"], row["pkts"] + ), + axis=1, + ) + # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index 1d339685f8..568e78ff45 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -613,9 +613,6 @@ def add_out_dns(self, *args, **kwargs): def add_port(self, *args, **kwargs): return self.rdb.add_port(*args, **kwargs) - def get_final_state_from_flags(self, *args, **kwargs): - return self.rdb.get_final_state_from_flags(*args, **kwargs) - def add_ips(self, *args, **kwargs): return self.rdb.add_ips(*args, **kwargs) From 8cd019f174817eee464c90c05ba2a3d60365a852 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 347/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 3379f5077f..f052931c89 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From fdfd7fa0e06079e258530995ee65436f0f56bbf9 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 348/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index f052931c89..3379f5077f 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 5a5b751e2a4491b5cac57dfe3be26643d9d19b26 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 349/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 3379f5077f..f052931c89 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 2400ee226cf7d7678e06570988af29782c1eec10 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 350/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index f052931c89..3379f5077f 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 457cf59da0f4e4be130f661a5eefb01b01c238d4 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 351/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 3379f5077f..f052931c89 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From c35018ef7db18a9cb3b8facaee69b1dc3ec58479 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 352/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index f052931c89..3379f5077f 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 311e8de82f933c87f1d079613ebf2c8fd5e1a5c9 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 353/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 3379f5077f..f052931c89 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 75bb4ea33838004df0241d5c68561b77f642e3de Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 354/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index f052931c89..3379f5077f 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 6be900429ac675632a0d35e137f45bcb025a12f1 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 355/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 3379f5077f..f052931c89 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From e08f2903f4a43ae0ccdbce860e8e0639525ad2f7 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 356/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index f052931c89..3379f5077f 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 5de25cdb8e5b0d027fbc3df2f8f0467c2a53d489 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:23:29 +0100 Subject: [PATCH 357/455] Re add function that alya added --- slips_files/core/database/database_manager.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index 568e78ff45..1d339685f8 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -613,6 +613,9 @@ def add_out_dns(self, *args, **kwargs): def add_port(self, *args, **kwargs): return self.rdb.add_port(*args, **kwargs) + def get_final_state_from_flags(self, *args, **kwargs): + return self.rdb.get_final_state_from_flags(*args, **kwargs) + def add_ips(self, *args, **kwargs): return self.rdb.add_ips(*args, **kwargs) From 2b614c84fb077b37ecff4613981bc5e7bc031574 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:25:03 +0100 Subject: [PATCH 358/455] delete sys --- modules/flowmldetection/flowmldetection.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 3379f5077f..c06755a599 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -10,7 +10,6 @@ import json import traceback import warnings -import sys from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils From 7bce2ca4fc01178dddafb04b4dcb64a8295e142c Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:27:23 +0100 Subject: [PATCH 359/455] Delete file that was deleted from develop --- slips_files/common/state_handler.py | 170 ---------------------------- 1 file changed, 170 deletions(-) delete mode 100644 slips_files/common/state_handler.py diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py deleted file mode 100644 index d0a05115bd..0000000000 --- a/slips_files/common/state_handler.py +++ /dev/null @@ -1,170 +0,0 @@ -from typing import Optional - - -def interpret_suricata_states(state) -> Optional[str]: - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for - UDP. For TCP, - these are: New, Established and Closed,for UDP only new and - established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - -def interpret_zeek_states(state) -> Optional[str]: - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - -def interpret_argus_states(state) -> Optional[str]: - pre = state.split("_")[0] - try: - suf = state.split("_")[1] - except IndexError: - return - - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - - -def interpret_tcp_states(state, pkts) -> Optional[str]: - pre = state.split("_")[0] - if "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. - # Most connections are reseted when finished and therefore are - # established - # It can happen that is reseted being not established, but we - # can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is - # not established because the OS retries 3 times. - return "Not Established" if int(pkts) <= 3 else "Established" - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. - # Most connections are finished with FIN when finished and - # therefore are established - # It can happen that is finished being not established, but we - # can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is - # not established because the OS retries 3 times. - return "Not Established" if int(pkts) <= 3 else "Established" - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - - -def interpret_udp_states(state) -> Optional[str]: - pre = state.split("_")[0] - if "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also - # NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - - -def interpret_icmp_states(state) -> Optional[str]: - pre = state.split("_")[0] - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - - -def get_final_state_from_flags(state, pkts) -> str: - """ - Converts the original flags from the flow, to a state that slips - understands - Works with Argus, suricata, and Bro flags - We receive the packets to distinguish some Reset connections - """ - - for interpreter in ( - interpret_suricata_states, - interpret_zeek_states, - interpret_argus_states, - interpret_icmp_states, - interpret_udp_states, - ): - if interpreted_state := interpreter(state): - return interpreted_state - - if interpreted_state := interpret_tcp_states(state, pkts): - return interpreted_state - - return "Not Established" From 62cf6cd7fd287ff669faa225e315eed8ef045b73 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:32:01 +0100 Subject: [PATCH 360/455] Flowmldetection. Fix missing db reference --- modules/flowmldetection/flowmldetection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index c06755a599..87e07c7592 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -160,7 +160,7 @@ def process_features(self, dataset): # 'Not Established', it is still 'S0' and others # So transform here dataset["state"] = dataset.apply( - lambda row: get_final_state_from_flags( + lambda row: self.db.get_final_state_from_flags( row["state"], row["pkts"] ), axis=1, From 4c8f42673eac97e521e16d94d3bbbe03138d3e4f Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Tue, 18 Mar 2025 12:08:08 +0100 Subject: [PATCH 361/455] Fix the training of flows with ML in new version --- modules/flowmldetection/flowmldetection.py | 144 +++++++++++---------- 1 file changed, 77 insertions(+), 67 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 87e07c7592..e91495d649 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -55,8 +55,12 @@ def init(self): # Set the output queue of our database instance # Read the configuration self.read_configuration() - # Minum amount of new lables needed to trigger the train - self.minimum_lables_to_retrain = 50 + # Minum amount of new labels needed to start the train + self.minimum_labels_to_start_train = 50 + # Minum amount of new labels needed to retrain + self.minimum_labels_to_retrain = 50 + # The number of flows when last trained + self.last_number_of_flows_when_trained = 0 # To plot the scores of training # self.scores = [] # The scaler trained during training and to use during testing @@ -67,26 +71,25 @@ def init(self): def read_configuration(self): conf = ConfigParser() self.mode = conf.get_ml_mode() + self.label = conf.label() def train(self): """ Train a model based on the flows we receive and the labels """ try: - # Process the labels to have only Normal and Malware - self.flows.label = self.flows.label.str.replace( - r"(^.*ormal.*$)", "Normal", regex=True - ) - self.flows.label = self.flows.label.str.replace( - r"(^.*alware.*$)", "Malware", regex=True - ) - self.flows.label = self.flows.label.str.replace( - r"(^.*alicious.*$)", "Malware", regex=True - ) + # Get the flows from the DB + # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid) + # Convert to pandas df + # self.flows = pd.DataFrame(self.flows) + # Process the features + # X_flow = self.process_features(self.flows) - # Separate - y_flow = self.flows["label"] + # Create X_flow with the current flows minus the label X_flow = self.flows.drop("label", axis=1) + # Create y_flow with the label + y_flow = numpy.full(X_flow.shape[0], self.label) + # Drop the module_labels X_flow = X_flow.drop("module_labels", axis=1) # Normalize this batch of data so far. This can get progressivle slow @@ -95,7 +98,7 @@ def train(self): # Train try: self.clf.partial_fit( - X_flow, y_flow, classes=["Malware", "Normal"] + X_flow, y_flow, classes=["Malicious", "Benign"] ) except Exception: self.print("Error while calling clf.train()") @@ -118,7 +121,7 @@ def train(self): self.store_model() except Exception: - self.print("Error in train()", 0, 1) + self.print("Error in train().", 0, 1) self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): @@ -144,9 +147,7 @@ def process_features(self, dataset): "history", "uid", "dir_", - "dbytes", "endtime", - "bytes", "flow_source", ] for field in to_drop: @@ -161,11 +162,10 @@ def process_features(self, dataset): # So transform here dataset["state"] = dataset.apply( lambda row: self.db.get_final_state_from_flags( - row["state"], row["pkts"] + row["state"], (row["spkts"] + row["dpkts"]) ), axis=1, ) - # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -199,7 +199,11 @@ def process_features(self, dataset): dataset.proto = dataset.proto.str.replace( r"(^.*arp.*$)", "4", regex=True ) - fields_to_convert_to_flow = [ + + dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"] + dataset["pkts"] = dataset["spkts"] + dataset["dpkts"] + + fields_to_convert_to_float = [ dataset.proto, dataset.dport, dataset.sport, @@ -210,10 +214,10 @@ def process_features(self, dataset): dataset.sbytes, dataset.state, ] - for field in fields_to_convert_to_flow: + for field in fields_to_convert_to_float: try: field = field.astype("float64") - except ValueError: + except (ValueError, AttributeError): pass return dataset @@ -222,9 +226,9 @@ def process_features(self, dataset): self.print("Error in process_features()") self.print(traceback.format_exc(), 0, 1) - def process_flows(self): + def process_training_flows(self): """ - Process all the flwos in the DB + Process all the flows in the DB Store the pandas df in self.flows """ try: @@ -240,44 +244,48 @@ def process_flows(self): # that are fake but representative of a normal and malware flow # they are only for the training process # At least 1 flow of each label is required - # self.print(f'Amount of labeled flows: {labels}', 0, 1) + + # These flows should be in the same format as the ones in the DB. + # Which means the satate is still SF, S0, etc. flows.append( { - "ts": 1594417039.029793, + "starttime": 1594417039.029793, "dur": "1.9424750804901123", "saddr": "10.7.10.101", "sport": "49733", "daddr": "40.70.224.145", "dport": "443", "proto": "tcp", - "state": "Established", - "allbytes": 42764, - "spkts": 37, + "state": "SF", + "spkts": 17, + "dpkts": 27, "sbytes": 25517, + "dbytes": 17247, "appproto": "ssl", - "label": "Malware", + "label": "Malicious", "module_labels": { - "flowalerts-long-connection": "Malware" + "flowalerts-long-connection": "Malicious" }, } ) flows.append( { - "ts": 1382355032.706468, + "starttime": 1382355032.706468, "dur": "10.896695", "saddr": "147.32.83.52", "sport": "47956", "daddr": "80.242.138.72", "dport": "80", "proto": "tcp", - "state": "Established", - "allbytes": 67696, + "state": "SF", "spkts": 1, + "dpkts": 0, "sbytes": 100, + "dbytes": 67596, "appproto": "http", - "label": "Normal", + "label": "Benign", "module_labels": { - "flowalerts-long-connection": "Normal" + "flowalerts-long-connection": "Benign" }, } ) @@ -318,7 +326,6 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: and returns the predection array """ try: - given_x_flow = x_flow # clean the flow fields_to_drop = [ "label", @@ -326,10 +333,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "uid", "history", "dir_", - "dbytes", - "dpkts", "endtime", - "bytes", "flow_source", "ground_truth_label", # todo now we can use them "detailed_ground_truth_label", @@ -345,7 +349,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: return pred except Exception as e: self.print( - f"Error in detect() while processing " f"\n{given_x_flow}\n{e}" + f"Error in detect() while processing " f"\n{x_flow}\n{e}" ) self.print(traceback.format_exc(), 0, 1) @@ -437,18 +441,16 @@ def pre_main(self): def main(self): if msg := self.get_msg("new_flow"): + # When a new flow arrives msg = json.loads(msg["data"]) - twid = msg["twid"] + self.twid = msg["twid"] + self.profileid = msg["profileid"] self.flow = msg["flow"] - # these fields are expected in testing. update the original + # These following extra fields are expected in testing. update the original # flow dict to have them self.flow.update( { - "allbytes": (self.flow["sbytes"] + self.flow["dbytes"]), - # the flow["state"] is the origstate, we dont need that here - # we need the interpreted state "state": msg["interpreted_state"], - "pkts": self.flow["spkts"] + self.flow["dpkts"], "label": msg["label"], "module_labels": msg["module_labels"], } @@ -461,23 +463,31 @@ def main(self): # Use labeled flows labels = self.db.get_labels() sum_labeled_flows = sum(i[1] for i in labels) + + # The min labels to retrain is the min number of flows + # we should have seen so far in this capture to start training + # This is so we dont _start_ training with only 1 flow + + # Once we are over the start minimum, the second condition is + # to force to retrain every a minimum_labels_to_retrain number + # of flows. So we dont retrain every 1 flow. if ( - sum_labeled_flows >= self.minimum_lables_to_retrain - and sum_labeled_flows % self.minimum_lables_to_retrain == 1 + sum_labeled_flows >= self.minimum_labels_to_start_train ): - # We get here every 'self.minimum_lables_to_retrain' - # amount of labels - # So for example we retrain every 100 labels and only when - # we have at least 100 labels - self.print( - f"Training the model with the last group of " - f"flows and labels. Total flows: {sum_labeled_flows}." - ) - # Process all flows in the DB and make them ready - # for pandas - self.process_flows() - # Train an algorithm - self.train() + if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain): + # So for example we retrain every 50 labels and only when + # we have at least 50 labels + self.print( + f"Training the model with the last group of " + f"flows and labels. Total flows: {sum_labeled_flows}." + ) + # Process all flows in the DB and make them ready + # for pandas + self.process_training_flows() + # Train an algorithm + self.train() + self.last_number_of_flows_when_trained = sum_labeled_flows + elif self.mode == "test": # We are testing, which means using the model to detect processed_flow = self.process_flow(self.flow) @@ -497,8 +507,8 @@ def main(self): # and the label is diff from the prediction, # print in debug mode self.print( - f"Report Prediction {pred[0]} for label" - f' {label} flow {self.flow["saddr"]}:' + f"Predicted {pred[0]} for ground-truth label" + f' {label}. Flow {self.flow["saddr"]}:' f'{self.flow["sport"]} ->' f' {self.flow["daddr"]}:' f'{self.flow["dport"]}/' @@ -506,9 +516,9 @@ def main(self): 0, 3, ) - if pred[0] == "Malware": + if pred[0] == "Malicious": # Generate an alert - self.set_evidence_malicious_flow(self.flow, twid) + self.set_evidence_malicious_flow(self.flow, self.twid) self.print( f"Prediction {pred[0]} for label {label}" f' flow {self.flow["saddr"]}:' From 7a1e10fb8a2e19c8a158e05aa9c9fda0157cdbd6 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Tue, 18 Mar 2025 12:08:29 +0100 Subject: [PATCH 362/455] Fix the profiler handler for cases of nan in state --- slips_files/core/database/redis_db/profile_handler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/slips_files/core/database/redis_db/profile_handler.py b/slips_files/core/database/redis_db/profile_handler.py index a6669c92a9..ab53cc4ab0 100644 --- a/slips_files/core/database/redis_db/profile_handler.py +++ b/slips_files/core/database/redis_db/profile_handler.py @@ -423,6 +423,7 @@ def get_final_state_from_flags(self, state, pkts): return "Established" # For Argus + # In some flows the state is a nan try: suf = state.split("_")[1] except AttributeError: From c76c96344d42a17d3c3e5d51c868abe3896e5d76 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:22:38 +0100 Subject: [PATCH 363/455] flowml. If the dataset has one flow and that is deleted, then return empty fast. --- modules/flowmldetection/flowmldetection.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index e91495d649..58b4ce1e4c 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -135,6 +135,11 @@ def process_features(self, dataset): for proto in to_discard: dataset = dataset[dataset.proto != proto] + # If te proto is in the list to delete and there is only one flow, then the dataset will be empty + if dataset.empty: + # DataFrame is empty now, so return empty + return dataset + # For now, discard these to_drop = [ "appproto", From 74007e82690dbbd14787bd237f37e5507ca62b90 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:23:05 +0100 Subject: [PATCH 364/455] flowml. If the datasert is empty. Return none --- modules/flowmldetection/flowmldetection.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 58b4ce1e4c..4a4d46e376 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -318,6 +318,8 @@ def process_flow(self, flow_to_process: dict): # Convert the flow to a pandas dataframe raw_flow = pd.DataFrame(flow_to_process, index=[0]) dflow = self.process_features(raw_flow) + if dflow.empty: + return None # Update the flow to the processed version return dflow except Exception: From deefde05178f98f7b1ef9ee9c7b54c6b549b0f5b Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:27:16 +0100 Subject: [PATCH 365/455] First new version of the model and scaler. Not good yet, but working. --- modules/flowmldetection/model.bin | Bin 1124 -> 1090 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index a6648cf72179520975b0e9ad1164f7d574e87140..7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f 100644 GIT binary patch delta 130 zcmV-|0Db@D2*L;ifCQB{u>>gtTsw(FScJ$7xocP4|5 zn$$WZz^$}67u&zYKfDP4vYKo~KL^fCQCUu>>gtlhXnvSy-jcOc_je6TteGguV+y7r>O2=ii6N`@aE8 z4q_05T(v)g91VHfmFeIMvRKFpJJ~89v lBES;fQwQSX>_3x<11kt29Z*;7CnUg=yaQDN?(vfo1TSe6JFWl# diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin index 17115724b9536f6093f9d72f3b58a5c22c562a9a..bfba4d107224e5e6e5a1e8c8f4d463b48131d111 100644 GIT binary patch delta 290 zcmV+-0p0%k2KolDvjGBX0h6@>Pgl=^Tj5WVAwXGOhP~3C<3D)c8d<%4*gy{6w?@|U z%s@r+gj&Fu5J2_kY{!Qn^*`?T5Tx7$dq73=gj&E@c0d))6KVlt!ao`IN`KC-h(JOu zU`*oR6_Z;6CRfo>BRz3ectB?@J_D&+&OhD+nN>A{w?PryCGcfmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r-wJEWTtnm`Ac zf^|*Gc0elJHxDoXo Date: Thu, 20 Mar 2025 13:16:06 +0100 Subject: [PATCH 366/455] model and scaler with 1 malicious and 1 benign --- modules/flowmldetection/model.bin | Bin 1090 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f..0fac693b39f8e2f0e826471e72a52010709a2a4a 100644 GIT binary patch delta 132 zcmX@a@q~k=fn{psMix!x$(NZ_BO`RCb>5jSYK$N!g2|oC+8oRs(l>gtTsw(FScJ$7xocP4|5 zn$$WZz^$}67u&zYKfDP4vYKo~KfmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r!tJEWTtnm`Ac zf^|*Gc0elJHxDoXoPgl=^Tj5WVAwXGOhP~3C<3D)c8d<%4*gy{6w?@|U z%s@r+gj&Fu5J2_kY{!Qn^*`?T5Tx7$dq73=gj&E@c0d))6KVlt!ao`IN`KC-h(JOu zU`*oR6_Z;6CRfo>BRz3ectB?@J_D&+&OhD+nN>A{w?PryCGc Date: Thu, 20 Mar 2025 13:16:27 +0100 Subject: [PATCH 367/455] cleaner jupyter --- modules/flowmldetection/flowmldetection.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 4a4d46e376..d8e9ada27c 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -345,6 +345,23 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "ground_truth_label", # todo now we can use them "detailed_ground_truth_label", ] + # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. + # Error + ''' [Flow ML Detection] Error in detect() while processing + dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes + 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 + The feature names should match those that were passed during fit. + Feature names unseen at fit time: + - bytes + ''' + + # IF we delete here the filed bytes the error is + # [Flow ML Detection] Error in detect() while processing + # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes + # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 + # The feature names should match those that were passed during fit. + # Feature names must be in the same order as they were in fit. + for field in fields_to_drop: try: x_flow = x_flow.drop(field, axis=1) From b558c05d455ee9651e29e7eef3d4045ad1241ade Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Thu, 20 Mar 2025 22:26:27 +0100 Subject: [PATCH 368/455] New models after 3rd ttrain --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 0fac693b39f8e2f0e826471e72a52010709a2a4a..5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8 100644 GIT binary patch delta 99 zcmaFD@q}YTFtfkevYaFSxkd-_3;rCozh!)2lYmpEbEdGvA?N`Qv**dmWkY#6EnbNU>V$pfHc5?(6z~90v14#>uEKZXcoY2J@~Q; zm@vT916v-#h9y9THMMtpw*Eia3Y9($*ABq#7l@Rev)(_(4lO-NtuB+I1CayN#ekDG F1TN2UERg^J diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin index 758909b289238ff282b2e056a9b3e83768b8472a..821344a0c69d116622b02e2a0daa1554cb5d308e 100644 GIT binary patch delta 43 zcmV+`0M!5b2KolDfdU!c4zU>a=p#W80?-ZtN@qVmt(3Za=p#W80?-ZtN@qVmt(3Z Date: Wed, 26 Mar 2025 00:08:50 +0100 Subject: [PATCH 369/455] Models after 4th train --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8..3ab5a240bb45f88d026d1d9d1959cfa384e2473b 100644 GIT binary patch delta 120 zcmV-;0EhqN2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv* zMA;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P as{<PghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9 ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}DxjML6cVlBO2|D9RL6T delta 290 zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRg4Lu^9H~BS8=X&<+7gXFor!l)9lJXF)RTPqWf%b3r!tJEWTtnm`Ac zf^|*Gc0elJHxDoXoty4FCWD From 4a448bc3b8ece80ad6b783d0809e6c93ad0c452e Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 26 Mar 2025 08:28:59 +0100 Subject: [PATCH 370/455] Models of ml flow with the first good performance in small tests --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 3ab5a240bb45f88d026d1d9d1959cfa384e2473b..a6648cf72179520975b0e9ad1164f7d574e87140 100644 GIT binary patch delta 121 zcmV-<0EYkM2;>N`Qv+C~&P*9hb`!w*mV~|wLl?l5mFM4w$NRqlOAcXmpyfaAG1(); zYA3+ulRv$a;~zkIU#E>ocI-ba#L|>%Hv~ZN`Y?b6^limYW1McwvlQsk{8#y@u delta 121 zcmV-<0EYkM2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv* zMA;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P bs{<fmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r-wJEWTtnm`Ac zf^|*Gc0elJHxDoXoPghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9 ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}DxjML6cVlBO2|D9RL6T From a2b5b9917a802f3810fa3c7b4719e69dfbb1b37c Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 371/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 319 +++++++++++++-------- 1 file changed, 207 insertions(+), 112 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index d8e9ada27c..1fa77de01c 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -10,6 +10,7 @@ import json import traceback import warnings +import sys from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils @@ -55,12 +56,8 @@ def init(self): # Set the output queue of our database instance # Read the configuration self.read_configuration() - # Minum amount of new labels needed to start the train - self.minimum_labels_to_start_train = 50 - # Minum amount of new labels needed to retrain - self.minimum_labels_to_retrain = 50 - # The number of flows when last trained - self.last_number_of_flows_when_trained = 0 + # Minum amount of new lables needed to trigger the train + self.minimum_lables_to_retrain = 50 # To plot the scores of training # self.scores = [] # The scaler trained during training and to use during testing @@ -71,25 +68,26 @@ def init(self): def read_configuration(self): conf = ConfigParser() self.mode = conf.get_ml_mode() - self.label = conf.label() def train(self): """ Train a model based on the flows we receive and the labels """ try: - # Get the flows from the DB - # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid) - # Convert to pandas df - # self.flows = pd.DataFrame(self.flows) - # Process the features - # X_flow = self.process_features(self.flows) + # Process the labels to have only Normal and Malware + self.flows.label = self.flows.label.str.replace( + r"(^.*ormal.*$)", "Normal", regex=True + ) + self.flows.label = self.flows.label.str.replace( + r"(^.*alware.*$)", "Malware", regex=True + ) + self.flows.label = self.flows.label.str.replace( + r"(^.*alicious.*$)", "Malware", regex=True + ) - # Create X_flow with the current flows minus the label + # Separate + y_flow = self.flows["label"] X_flow = self.flows.drop("label", axis=1) - # Create y_flow with the label - y_flow = numpy.full(X_flow.shape[0], self.label) - # Drop the module_labels X_flow = X_flow.drop("module_labels", axis=1) # Normalize this batch of data so far. This can get progressivle slow @@ -98,7 +96,7 @@ def train(self): # Train try: self.clf.partial_fit( - X_flow, y_flow, classes=["Malicious", "Benign"] + X_flow, y_flow, classes=["Malware", "Normal"] ) except Exception: self.print("Error while calling clf.train()") @@ -121,7 +119,142 @@ def train(self): self.store_model() except Exception: - self.print("Error in train().", 0, 1) + self.print("Error in train()", 0, 1) + self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): @@ -135,12 +268,7 @@ def process_features(self, dataset): for proto in to_discard: dataset = dataset[dataset.proto != proto] - # If te proto is in the list to delete and there is only one flow, then the dataset will be empty - if dataset.empty: - # DataFrame is empty now, so return empty - return dataset - - # For now, discard these + # For now, discard the ports to_drop = [ "appproto", "daddr", @@ -152,7 +280,9 @@ def process_features(self, dataset): "history", "uid", "dir_", + "dbytes", "endtime", + "bytes", "flow_source", ] for field in to_drop: @@ -161,16 +291,12 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, - # the state is not transformed to 'Established' or - # 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others # So transform here - dataset["state"] = dataset.apply( - lambda row: self.db.get_final_state_from_flags( - row["state"], (row["spkts"] + row["dpkts"]) - ), - axis=1, - ) + #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) + dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) + + #dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -204,11 +330,7 @@ def process_features(self, dataset): dataset.proto = dataset.proto.str.replace( r"(^.*arp.*$)", "4", regex=True ) - - dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"] - dataset["pkts"] = dataset["spkts"] + dataset["dpkts"] - - fields_to_convert_to_float = [ + fields_to_convert_to_flow = [ dataset.proto, dataset.dport, dataset.sport, @@ -219,10 +341,10 @@ def process_features(self, dataset): dataset.sbytes, dataset.state, ] - for field in fields_to_convert_to_float: + for field in fields_to_convert_to_flow: try: field = field.astype("float64") - except (ValueError, AttributeError): + except ValueError: pass return dataset @@ -231,9 +353,9 @@ def process_features(self, dataset): self.print("Error in process_features()") self.print(traceback.format_exc(), 0, 1) - def process_training_flows(self): + def process_flows(self): """ - Process all the flows in the DB + Process all the flwos in the DB Store the pandas df in self.flows """ try: @@ -249,48 +371,44 @@ def process_training_flows(self): # that are fake but representative of a normal and malware flow # they are only for the training process # At least 1 flow of each label is required - - # These flows should be in the same format as the ones in the DB. - # Which means the satate is still SF, S0, etc. + # self.print(f'Amount of labeled flows: {labels}', 0, 1) flows.append( { - "starttime": 1594417039.029793, + "ts": 1594417039.029793, "dur": "1.9424750804901123", "saddr": "10.7.10.101", "sport": "49733", "daddr": "40.70.224.145", "dport": "443", "proto": "tcp", - "state": "SF", - "spkts": 17, - "dpkts": 27, + "state": "Established", + "allbytes": 42764, + "spkts": 37, "sbytes": 25517, - "dbytes": 17247, "appproto": "ssl", - "label": "Malicious", + "label": "Malware", "module_labels": { - "flowalerts-long-connection": "Malicious" + "flowalerts-long-connection": "Malware" }, } ) flows.append( { - "starttime": 1382355032.706468, + "ts": 1382355032.706468, "dur": "10.896695", "saddr": "147.32.83.52", "sport": "47956", "daddr": "80.242.138.72", "dport": "80", "proto": "tcp", - "state": "SF", + "state": "Established", + "allbytes": 67696, "spkts": 1, - "dpkts": 0, "sbytes": 100, - "dbytes": 67596, "appproto": "http", - "label": "Benign", + "label": "Normal", "module_labels": { - "flowalerts-long-connection": "Benign" + "flowalerts-long-connection": "Normal" }, } ) @@ -318,8 +436,6 @@ def process_flow(self, flow_to_process: dict): # Convert the flow to a pandas dataframe raw_flow = pd.DataFrame(flow_to_process, index=[0]) dflow = self.process_features(raw_flow) - if dflow.empty: - return None # Update the flow to the processed version return dflow except Exception: @@ -333,6 +449,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: and returns the predection array """ try: + given_x_flow = x_flow # clean the flow fields_to_drop = [ "label", @@ -340,28 +457,12 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "uid", "history", "dir_", + "dbytes", + "dpkts", "endtime", + "bytes", "flow_source", - "ground_truth_label", # todo now we can use them - "detailed_ground_truth_label", ] - # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. - # Error - ''' [Flow ML Detection] Error in detect() while processing - dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes - 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 - The feature names should match those that were passed during fit. - Feature names unseen at fit time: - - bytes - ''' - - # IF we delete here the filed bytes the error is - # [Flow ML Detection] Error in detect() while processing - # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes - # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 - # The feature names should match those that were passed during fit. - # Feature names must be in the same order as they were in fit. - for field in fields_to_drop: try: x_flow = x_flow.drop(field, axis=1) @@ -373,7 +474,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: return pred except Exception as e: self.print( - f"Error in detect() while processing " f"\n{x_flow}\n{e}" + f"Error in detect() while processing " f"\n{given_x_flow}\n{e}" ) self.print(traceback.format_exc(), 0, 1) @@ -465,16 +566,18 @@ def pre_main(self): def main(self): if msg := self.get_msg("new_flow"): - # When a new flow arrives msg = json.loads(msg["data"]) - self.twid = msg["twid"] - self.profileid = msg["profileid"] + twid = msg["twid"] self.flow = msg["flow"] - # These following extra fields are expected in testing. update the original + # these fields are expected in testing. update the original # flow dict to have them self.flow.update( { + "allbytes": (self.flow["sbytes"] + self.flow["dbytes"]), + # the flow["state"] is the origstate, we dont need that here + # we need the interpreted state "state": msg["interpreted_state"], + "pkts": self.flow["spkts"] + self.flow["dpkts"], "label": msg["label"], "module_labels": msg["module_labels"], } @@ -487,31 +590,23 @@ def main(self): # Use labeled flows labels = self.db.get_labels() sum_labeled_flows = sum(i[1] for i in labels) - - # The min labels to retrain is the min number of flows - # we should have seen so far in this capture to start training - # This is so we dont _start_ training with only 1 flow - - # Once we are over the start minimum, the second condition is - # to force to retrain every a minimum_labels_to_retrain number - # of flows. So we dont retrain every 1 flow. if ( - sum_labeled_flows >= self.minimum_labels_to_start_train + sum_labeled_flows >= self.minimum_lables_to_retrain + and sum_labeled_flows % self.minimum_lables_to_retrain == 1 ): - if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain): - # So for example we retrain every 50 labels and only when - # we have at least 50 labels - self.print( - f"Training the model with the last group of " - f"flows and labels. Total flows: {sum_labeled_flows}." - ) - # Process all flows in the DB and make them ready - # for pandas - self.process_training_flows() - # Train an algorithm - self.train() - self.last_number_of_flows_when_trained = sum_labeled_flows - + # We get here every 'self.minimum_lables_to_retrain' + # amount of labels + # So for example we retrain every 100 labels and only when + # we have at least 100 labels + self.print( + f"Training the model with the last group of " + f"flows and labels. Total flows: {sum_labeled_flows}." + ) + # Process all flows in the DB and make them ready + # for pandas + self.process_flows() + # Train an algorithm + self.train() elif self.mode == "test": # We are testing, which means using the model to detect processed_flow = self.process_flow(self.flow) @@ -531,8 +626,8 @@ def main(self): # and the label is diff from the prediction, # print in debug mode self.print( - f"Predicted {pred[0]} for ground-truth label" - f' {label}. Flow {self.flow["saddr"]}:' + f"Report Prediction {pred[0]} for label" + f' {label} flow {self.flow["saddr"]}:' f'{self.flow["sport"]} ->' f' {self.flow["daddr"]}:' f'{self.flow["dport"]}/' @@ -540,9 +635,9 @@ def main(self): 0, 3, ) - if pred[0] == "Malicious": + if pred[0] == "Malware": # Generate an alert - self.set_evidence_malicious_flow(self.flow, self.twid) + self.set_evidence_malicious_flow(self.flow, twid) self.print( f"Prediction {pred[0]} for label {label}" f' flow {self.flow["saddr"]}:' From 5df2e70c0ea96004493eca3423768d6ab4347cab Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:35:21 +0300 Subject: [PATCH 372/455] state_handler: split get_final_state_from_flags() into smaller functions --- slips_files/common/state_handler.py | 179 ++++++++++++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 slips_files/common/state_handler.py diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py new file mode 100644 index 0000000000..b671a09a28 --- /dev/null +++ b/slips_files/common/state_handler.py @@ -0,0 +1,179 @@ +from typing import Optional +import sys +import traceback + + +def check_suricata_states(state) -> Optional[str]: + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for + UDP. For TCP, + these are: New, Established and Closed,for UDP only new and + established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + +def check_zeek_states(state) -> Optional[str]: + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + +def check_argus_states(state) -> Optional[str]: + pre = state.split("_")[0] + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + + +def check_tcp_states(state, pkts) -> Optional[str]: + pre = state.split("_")[0] + if "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. + # Most connections are reseted when finished and therefore are + # established + # It can happen that is reseted being not established, but we + # can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is + # not established because the OS retries 3 times. + return "Not Established" if int(pkts) <= 3 else "Established" + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. + # Most connections are finished with FIN when finished and + # therefore are established + # It can happen that is finished being not established, but we + # can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is + # not established because the OS retries 3 times. + return "Not Established" if int(pkts) <= 3 else "Established" + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + + +def check_udp_states(state) -> Optional[str]: + pre = state.split("_")[0] + if "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also + # NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + + +def check_icmp_states(state) -> Optional[str]: + pre = state.split("_")[0] + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + + +def get_final_state_from_flags(self, state, pkts) -> str: + """ + Analyze the flags given and return a summary of the state. + Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + if state := check_suricata_states(state): + return state + if state := check_zeek_states(state): + return state + if state := check_argus_states(state): + return state + except IndexError: + # suf does not exist, which means that this is some ICMP or + # no response was sent for UDP or TCP + if state := check_icmp_states(state): + return state + if state := check_udp_states(state): + return state + if state := check_tcp_states(state, pkts): + return state + + return "Not Established" + + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() " f"line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) From 92316cf2520fa980dcc14d808a1393e7e0968eb5 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:48:16 +0300 Subject: [PATCH 373/455] state_handler: refactor get_final_state_from_flags() --- slips_files/common/state_handler.py | 67 +++++++++++++---------------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index b671a09a28..d0a05115bd 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,9 +1,7 @@ from typing import Optional -import sys -import traceback -def check_suricata_states(state) -> Optional[str]: +def interpret_suricata_states(state) -> Optional[str]: """ There are different states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]: return "Not Established" -def check_zeek_states(state) -> Optional[str]: +def interpret_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. # For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]: return "Established" -def check_argus_states(state) -> Optional[str]: +def interpret_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - suf = state.split("_")[1] + try: + suf = state.split("_")[1] + except IndexError: + return + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]: return "Not Established" -def check_tcp_states(state, pkts) -> Optional[str]: +def interpret_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def check_udp_states(state) -> Optional[str]: +def interpret_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]: return "Not Established" -def check_icmp_states(state) -> Optional[str]: +def interpret_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(self, state, pkts) -> str: +def get_final_state_from_flags(state, pkts) -> str: """ - Analyze the flags given and return a summary of the state. - Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections + Converts the original flags from the flow, to a state that slips + understands + Works with Argus, suricata, and Bro flags + We receive the packets to distinguish some Reset connections """ - try: - if state := check_suricata_states(state): - return state - if state := check_zeek_states(state): - return state - if state := check_argus_states(state): - return state - except IndexError: - # suf does not exist, which means that this is some ICMP or - # no response was sent for UDP or TCP - if state := check_icmp_states(state): - return state - if state := check_udp_states(state): - return state - if state := check_tcp_states(state, pkts): - return state - return "Not Established" + for interpreter in ( + interpret_suricata_states, + interpret_zeek_states, + interpret_argus_states, + interpret_icmp_states, + interpret_udp_states, + ): + if interpreted_state := interpreter(state): + return interpreted_state + + if interpreted_state := interpret_tcp_states(state, pkts): + return interpreted_state - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() " f"line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) + return "Not Established" From eb778265b8d6f98c27489081a478a2b0ae744da0 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 374/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 150 ++---------------- slips_files/core/database/database_manager.py | 3 - 2 files changed, 10 insertions(+), 143 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 1fa77de01c..0e7c4b78e5 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -291,12 +156,17 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others # So transform here - #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) - dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) - - #dataset.state = new_state_column + dataset["state"] = dataset.apply( + lambda row: get_final_state_from_flags( + row["state"], row["pkts"] + ), + axis=1, + ) + # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index 1d339685f8..568e78ff45 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -613,9 +613,6 @@ def add_out_dns(self, *args, **kwargs): def add_port(self, *args, **kwargs): return self.rdb.add_port(*args, **kwargs) - def get_final_state_from_flags(self, *args, **kwargs): - return self.rdb.get_final_state_from_flags(*args, **kwargs) - def add_ips(self, *args, **kwargs): return self.rdb.add_ips(*args, **kwargs) From 28d2199e094edbaab33620c6cd8c56252d67c0be Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 375/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 0e7c4b78e5..19e829e11b 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From cbe80f8e80d05d147a1e54544f01ee4b2ab18cab Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 376/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 19e829e11b..0e7c4b78e5 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From aa68a909bb8309e70b15ca70958076a368dbe0c7 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 377/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 0e7c4b78e5..19e829e11b 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From aee1e13912d8bf414b5f924e6684187b7c114a68 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 378/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 19e829e11b..0e7c4b78e5 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From fc14125fe16615de2e29e40fc98e215bd4648bbd Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 379/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 0e7c4b78e5..19e829e11b 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 9c95c76b54f429f9eaf2c8035d60b98f5bf8dffe Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 380/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 19e829e11b..0e7c4b78e5 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 1b20f2ab937725762ca307dee70a3cb517d8d579 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 381/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 165 +++++++++++++++++++-- 1 file changed, 150 insertions(+), 15 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 0e7c4b78e5..c8226368c7 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -8,6 +8,7 @@ import pickle import pandas as pd import json +import datetime import traceback import warnings import sys @@ -121,6 +122,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -133,7 +269,7 @@ def process_features(self, dataset): for proto in to_discard: dataset = dataset[dataset.proto != proto] - # For now, discard the ports + # For now, discard these to_drop = [ "appproto", "daddr", @@ -156,17 +292,12 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, - # the state is not transformed to 'Established' or - # 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others # So transform here - dataset["state"] = dataset.apply( - lambda row: get_final_state_from_flags( - row["state"], row["pkts"] - ), - axis=1, - ) - # dataset.state = new_state_column + #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) + dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) + + #dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -393,21 +524,25 @@ def read_model(self): def set_evidence_malicious_flow(self, flow: dict, twid: str): confidence: float = 0.1 description = ( - f"Flow with malicious characteristics by ML. Src IP" + f"Malicious flow by ML. Src IP" f" {flow['saddr']}:{flow['sport']} to " f"{flow['daddr']}:{flow['dport']}" ) + + timestamp = utils.convert_format( + datetime.datetime.now(), utils.alerts_format + ) twid_number = int(twid.replace("timewindow", "")) evidence: Evidence = Evidence( evidence_type=EvidenceType.MALICIOUS_FLOW, attacker=Attacker( direction=Direction.SRC, - ioc_type=IoCType.IP, + attacker_type=IoCType.IP, value=flow["saddr"], ), victim=Victim( direction=Direction.DST, - ioc_type=IoCType.IP, + victim_type=IoCType.IP, value=flow["daddr"], ), threat_level=ThreatLevel.LOW, @@ -416,7 +551,7 @@ def set_evidence_malicious_flow(self, flow: dict, twid: str): profile=ProfileID(ip=flow["saddr"]), timewindow=TimeWindow(twid_number), uid=[flow["uid"]], - timestamp=flow["starttime"], + timestamp=timestamp, method=Method.AI, src_port=flow["sport"], dst_port=flow["dport"], From 2b9ed84a6a2bdbe9a2ec8a109da92df4d627b994 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:35:21 +0300 Subject: [PATCH 382/455] state_handler: split get_final_state_from_flags() into smaller functions --- slips_files/common/state_handler.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index d0a05115bd..43d9b5461e 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,4 +1,6 @@ from typing import Optional +import sys +import traceback def interpret_suricata_states(state) -> Optional[str]: From 736cf0b76411e510c34b586f644895cbf9250e75 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:48:16 +0300 Subject: [PATCH 383/455] state_handler: refactor get_final_state_from_flags() --- slips_files/common/state_handler.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index 43d9b5461e..d0a05115bd 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,6 +1,4 @@ from typing import Optional -import sys -import traceback def interpret_suricata_states(state) -> Optional[str]: From 2b576c42258e49f2bdcc008964e04e35b7aeb972 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 384/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 150 ++------------------- 1 file changed, 10 insertions(+), 140 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index c8226368c7..9af514a709 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -122,141 +122,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -292,12 +157,17 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others # So transform here - #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) - dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) - - #dataset.state = new_state_column + dataset["state"] = dataset.apply( + lambda row: get_final_state_from_flags( + row["state"], row["pkts"] + ), + axis=1, + ) + # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( From 47d05a060ed6f78fb47892d9756998e775e05b94 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 385/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 9af514a709..94eb27afdf 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -122,6 +122,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From e197df04e3e44f4318289706ede7a3483ec7feb2 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 386/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 94eb27afdf..9af514a709 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -122,141 +122,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From d95f4c938e6fdf0ca5bf7ccd607cfb71e2a34c34 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 387/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 150 +++++++++++++++++++-- 1 file changed, 140 insertions(+), 10 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 9af514a709..c8226368c7 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -122,6 +122,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -157,17 +292,12 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, - # the state is not transformed to 'Established' or - # 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others # So transform here - dataset["state"] = dataset.apply( - lambda row: get_final_state_from_flags( - row["state"], row["pkts"] - ), - axis=1, - ) - # dataset.state = new_state_column + #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) + dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) + + #dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( From c9d2395cd1bfd3f19b1ec80bbde1a6b322e866f5 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 388/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 150 ++------------------- 1 file changed, 10 insertions(+), 140 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index c8226368c7..9af514a709 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -122,141 +122,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -292,12 +157,17 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others # So transform here - #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) - dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) - - #dataset.state = new_state_column + dataset["state"] = dataset.apply( + lambda row: get_final_state_from_flags( + row["state"], row["pkts"] + ), + axis=1, + ) + # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( From f6de6fe7db854dcd9ee932e602b7d15af93f80cd Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 389/455] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 438 +++++++++++++-------- 1 file changed, 278 insertions(+), 160 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 9af514a709..124ec61f91 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -1,8 +1,3 @@ -# SPDX-FileCopyrightText: 2021 Sebastian Garcia -from typing import Optional - -# SPDX-License-Identifier: GPL-2.0-only -import numpy from sklearn.linear_model import SGDClassifier from sklearn.preprocessing import StandardScaler import pickle @@ -10,13 +5,10 @@ import json import datetime import traceback -import warnings import sys -from slips_files.common.parsers.config_parser import ConfigParser -from slips_files.common.slips_utils import utils -from slips_files.common.abstracts.module import IModule -from slips_files.core.structures.evidence import ( +from slips_files.common.imports import * +from slips_files.core.evidence_structure.evidence import ( Evidence, ProfileID, TimeWindow, @@ -25,8 +17,7 @@ EvidenceType, IoCType, Direction, - Victim, - Method, + IDEACategory, ) # Only for debbuging @@ -38,6 +29,8 @@ def warn(*args, **kwargs): pass +import warnings + warnings.warn = warn @@ -63,8 +56,6 @@ def init(self): # self.scores = [] # The scaler trained during training and to use during testing self.scaler = StandardScaler() - self.model_path = "./modules/flowmldetection/model.bin" - self.scaler_path = "./modules/flowmldetection/scaler.bin" def read_configuration(self): conf = ConfigParser() @@ -122,6 +113,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -130,7 +256,7 @@ def process_features(self, dataset): """ try: # Discard some type of flows that dont have ports - to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""] + to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp"] for proto in to_discard: dataset = dataset[dataset.proto != proto] @@ -139,35 +265,28 @@ def process_features(self, dataset): "appproto", "daddr", "saddr", - "starttime", + "ts", + "origstate", "type_", - "smac", - "dmac", - "history", - "uid", "dir_", + "history", "dbytes", - "endtime", - "bytes", - "flow_source", + "dpkts", + "smac", + "dmac", ] for field in to_drop: try: dataset = dataset.drop(field, axis=1) - except (ValueError, KeyError): + except ValueError: pass - # When flows are read from Slips sqlite, - # the state is not transformed to 'Established' or - # 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others # So transform here - dataset["state"] = dataset.apply( - lambda row: get_final_state_from_flags( - row["state"], row["pkts"] - ), - axis=1, - ) - # dataset.state = new_state_column + #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) + dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) + + #dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -201,23 +320,42 @@ def process_features(self, dataset): dataset.proto = dataset.proto.str.replace( r"(^.*arp.*$)", "4", regex=True ) - fields_to_convert_to_flow = [ - dataset.proto, - dataset.dport, - dataset.sport, - dataset.dur, - dataset.pkts, - dataset.spkts, - dataset.allbytes, - dataset.sbytes, - dataset.state, - ] - for field in fields_to_convert_to_flow: - try: - field = field.astype("float64") - except ValueError: - pass - + dataset.proto = dataset.proto.astype("float64") + try: + # Convert dport to float + dataset.dport = dataset.dport.astype("float") + except ValueError: + pass + try: + # Convert sport to float + dataset.sport = dataset.sport.astype("float") + except ValueError: + pass + try: + # Convert Dur to float + dataset.dur = dataset.dur.astype("float") + except ValueError: + pass + try: + # Convert TotPkts to float + dataset.pkts = dataset.pkts.astype("float") + except ValueError: + pass + try: + # Convert SrcPkts to float + dataset.spkts = dataset.spkts.astype("float") + except ValueError: + pass + try: + # Convert TotBytes to float + dataset.allbytes = dataset.allbytes.astype("float") + except ValueError: + pass + try: + # Convert SrcBytes to float + dataset.sbytes = dataset.sbytes.astype("float") + except ValueError: + pass return dataset except Exception: # Stop the timer @@ -233,6 +371,7 @@ def process_flows(self): # We get all the flows so far # because this retraining happens in batches flows = self.db.get_all_flows() + # Check how many different labels are in the DB # We need both normal and malware labels = self.db.get_labels() @@ -252,7 +391,9 @@ def process_flows(self): "daddr": "40.70.224.145", "dport": "443", "proto": "tcp", + "origstate": "SRPA_SPA", "state": "Established", + "pkts": 84, "allbytes": 42764, "spkts": 37, "sbytes": 25517, @@ -272,7 +413,9 @@ def process_flows(self): "daddr": "80.242.138.72", "dport": "80", "proto": "tcp", + "origstate": "SRPA_SPA", "state": "Established", + "pkts": 67, "allbytes": 67696, "spkts": 1, "sbytes": 100, @@ -298,55 +441,42 @@ def process_flows(self): self.print("Error in process_flows()") self.print(traceback.format_exc(), 0, 1) - def process_flow(self, flow_to_process: dict): + def process_flow(self): """ Process one flow. Only used during detection in testing - returns the pandas df with the processed flow + Store the pandas df in self.flow """ try: # Convert the flow to a pandas dataframe - raw_flow = pd.DataFrame(flow_to_process, index=[0]) + raw_flow = pd.DataFrame(self.flow_dict, index=[0]) + # Process features dflow = self.process_features(raw_flow) # Update the flow to the processed version - return dflow + self.flow = dflow except Exception: # Stop the timer self.print("Error in process_flow()") self.print(traceback.format_exc(), 0, 1) - def detect(self, x_flow) -> Optional[numpy.ndarray]: + def detect(self): """ - Detects the given flow with the current model stored - and returns the predection array + Detect this flow with the current model stored """ try: - given_x_flow = x_flow - # clean the flow - fields_to_drop = [ - "label", - "module_labels", - "uid", - "history", - "dir_", - "dbytes", - "dpkts", - "endtime", - "bytes", - "flow_source", - ] - for field in fields_to_drop: - try: - x_flow = x_flow.drop(field, axis=1) - except (KeyError, ValueError): - pass + # Store the real label if there is one + y_flow = self.flow["label"] + # remove the real label column + self.flow = self.flow.drop("label", axis=1) + # remove the label predictions column of the other modules + X_flow = self.flow.drop("module_labels", axis=1) # Scale the flow - x_flow: numpy.ndarray = self.scaler.transform(x_flow) - pred: numpy.ndarray = self.clf.predict(x_flow) + X_flow = self.scaler.transform(X_flow) + pred = self.clf.predict(X_flow) return pred - except Exception as e: - self.print( - f"Error in detect() while processing " f"\n{given_x_flow}\n{e}" - ) + except Exception: + # Stop the timer + self.print("Error in detect() X_flow:") + self.print(X_flow) self.print(traceback.format_exc(), 0, 1) def store_model(self): @@ -354,10 +484,10 @@ def store_model(self): Store the trained model on disk """ self.print("Storing the trained model and scaler on disk.", 0, 2) - with open(self.model_path, "wb") as f: + with open("./modules/flowmldetection/model.bin", "wb") as f: data = pickle.dumps(self.clf) f.write(data) - with open(self.scaler_path, "wb") as g: + with open("./modules/flowmldetection/scaler.bin", "wb") as g: data = pickle.dumps(self.scaler) g.write(data) @@ -367,23 +497,20 @@ def read_model(self): """ try: self.print("Reading the trained model from disk.", 0, 2) - with open(self.model_path, "rb") as f: + with open("./modules/flowmldetection/model.bin", "rb") as f: self.clf = pickle.load(f) self.print("Reading the trained scaler from disk.", 0, 2) - with open(self.scaler_path, "rb") as g: + with open("./modules/flowmldetection/scaler.bin", "rb") as g: self.scaler = pickle.load(g) except FileNotFoundError: # If there is no model, create one empty - self.print( - "There was no model. " "Creating a new empty model.", 0, 2 - ) + self.print("There was no model. Creating a new empty model.", 0, 2) self.clf = SGDClassifier( warm_start=True, loss="hinge", penalty="l1" ) except EOFError: self.print( - "Error reading model from disk. " - "Creating a new empty model.", + "Error reading model from disk. Creating a new empty model.", 0, 2, ) @@ -391,40 +518,39 @@ def read_model(self): warm_start=True, loss="hinge", penalty="l1" ) - def set_evidence_malicious_flow(self, flow: dict, twid: str): + def set_evidence_malicious_flow( + self, + saddr: str, + sport: str, + daddr: str, + dport: str, + twid: str, + uid: str, + ): confidence: float = 0.1 + ip_identification = self.db.get_ip_identification(daddr) description = ( - f"Malicious flow by ML. Src IP" - f" {flow['saddr']}:{flow['sport']} to " - f"{flow['daddr']}:{flow['dport']}" + f"Malicious flow by ML. Src IP {saddr}:{sport} to " + f"{daddr}:{dport} {ip_identification}" ) timestamp = utils.convert_format( datetime.datetime.now(), utils.alerts_format ) - twid_number = int(twid.replace("timewindow", "")) + evidence: Evidence = Evidence( evidence_type=EvidenceType.MALICIOUS_FLOW, attacker=Attacker( - direction=Direction.SRC, - attacker_type=IoCType.IP, - value=flow["saddr"], - ), - victim=Victim( - direction=Direction.DST, - victim_type=IoCType.IP, - value=flow["daddr"], + direction=Direction.SRC, attacker_type=IoCType.IP, value=saddr ), threat_level=ThreatLevel.LOW, confidence=confidence, description=description, - profile=ProfileID(ip=flow["saddr"]), - timewindow=TimeWindow(twid_number), - uid=[flow["uid"]], + profile=ProfileID(ip=saddr), + timewindow=TimeWindow(number=int(twid.replace("timewindow", ""))), + uid=[uid], timestamp=timestamp, - method=Method.AI, - src_port=flow["sport"], - dst_port=flow["dport"], + category=IDEACategory.ANOMALY_TRAFFIC, ) self.db.set_evidence(evidence) @@ -441,22 +567,20 @@ def pre_main(self): def main(self): if msg := self.get_msg("new_flow"): - msg = json.loads(msg["data"]) - twid = msg["twid"] - self.flow = msg["flow"] - # these fields are expected in testing. update the original - # flow dict to have them - self.flow.update( - { - "allbytes": (self.flow["sbytes"] + self.flow["dbytes"]), - # the flow["state"] is the origstate, we dont need that here - # we need the interpreted state - "state": msg["interpreted_state"], - "pkts": self.flow["spkts"] + self.flow["dpkts"], - "label": msg["label"], - "module_labels": msg["module_labels"], - } - ) + data = msg["data"] + # Convert from json to dict + data = json.loads(data) + profileid = data["profileid"] + twid = data["twid"] + # Get flow that is now in json format + flow = data["flow"] + # Convert flow to a dict + flow = json.loads(flow) + # Convert the common fields to something that can + # be interpreted + # Get the uid which is the key + uid = next(iter(flow)) + self.flow_dict = json.loads(flow[uid]) if self.mode == "train": # We are training @@ -469,57 +593,51 @@ def main(self): sum_labeled_flows >= self.minimum_lables_to_retrain and sum_labeled_flows % self.minimum_lables_to_retrain == 1 ): - # We get here every 'self.minimum_lables_to_retrain' - # amount of labels - # So for example we retrain every 100 labels and only when - # we have at least 100 labels + # We get here every 'self.minimum_lables_to_retrain' amount of labels + # So for example we retrain every 100 labels and only when we have at least 100 labels self.print( - f"Training the model with the last group of " - f"flows and labels. Total flows: {sum_labeled_flows}." + f"Training the model with the last group of flows and labels. Total flows: {sum_labeled_flows}." ) - # Process all flows in the DB and make them ready - # for pandas + # Process all flows in the DB and make them ready for pandas self.process_flows() # Train an algorithm self.train() elif self.mode == "test": # We are testing, which means using the model to detect - processed_flow = self.process_flow(self.flow) + self.process_flow() - # After processing the flow, it may happen that we - # delete icmp/arp/etc so the dataframe can be empty - if processed_flow is not None and not processed_flow.empty: + # After processing the flow, it may happen that we delete icmp/arp/etc + # so the dataframe can be empty + if self.flow is not None and not self.flow.empty: # Predict - pred: numpy.ndarray = self.detect(processed_flow) - if not pred: - # an error occurred - return + pred = self.detect() + label = self.flow_dict["label"] - label = self.flow["label"] + # Report if label and label != "unknown" and label != pred[0]: - # If the user specified a label in test mode, - # and the label is diff from the prediction, - # print in debug mode + # If the user specified a label in test mode, and the label + # is diff from the prediction, print in debug mode self.print( - f"Report Prediction {pred[0]} for label" - f' {label} flow {self.flow["saddr"]}:' - f'{self.flow["sport"]} ->' - f' {self.flow["daddr"]}:' - f'{self.flow["dport"]}/' - f'{self.flow["proto"]}', + f'Report Prediction {pred[0]} for label {label} flow {self.flow_dict["saddr"]}:' + f'{self.flow_dict["sport"]} -> {self.flow_dict["daddr"]}:' + f'{self.flow_dict["dport"]}/{self.flow_dict["proto"]}', 0, 3, ) if pred[0] == "Malware": # Generate an alert - self.set_evidence_malicious_flow(self.flow, twid) + self.set_evidence_malicious_flow( + self.flow_dict["saddr"], + self.flow_dict["sport"], + self.flow_dict["daddr"], + self.flow_dict["dport"], + twid, + uid, + ) self.print( - f"Prediction {pred[0]} for label {label}" - f' flow {self.flow["saddr"]}:' - f'{self.flow["sport"]} -> ' - f'{self.flow["daddr"]}:' - f'{self.flow["dport"]}/' - f'{self.flow["proto"]}', + f'Prediction {pred[0]} for label {label} flow {self.flow_dict["saddr"]}:' + f'{self.flow_dict["sport"]} -> {self.flow_dict["daddr"]}:' + f'{self.flow_dict["dport"]}/{self.flow_dict["proto"]}', 0, 2, ) From 1b46d82aa527373f28ad89932d12fbf7775a8561 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 390/455] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 169 +++------------------ 1 file changed, 19 insertions(+), 150 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 124ec61f91..c57a7a3581 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -5,9 +5,13 @@ import json import datetime import traceback -import sys +import warnings + -from slips_files.common.imports import * +from slips_files.common.state_handler import get_final_state_from_flags +from slips_files.common.parsers.config_parser import ConfigParser +from slips_files.common.slips_utils import utils +from slips_files.common.abstracts.module import IModule from slips_files.core.evidence_structure.evidence import ( Evidence, ProfileID, @@ -29,8 +33,6 @@ def warn(*args, **kwargs): pass -import warnings - warnings.warn = warn @@ -113,141 +115,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -281,12 +148,17 @@ def process_features(self, dataset): except ValueError: pass - # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others # So transform here - #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) - dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) - - #dataset.state = new_state_column + dataset["state"] = dataset.apply( + lambda row: get_final_state_from_flags( + row["state"], row["pkts"] + ), + axis=1, + ) + # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -370,7 +242,7 @@ def process_flows(self): try: # We get all the flows so far # because this retraining happens in batches - flows = self.db.get_all_flows() + flows: list = self.db.get_all_flows() # Check how many different labels are in the DB # We need both normal and malware @@ -464,7 +336,7 @@ def detect(self): """ try: # Store the real label if there is one - y_flow = self.flow["label"] + # y_flow = self.flow["label"] # remove the real label column self.flow = self.flow.drop("label", axis=1) # remove the label predictions column of the other modules @@ -568,13 +440,10 @@ def pre_main(self): def main(self): if msg := self.get_msg("new_flow"): data = msg["data"] - # Convert from json to dict data = json.loads(data) - profileid = data["profileid"] + # profileid = data["profileid"] twid = data["twid"] - # Get flow that is now in json format flow = data["flow"] - # Convert flow to a dict flow = json.loads(flow) # Convert the common fields to something that can # be interpreted From 299d2ab8fd04e70a3a7b4f9bc287a3a642faf542 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:36:55 +0200 Subject: [PATCH 391/455] mlflow. Ignore UID column --- modules/flowmldetection/flowmldetection.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index c57a7a3581..e2aa1e0ee3 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -141,6 +141,7 @@ def process_features(self, dataset): "dpkts", "smac", "dmac", + "uid", ] for field in to_drop: try: From 06bbbcfd5bdbefc4da9940c62949a5178fe58209 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:23:29 +0100 Subject: [PATCH 392/455] Re add function that alya added --- slips_files/core/database/database_manager.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index 568e78ff45..1d339685f8 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -613,6 +613,9 @@ def add_out_dns(self, *args, **kwargs): def add_port(self, *args, **kwargs): return self.rdb.add_port(*args, **kwargs) + def get_final_state_from_flags(self, *args, **kwargs): + return self.rdb.get_final_state_from_flags(*args, **kwargs) + def add_ips(self, *args, **kwargs): return self.rdb.add_ips(*args, **kwargs) From 98e29a6c43277e0577924a1d8c130f300c3cdca2 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:27:23 +0100 Subject: [PATCH 393/455] Delete file that was deleted from develop --- slips_files/common/state_handler.py | 170 ---------------------------- 1 file changed, 170 deletions(-) delete mode 100644 slips_files/common/state_handler.py diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py deleted file mode 100644 index d0a05115bd..0000000000 --- a/slips_files/common/state_handler.py +++ /dev/null @@ -1,170 +0,0 @@ -from typing import Optional - - -def interpret_suricata_states(state) -> Optional[str]: - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for - UDP. For TCP, - these are: New, Established and Closed,for UDP only new and - established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - -def interpret_zeek_states(state) -> Optional[str]: - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - -def interpret_argus_states(state) -> Optional[str]: - pre = state.split("_")[0] - try: - suf = state.split("_")[1] - except IndexError: - return - - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - - -def interpret_tcp_states(state, pkts) -> Optional[str]: - pre = state.split("_")[0] - if "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. - # Most connections are reseted when finished and therefore are - # established - # It can happen that is reseted being not established, but we - # can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is - # not established because the OS retries 3 times. - return "Not Established" if int(pkts) <= 3 else "Established" - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. - # Most connections are finished with FIN when finished and - # therefore are established - # It can happen that is finished being not established, but we - # can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is - # not established because the OS retries 3 times. - return "Not Established" if int(pkts) <= 3 else "Established" - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - - -def interpret_udp_states(state) -> Optional[str]: - pre = state.split("_")[0] - if "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also - # NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - - -def interpret_icmp_states(state) -> Optional[str]: - pre = state.split("_")[0] - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - - -def get_final_state_from_flags(state, pkts) -> str: - """ - Converts the original flags from the flow, to a state that slips - understands - Works with Argus, suricata, and Bro flags - We receive the packets to distinguish some Reset connections - """ - - for interpreter in ( - interpret_suricata_states, - interpret_zeek_states, - interpret_argus_states, - interpret_icmp_states, - interpret_udp_states, - ): - if interpreted_state := interpreter(state): - return interpreted_state - - if interpreted_state := interpret_tcp_states(state, pkts): - return interpreted_state - - return "Not Established" From 045947ffdfb935b57f705baba86df81216eef573 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:32:01 +0100 Subject: [PATCH 394/455] Flowmldetection. Fix missing db reference --- modules/flowmldetection/flowmldetection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index e2aa1e0ee3..9269b67012 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -154,7 +154,7 @@ def process_features(self, dataset): # 'Not Established', it is still 'S0' and others # So transform here dataset["state"] = dataset.apply( - lambda row: get_final_state_from_flags( + lambda row: self.db.get_final_state_from_flags( row["state"], row["pkts"] ), axis=1, From e793c517a247a98ea25d278c35f38c9e16c8772d Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Tue, 18 Mar 2025 12:08:08 +0100 Subject: [PATCH 395/455] Fix the training of flows with ML in new version --- modules/flowmldetection/flowmldetection.py | 378 +++++++++++---------- 1 file changed, 197 insertions(+), 181 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 9269b67012..1cfbaf925d 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -1,18 +1,20 @@ +# SPDX-FileCopyrightText: 2021 Sebastian Garcia +from typing import Optional + +# SPDX-License-Identifier: GPL-2.0-only +import numpy from sklearn.linear_model import SGDClassifier from sklearn.preprocessing import StandardScaler import pickle import pandas as pd import json -import datetime import traceback import warnings - -from slips_files.common.state_handler import get_final_state_from_flags from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils from slips_files.common.abstracts.module import IModule -from slips_files.core.evidence_structure.evidence import ( +from slips_files.core.structures.evidence import ( Evidence, ProfileID, TimeWindow, @@ -21,7 +23,8 @@ EvidenceType, IoCType, Direction, - IDEACategory, + Victim, + Method, ) # Only for debbuging @@ -52,36 +55,41 @@ def init(self): # Set the output queue of our database instance # Read the configuration self.read_configuration() - # Minum amount of new lables needed to trigger the train - self.minimum_lables_to_retrain = 50 + # Minum amount of new labels needed to start the train + self.minimum_labels_to_start_train = 50 + # Minum amount of new labels needed to retrain + self.minimum_labels_to_retrain = 50 + # The number of flows when last trained + self.last_number_of_flows_when_trained = 0 # To plot the scores of training # self.scores = [] # The scaler trained during training and to use during testing self.scaler = StandardScaler() + self.model_path = "./modules/flowmldetection/model.bin" + self.scaler_path = "./modules/flowmldetection/scaler.bin" def read_configuration(self): conf = ConfigParser() self.mode = conf.get_ml_mode() + self.label = conf.label() def train(self): """ Train a model based on the flows we receive and the labels """ try: - # Process the labels to have only Normal and Malware - self.flows.label = self.flows.label.str.replace( - r"(^.*ormal.*$)", "Normal", regex=True - ) - self.flows.label = self.flows.label.str.replace( - r"(^.*alware.*$)", "Malware", regex=True - ) - self.flows.label = self.flows.label.str.replace( - r"(^.*alicious.*$)", "Malware", regex=True - ) + # Get the flows from the DB + # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid) + # Convert to pandas df + # self.flows = pd.DataFrame(self.flows) + # Process the features + # X_flow = self.process_features(self.flows) - # Separate - y_flow = self.flows["label"] + # Create X_flow with the current flows minus the label X_flow = self.flows.drop("label", axis=1) + # Create y_flow with the label + y_flow = numpy.full(X_flow.shape[0], self.label) + # Drop the module_labels X_flow = X_flow.drop("module_labels", axis=1) # Normalize this batch of data so far. This can get progressivle slow @@ -90,7 +98,7 @@ def train(self): # Train try: self.clf.partial_fit( - X_flow, y_flow, classes=["Malware", "Normal"] + X_flow, y_flow, classes=["Malicious", "Benign"] ) except Exception: self.print("Error while calling clf.train()") @@ -113,7 +121,7 @@ def train(self): self.store_model() except Exception: - self.print("Error in train()", 0, 1) + self.print("Error in train().", 0, 1) self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): @@ -123,7 +131,7 @@ def process_features(self, dataset): """ try: # Discard some type of flows that dont have ports - to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp"] + to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""] for proto in to_discard: dataset = dataset[dataset.proto != proto] @@ -132,21 +140,20 @@ def process_features(self, dataset): "appproto", "daddr", "saddr", - "ts", - "origstate", + "starttime", "type_", - "dir_", - "history", - "dbytes", - "dpkts", "smac", "dmac", + "history", "uid", + "dir_", + "endtime", + "flow_source", ] for field in to_drop: try: dataset = dataset.drop(field, axis=1) - except ValueError: + except (ValueError, KeyError): pass # When flows are read from Slips sqlite, @@ -155,11 +162,10 @@ def process_features(self, dataset): # So transform here dataset["state"] = dataset.apply( lambda row: self.db.get_final_state_from_flags( - row["state"], row["pkts"] + row["state"], (row["spkts"] + row["dpkts"]) ), axis=1, ) - # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -193,58 +199,42 @@ def process_features(self, dataset): dataset.proto = dataset.proto.str.replace( r"(^.*arp.*$)", "4", regex=True ) - dataset.proto = dataset.proto.astype("float64") - try: - # Convert dport to float - dataset.dport = dataset.dport.astype("float") - except ValueError: - pass - try: - # Convert sport to float - dataset.sport = dataset.sport.astype("float") - except ValueError: - pass - try: - # Convert Dur to float - dataset.dur = dataset.dur.astype("float") - except ValueError: - pass - try: - # Convert TotPkts to float - dataset.pkts = dataset.pkts.astype("float") - except ValueError: - pass - try: - # Convert SrcPkts to float - dataset.spkts = dataset.spkts.astype("float") - except ValueError: - pass - try: - # Convert TotBytes to float - dataset.allbytes = dataset.allbytes.astype("float") - except ValueError: - pass - try: - # Convert SrcBytes to float - dataset.sbytes = dataset.sbytes.astype("float") - except ValueError: - pass + + dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"] + dataset["pkts"] = dataset["spkts"] + dataset["dpkts"] + + fields_to_convert_to_float = [ + dataset.proto, + dataset.dport, + dataset.sport, + dataset.dur, + dataset.pkts, + dataset.spkts, + dataset.allbytes, + dataset.sbytes, + dataset.state, + ] + for field in fields_to_convert_to_float: + try: + field = field.astype("float64") + except (ValueError, AttributeError): + pass + return dataset except Exception: # Stop the timer self.print("Error in process_features()") self.print(traceback.format_exc(), 0, 1) - def process_flows(self): + def process_training_flows(self): """ - Process all the flwos in the DB + Process all the flows in the DB Store the pandas df in self.flows """ try: # We get all the flows so far # because this retraining happens in batches - flows: list = self.db.get_all_flows() - + flows = self.db.get_all_flows() # Check how many different labels are in the DB # We need both normal and malware labels = self.db.get_labels() @@ -254,48 +244,48 @@ def process_flows(self): # that are fake but representative of a normal and malware flow # they are only for the training process # At least 1 flow of each label is required - # self.print(f'Amount of labeled flows: {labels}', 0, 1) + + # These flows should be in the same format as the ones in the DB. + # Which means the satate is still SF, S0, etc. flows.append( { - "ts": 1594417039.029793, + "starttime": 1594417039.029793, "dur": "1.9424750804901123", "saddr": "10.7.10.101", "sport": "49733", "daddr": "40.70.224.145", "dport": "443", "proto": "tcp", - "origstate": "SRPA_SPA", - "state": "Established", - "pkts": 84, - "allbytes": 42764, - "spkts": 37, + "state": "SF", + "spkts": 17, + "dpkts": 27, "sbytes": 25517, + "dbytes": 17247, "appproto": "ssl", - "label": "Malware", + "label": "Malicious", "module_labels": { - "flowalerts-long-connection": "Malware" + "flowalerts-long-connection": "Malicious" }, } ) flows.append( { - "ts": 1382355032.706468, + "starttime": 1382355032.706468, "dur": "10.896695", "saddr": "147.32.83.52", "sport": "47956", "daddr": "80.242.138.72", "dport": "80", "proto": "tcp", - "origstate": "SRPA_SPA", - "state": "Established", - "pkts": 67, - "allbytes": 67696, + "state": "SF", "spkts": 1, + "dpkts": 0, "sbytes": 100, + "dbytes": 67596, "appproto": "http", - "label": "Normal", + "label": "Benign", "module_labels": { - "flowalerts-long-connection": "Normal" + "flowalerts-long-connection": "Benign" }, } ) @@ -314,42 +304,51 @@ def process_flows(self): self.print("Error in process_flows()") self.print(traceback.format_exc(), 0, 1) - def process_flow(self): + def process_flow(self, flow_to_process: dict): """ Process one flow. Only used during detection in testing - Store the pandas df in self.flow + returns the pandas df with the processed flow """ try: # Convert the flow to a pandas dataframe - raw_flow = pd.DataFrame(self.flow_dict, index=[0]) - # Process features + raw_flow = pd.DataFrame(flow_to_process, index=[0]) dflow = self.process_features(raw_flow) # Update the flow to the processed version - self.flow = dflow + return dflow except Exception: # Stop the timer self.print("Error in process_flow()") self.print(traceback.format_exc(), 0, 1) - def detect(self): + def detect(self, x_flow) -> Optional[numpy.ndarray]: """ - Detect this flow with the current model stored + Detects the given flow with the current model stored + and returns the predection array """ try: - # Store the real label if there is one - # y_flow = self.flow["label"] - # remove the real label column - self.flow = self.flow.drop("label", axis=1) - # remove the label predictions column of the other modules - X_flow = self.flow.drop("module_labels", axis=1) + # clean the flow + fields_to_drop = [ + "label", + "module_labels", + "uid", + "history", + "dir_", + "endtime", + "flow_source", + ] + for field in fields_to_drop: + try: + x_flow = x_flow.drop(field, axis=1) + except (KeyError, ValueError): + pass # Scale the flow - X_flow = self.scaler.transform(X_flow) - pred = self.clf.predict(X_flow) + x_flow: numpy.ndarray = self.scaler.transform(x_flow) + pred: numpy.ndarray = self.clf.predict(x_flow) return pred - except Exception: - # Stop the timer - self.print("Error in detect() X_flow:") - self.print(X_flow) + except Exception as e: + self.print( + f"Error in detect() while processing " f"\n{x_flow}\n{e}" + ) self.print(traceback.format_exc(), 0, 1) def store_model(self): @@ -357,10 +356,10 @@ def store_model(self): Store the trained model on disk """ self.print("Storing the trained model and scaler on disk.", 0, 2) - with open("./modules/flowmldetection/model.bin", "wb") as f: + with open(self.model_path, "wb") as f: data = pickle.dumps(self.clf) f.write(data) - with open("./modules/flowmldetection/scaler.bin", "wb") as g: + with open(self.scaler_path, "wb") as g: data = pickle.dumps(self.scaler) g.write(data) @@ -370,20 +369,23 @@ def read_model(self): """ try: self.print("Reading the trained model from disk.", 0, 2) - with open("./modules/flowmldetection/model.bin", "rb") as f: + with open(self.model_path, "rb") as f: self.clf = pickle.load(f) self.print("Reading the trained scaler from disk.", 0, 2) - with open("./modules/flowmldetection/scaler.bin", "rb") as g: + with open(self.scaler_path, "rb") as g: self.scaler = pickle.load(g) except FileNotFoundError: # If there is no model, create one empty - self.print("There was no model. Creating a new empty model.", 0, 2) + self.print( + "There was no model. " "Creating a new empty model.", 0, 2 + ) self.clf = SGDClassifier( warm_start=True, loss="hinge", penalty="l1" ) except EOFError: self.print( - "Error reading model from disk. Creating a new empty model.", + "Error reading model from disk. " + "Creating a new empty model.", 0, 2, ) @@ -391,39 +393,36 @@ def read_model(self): warm_start=True, loss="hinge", penalty="l1" ) - def set_evidence_malicious_flow( - self, - saddr: str, - sport: str, - daddr: str, - dport: str, - twid: str, - uid: str, - ): + def set_evidence_malicious_flow(self, flow: dict, twid: str): confidence: float = 0.1 - ip_identification = self.db.get_ip_identification(daddr) description = ( - f"Malicious flow by ML. Src IP {saddr}:{sport} to " - f"{daddr}:{dport} {ip_identification}" - ) - - timestamp = utils.convert_format( - datetime.datetime.now(), utils.alerts_format + f"Flow with malicious characteristics by ML. Src IP" + f" {flow['saddr']}:{flow['sport']} to " + f"{flow['daddr']}:{flow['dport']}" ) - + twid_number = int(twid.replace("timewindow", "")) evidence: Evidence = Evidence( evidence_type=EvidenceType.MALICIOUS_FLOW, attacker=Attacker( - direction=Direction.SRC, attacker_type=IoCType.IP, value=saddr + direction=Direction.SRC, + ioc_type=IoCType.IP, + value=flow["saddr"], + ), + victim=Victim( + direction=Direction.DST, + ioc_type=IoCType.IP, + value=flow["daddr"], ), threat_level=ThreatLevel.LOW, confidence=confidence, description=description, - profile=ProfileID(ip=saddr), - timewindow=TimeWindow(number=int(twid.replace("timewindow", ""))), - uid=[uid], - timestamp=timestamp, - category=IDEACategory.ANOMALY_TRAFFIC, + profile=ProfileID(ip=flow["saddr"]), + timewindow=TimeWindow(twid_number), + uid=[flow["uid"]], + timestamp=flow["starttime"], + method=Method.AI, + src_port=flow["sport"], + dst_port=flow["dport"], ) self.db.set_evidence(evidence) @@ -440,17 +439,20 @@ def pre_main(self): def main(self): if msg := self.get_msg("new_flow"): - data = msg["data"] - data = json.loads(data) - # profileid = data["profileid"] - twid = data["twid"] - flow = data["flow"] - flow = json.loads(flow) - # Convert the common fields to something that can - # be interpreted - # Get the uid which is the key - uid = next(iter(flow)) - self.flow_dict = json.loads(flow[uid]) + # When a new flow arrives + msg = json.loads(msg["data"]) + self.twid = msg["twid"] + self.profileid = msg["profileid"] + self.flow = msg["flow"] + # These following extra fields are expected in testing. update the original + # flow dict to have them + self.flow.update( + { + "state": msg["interpreted_state"], + "label": msg["label"], + "module_labels": msg["module_labels"], + } + ) if self.mode == "train": # We are training @@ -459,55 +461,69 @@ def main(self): # Use labeled flows labels = self.db.get_labels() sum_labeled_flows = sum(i[1] for i in labels) + + # The min labels to retrain is the min number of flows + # we should have seen so far in this capture to start training + # This is so we dont _start_ training with only 1 flow + + # Once we are over the start minimum, the second condition is + # to force to retrain every a minimum_labels_to_retrain number + # of flows. So we dont retrain every 1 flow. if ( - sum_labeled_flows >= self.minimum_lables_to_retrain - and sum_labeled_flows % self.minimum_lables_to_retrain == 1 + sum_labeled_flows >= self.minimum_labels_to_start_train ): - # We get here every 'self.minimum_lables_to_retrain' amount of labels - # So for example we retrain every 100 labels and only when we have at least 100 labels - self.print( - f"Training the model with the last group of flows and labels. Total flows: {sum_labeled_flows}." - ) - # Process all flows in the DB and make them ready for pandas - self.process_flows() - # Train an algorithm - self.train() + if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain): + # So for example we retrain every 50 labels and only when + # we have at least 50 labels + self.print( + f"Training the model with the last group of " + f"flows and labels. Total flows: {sum_labeled_flows}." + ) + # Process all flows in the DB and make them ready + # for pandas + self.process_training_flows() + # Train an algorithm + self.train() + self.last_number_of_flows_when_trained = sum_labeled_flows + elif self.mode == "test": # We are testing, which means using the model to detect - self.process_flow() + processed_flow = self.process_flow(self.flow) - # After processing the flow, it may happen that we delete icmp/arp/etc - # so the dataframe can be empty - if self.flow is not None and not self.flow.empty: + # After processing the flow, it may happen that we + # delete icmp/arp/etc so the dataframe can be empty + if processed_flow is not None and not processed_flow.empty: # Predict - pred = self.detect() - label = self.flow_dict["label"] + pred: numpy.ndarray = self.detect(processed_flow) + if not pred: + # an error occurred + return - # Report + label = self.flow["label"] if label and label != "unknown" and label != pred[0]: - # If the user specified a label in test mode, and the label - # is diff from the prediction, print in debug mode + # If the user specified a label in test mode, + # and the label is diff from the prediction, + # print in debug mode self.print( - f'Report Prediction {pred[0]} for label {label} flow {self.flow_dict["saddr"]}:' - f'{self.flow_dict["sport"]} -> {self.flow_dict["daddr"]}:' - f'{self.flow_dict["dport"]}/{self.flow_dict["proto"]}', + f"Predicted {pred[0]} for ground-truth label" + f' {label}. Flow {self.flow["saddr"]}:' + f'{self.flow["sport"]} ->' + f' {self.flow["daddr"]}:' + f'{self.flow["dport"]}/' + f'{self.flow["proto"]}', 0, 3, ) - if pred[0] == "Malware": + if pred[0] == "Malicious": # Generate an alert - self.set_evidence_malicious_flow( - self.flow_dict["saddr"], - self.flow_dict["sport"], - self.flow_dict["daddr"], - self.flow_dict["dport"], - twid, - uid, - ) + self.set_evidence_malicious_flow(self.flow, self.twid) self.print( - f'Prediction {pred[0]} for label {label} flow {self.flow_dict["saddr"]}:' - f'{self.flow_dict["sport"]} -> {self.flow_dict["daddr"]}:' - f'{self.flow_dict["dport"]}/{self.flow_dict["proto"]}', + f"Prediction {pred[0]} for label {label}" + f' flow {self.flow["saddr"]}:' + f'{self.flow["sport"]} -> ' + f'{self.flow["daddr"]}:' + f'{self.flow["dport"]}/' + f'{self.flow["proto"]}', 0, 2, ) From 57e144cc7fe5f3dda58e0db65af60bd23cac5aa2 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:22:38 +0100 Subject: [PATCH 396/455] flowml. If the dataset has one flow and that is deleted, then return empty fast. --- modules/flowmldetection/flowmldetection.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 1cfbaf925d..0bfaef283e 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -135,6 +135,11 @@ def process_features(self, dataset): for proto in to_discard: dataset = dataset[dataset.proto != proto] + # If te proto is in the list to delete and there is only one flow, then the dataset will be empty + if dataset.empty: + # DataFrame is empty now, so return empty + return dataset + # For now, discard these to_drop = [ "appproto", From 5c562206d67d1e98ff72f75af90a2c27685724c5 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:23:05 +0100 Subject: [PATCH 397/455] flowml. If the datasert is empty. Return none --- modules/flowmldetection/flowmldetection.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 0bfaef283e..df1572fa52 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -318,6 +318,8 @@ def process_flow(self, flow_to_process: dict): # Convert the flow to a pandas dataframe raw_flow = pd.DataFrame(flow_to_process, index=[0]) dflow = self.process_features(raw_flow) + if dflow.empty: + return None # Update the flow to the processed version return dflow except Exception: From a8c11a868b4bc7d5919344c8211c6bfac164c343 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:27:16 +0100 Subject: [PATCH 398/455] First new version of the model and scaler. Not good yet, but working. --- modules/flowmldetection/model.bin | Bin 1124 -> 1090 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index a6648cf72179520975b0e9ad1164f7d574e87140..7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f 100644 GIT binary patch delta 130 zcmV-|0Db@D2*L;ifCQB{u>>gtTsw(FScJ$7xocP4|5 zn$$WZz^$}67u&zYKfDP4vYKo~KL^fCQCUu>>gtlhXnvSy-jcOc_je6TteGguV+y7r>O2=ii6N`@aE8 z4q_05T(v)g91VHfmFeIMvRKFpJJ~89v lBES;fQwQSX>_3x<11kt29Z*;7CnUg=yaQDN?(vfo1TSe6JFWl# diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin index 17115724b9536f6093f9d72f3b58a5c22c562a9a..bfba4d107224e5e6e5a1e8c8f4d463b48131d111 100644 GIT binary patch delta 290 zcmV+-0p0%k2KolDvjGBX0h6@>Pgl=^Tj5WVAwXGOhP~3C<3D)c8d<%4*gy{6w?@|U z%s@r+gj&Fu5J2_kY{!Qn^*`?T5Tx7$dq73=gj&E@c0d))6KVlt!ao`IN`KC-h(JOu zU`*oR6_Z;6CRfo>BRz3ectB?@J_D&+&OhD+nN>A{w?PryCGcfmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r-wJEWTtnm`Ac zf^|*Gc0elJHxDoXo Date: Thu, 20 Mar 2025 13:16:06 +0100 Subject: [PATCH 399/455] model and scaler with 1 malicious and 1 benign --- modules/flowmldetection/model.bin | Bin 1090 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f..0fac693b39f8e2f0e826471e72a52010709a2a4a 100644 GIT binary patch delta 132 zcmX@a@q~k=fn{psMix!x$(NZ_BO`RCb>5jSYK$N!g2|oC+8oRs(l>gtTsw(FScJ$7xocP4|5 zn$$WZz^$}67u&zYKfDP4vYKo~KfmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r!tJEWTtnm`Ac zf^|*Gc0elJHxDoXoPgl=^Tj5WVAwXGOhP~3C<3D)c8d<%4*gy{6w?@|U z%s@r+gj&Fu5J2_kY{!Qn^*`?T5Tx7$dq73=gj&E@c0d))6KVlt!ao`IN`KC-h(JOu zU`*oR6_Z;6CRfo>BRz3ectB?@J_D&+&OhD+nN>A{w?PryCGc Date: Thu, 20 Mar 2025 13:16:27 +0100 Subject: [PATCH 400/455] cleaner jupyter --- modules/flowmldetection/flowmldetection.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index df1572fa52..a9b8a13585 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -343,6 +343,23 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "endtime", "flow_source", ] + # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. + # Error + ''' [Flow ML Detection] Error in detect() while processing + dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes + 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 + The feature names should match those that were passed during fit. + Feature names unseen at fit time: + - bytes + ''' + + # IF we delete here the filed bytes the error is + # [Flow ML Detection] Error in detect() while processing + # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes + # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 + # The feature names should match those that were passed during fit. + # Feature names must be in the same order as they were in fit. + for field in fields_to_drop: try: x_flow = x_flow.drop(field, axis=1) From 9682f8c59aaa9a372f73447a7579c1ee2bfc478c Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Thu, 20 Mar 2025 22:26:27 +0100 Subject: [PATCH 401/455] New models after 3rd ttrain --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 0fac693b39f8e2f0e826471e72a52010709a2a4a..5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8 100644 GIT binary patch delta 99 zcmaFD@q}YTFtfkevYaFSxkd-_3;rCozh!)2lYmpEbEdGvA?N`Qv**dmWkY#6EnbNU>V$pfHc5?(6z~90v14#>uEKZXcoY2J@~Q; zm@vT916v-#h9y9THMMtpw*Eia3Y9($*ABq#7l@Rev)(_(4lO-NtuB+I1CayN#ekDG F1TN2UERg^J diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin index 758909b289238ff282b2e056a9b3e83768b8472a..821344a0c69d116622b02e2a0daa1554cb5d308e 100644 GIT binary patch delta 43 zcmV+`0M!5b2KolDfdU!c4zU>a=p#W80?-ZtN@qVmt(3Za=p#W80?-ZtN@qVmt(3Z Date: Wed, 26 Mar 2025 00:08:50 +0100 Subject: [PATCH 402/455] Models after 4th train --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8..3ab5a240bb45f88d026d1d9d1959cfa384e2473b 100644 GIT binary patch delta 120 zcmV-;0EhqN2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv* zMA;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P as{<PghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9 ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}DxjML6cVlBO2|D9RL6T delta 290 zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRg4Lu^9H~BS8=X&<+7gXFor!l)9lJXF)RTPqWf%b3r!tJEWTtnm`Ac zf^|*Gc0elJHxDoXoty4FCWD From 237b6ef13aca3eddca3de9b5cf8f255260238bb6 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 26 Mar 2025 08:28:59 +0100 Subject: [PATCH 403/455] Models of ml flow with the first good performance in small tests --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 3ab5a240bb45f88d026d1d9d1959cfa384e2473b..a6648cf72179520975b0e9ad1164f7d574e87140 100644 GIT binary patch delta 121 zcmV-<0EYkM2;>N`Qv+C~&P*9hb`!w*mV~|wLl?l5mFM4w$NRqlOAcXmpyfaAG1(); zYA3+ulRv$a;~zkIU#E>ocI-ba#L|>%Hv~ZN`Y?b6^limYW1McwvlQsk{8#y@u delta 121 zcmV-<0EYkM2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv* zMA;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P bs{<fmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r-wJEWTtnm`Ac zf^|*Gc0elJHxDoXoPghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9 ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}DxjML6cVlBO2|D9RL6T From 43aae2e88f823e4a3d5e751b02b521d5487d231e Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:49:23 +0000 Subject: [PATCH 404/455] Add plot for flowml train scores --- modules/flowmldetection/plot_train_score.py | 56 +++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 modules/flowmldetection/plot_train_score.py diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py new file mode 100644 index 0000000000..0b5b5b72ba --- /dev/null +++ b/modules/flowmldetection/plot_train_score.py @@ -0,0 +1,56 @@ +import pandas as pd +import matplotlib.pyplot as plt +import re +import sys + +def plot_log_data(file_path): + # Read the log data from the file + with open(file_path, 'r') as file: + log_data = file.read() + + # Define regex pattern to extract relevant data from each line + pattern = r"Background: (\d+). Benign: (\d+). Malicious: (\d+). Total labels: (\d+\.\d+). Score: (\d+\.\d+)" + + # Parse the log file + data = re.findall(pattern, log_data) + + # Convert data to a DataFrame + df = pd.DataFrame(data, columns=["Background", "Benign", "Malicious", "Total labels", "Score"]) + df = df.astype({ + "Background": int, + "Benign": int, + "Malicious": int, + "Total labels": float, + "Score": float + }) + + # Plotting the values + fig, ax1 = plt.subplots(figsize=(10, 6)) + + # Plotting Score on the left y-axis + ax1.plot(df.index, df["Score"], label="Score", color='tab:blue') + ax1.set_xlabel('Index') + ax1.set_ylabel('Score', color='tab:blue') + ax1.tick_params(axis='y', labelcolor='tab:blue') + + # Create the second y-axis for the Total labels + ax2 = ax1.twinx() + ax2.plot(df.index, df["Total labels"], label="Total labels", color='tab:red') + ax2.set_ylabel('Total labels', color='tab:red') + ax2.tick_params(axis='y', labelcolor='tab:red') + + # Adding title and legend + plt.title('Log Data Visualization') + fig.tight_layout() + + # Save plot to a PNG file + plt.savefig('log_data_plot_with_two_scales.png') + + # Display the plot + plt.show() + +# Make sure the file path is passed as an argument +if len(sys.argv) < 2: + print("Please provide the path to the log file as a parameter.") +else: + plot_log_data(sys.argv[1]) From 6f045c72b8ac57f7b866f8cd14b0fe98fc668a9c Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:50:04 +0000 Subject: [PATCH 405/455] Add a log file to store the training data output --- modules/flowmldetection/flowmldetection.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index a9b8a13585..8a319cb4e2 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -68,12 +68,29 @@ def init(self): self.model_path = "./modules/flowmldetection/model.bin" self.scaler_path = "./modules/flowmldetection/scaler.bin" + # Initialize the training log file + self.training_log_path = "./modules/flowmldetection/training.log" + with open(self.training_log_path, "w") as log_file: + log_file.write("Training Log Initialized\n") + def read_configuration(self): conf = ConfigParser() self.mode = conf.get_ml_mode() + # This is the global label in the configuration, + # in case the flows do not have a label themselves self.label = conf.label() - def train(self): + def write_to_training_log(self, message: str): + """ + Write a message to the training log file. + """ + try: + with open(self.training_log_path, "a") as log_file: + log_file.write(message + "\n") + except Exception as e: + self.print(f"Error writing to training log: {e}", 0, 1) + + def train(self, sum_labeled_flows): """ Train a model based on the flows we receive and the labels """ From 8a42f14ad61b5230c8426dbfef1f8bc0bd839a0b Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:50:32 +0000 Subject: [PATCH 406/455] Store data in the log file of training --- modules/flowmldetection/flowmldetection.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 8a319cb4e2..28e8e7eca8 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -137,9 +137,13 @@ def train(self, sum_labeled_flows): # Store the models on disk self.store_model() + # Log training information + self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}") + #self.write_to_training_log(f"Model parameters: {self.clf.coef_}") except Exception: self.print("Error in train().", 0, 1) self.print(traceback.format_exc(), 0, 1) + self.write_to_training_log("Error occurred during training.") def process_features(self, dataset): """ From f4dd77bff3cdb4428269ab005fb0c4b451efc9f8 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:50:53 +0000 Subject: [PATCH 407/455] better comments --- modules/flowmldetection/flowmldetection.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 28e8e7eca8..676907a6df 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -59,10 +59,9 @@ def init(self): self.minimum_labels_to_start_train = 50 # Minum amount of new labels needed to retrain self.minimum_labels_to_retrain = 50 - # The number of flows when last trained + # The number of flows when last trained. Used internally only to know + # when to retrain self.last_number_of_flows_when_trained = 0 - # To plot the scores of training - # self.scores = [] # The scaler trained during training and to use during testing self.scaler = StandardScaler() self.model_path = "./modules/flowmldetection/model.bin" From 7e72af1c156068ff3e4b91217d53830c9a4f6262 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:51:30 +0000 Subject: [PATCH 408/455] Fix issue not dropping detailed labels --- modules/flowmldetection/flowmldetection.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 676907a6df..483c6a1d69 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -94,23 +94,19 @@ def train(self, sum_labeled_flows): Train a model based on the flows we receive and the labels """ try: - # Get the flows from the DB - # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid) - # Convert to pandas df - # self.flows = pd.DataFrame(self.flows) - # Process the features - # X_flow = self.process_features(self.flows) - # Create X_flow with the current flows minus the label - X_flow = self.flows.drop("label", axis=1) - # Create y_flow with the label - y_flow = numpy.full(X_flow.shape[0], self.label) + X_flow = self.flows.drop("ground_truth_label", axis=1) + # Drop the detailed labels + X_flow = X_flow.drop("detailed_ground_truth_label", axis=1) # Drop the module_labels X_flow = X_flow.drop("module_labels", axis=1) + # Create y_flow with the label + y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label) # Normalize this batch of data so far. This can get progressivle slow X_flow = self.scaler.fit_transform(X_flow) + # Train try: self.clf.partial_fit( From beaf213d6167832d8c3f1e98eb6bc98d2e40d29d Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:51:53 +0000 Subject: [PATCH 409/455] Fix issue that not all labels sere given to the partial fit --- modules/flowmldetection/flowmldetection.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 483c6a1d69..b06c9a54e3 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -109,8 +109,9 @@ def train(self, sum_labeled_flows): # Train try: + # Online incremental learning self.clf.partial_fit( - X_flow, y_flow, classes=["Malicious", "Benign"] + X_flow, y_flow, classes=["Background", "Malicious", "Benign"] ) except Exception: self.print("Error while calling clf.train()") From 5b290a7fc764e26766d3519bbafe54b43cdae603 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:52:08 +0000 Subject: [PATCH 410/455] count partial labels in this epoch --- modules/flowmldetection/flowmldetection.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index b06c9a54e3..184a6b3455 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -106,6 +106,12 @@ def train(self, sum_labeled_flows): # Normalize this batch of data so far. This can get progressivle slow X_flow = self.scaler.fit_transform(X_flow) + # Count the number of labels of each type in this epoc + epoch_label_counts = { + "Background": (y_flow == "Background").sum(), + "Malicious": (y_flow == "Malicious").sum(), + "Benign": (y_flow == "Benign").sum(), + } # Train try: From 1cb44821b4885c0a648bf5183dfdde83c4d71cc8 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:55:09 +0000 Subject: [PATCH 411/455] Dont print training in screen --- modules/flowmldetection/flowmldetection.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 184a6b3455..4dd8191f87 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -126,15 +126,8 @@ def train(self, sum_labeled_flows): # See score so far in training score = self.clf.score(X_flow, y_flow) - # To debug the training score - # self.scores.append(score) - - self.print(f" Training Score: {score}", 0, 1) - # self.print(f' Model Parameters: {self.clf.coef_}') - - # Debug code to store a plot in a png of the scores - # plt.plot(self.scores) - # plt.savefig('train-scores.png') + #self.print(f" Training Score: {score}", 1, 0) + #self.print(f' Model Parameters: {self.clf.coef_}', 1, 0) # Store the models on disk self.store_model() From a38524eada2e31b202392335cf470a1b08bbd25f Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:55:28 +0000 Subject: [PATCH 412/455] Add function to write to train log --- modules/flowmldetection/flowmldetection.py | 34 ++++++++++++---------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 4dd8191f87..679e7c0cc9 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -247,28 +247,28 @@ def process_features(self, dataset): self.print("Error in process_features()") self.print(traceback.format_exc(), 0, 1) - def process_training_flows(self): + def process_training_flows(self, last_number_of_flows_when_trained): """ - Process all the flows in the DB + Process only the new flows in the DB since the last training. Store the pandas df in self.flows """ try: + # Ensure the index is an integer + if last_number_of_flows_when_trained is None: + last_number_of_flows_when_trained = 0 + else: + last_number_of_flows_when_trained = int(last_number_of_flows_when_trained) + # We get all the flows so far - # because this retraining happens in batches flows = self.db.get_all_flows() - # Check how many different labels are in the DB - # We need both normal and malware + # Only process new flows since last training + new_flows = flows[last_number_of_flows_when_trained:] + + # Check how many **different** labels are in the DB labels = self.db.get_labels() if len(labels) == 1: - # Only 1 label has flows - # There are not enough different labels, so insert two flows - # that are fake but representative of a normal and malware flow - # they are only for the training process - # At least 1 flow of each label is required - - # These flows should be in the same format as the ones in the DB. - # Which means the satate is still SF, S0, etc. - flows.append( + # Insert fake flows for both classes if needed + new_flows.append( { "starttime": 1594417039.029793, "dur": "1.9424750804901123", @@ -358,6 +358,8 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "dir_", "endtime", "flow_source", + "ground_truth_label", # todo now we can use them + "detailed_ground_truth_label", ] # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. # Error @@ -502,11 +504,11 @@ def main(self): labels = self.db.get_labels() sum_labeled_flows = sum(i[1] for i in labels) - # The min labels to retrain is the min number of flows + # The min labels to retrain is the min number of flows # we should have seen so far in this capture to start training # This is so we dont _start_ training with only 1 flow - # Once we are over the start minimum, the second condition is + # Once we are over the start minimum, the second condition is # to force to retrain every a minimum_labels_to_retrain number # of flows. So we dont retrain every 1 flow. if ( From 9a888b7055b804316775159042255e84a191869c Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:57:27 +0000 Subject: [PATCH 413/455] Fix label in dummy flow --- modules/flowmldetection/flowmldetection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 679e7c0cc9..95c9b82a74 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -283,13 +283,13 @@ def process_training_flows(self, last_number_of_flows_when_trained): "sbytes": 25517, "dbytes": 17247, "appproto": "ssl", - "label": "Malicious", + "ground_truth_label": "Malicious", "module_labels": { "flowalerts-long-connection": "Malicious" }, } ) - flows.append( + new_flows.append( { "starttime": 1382355032.706468, "dur": "10.896695", From 8f8a5443834244a4522f80ef17cdb073d3976bc4 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:57:39 +0000 Subject: [PATCH 414/455] Fix dummy flow --- modules/flowmldetection/flowmldetection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 95c9b82a74..5ea48fbc40 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -304,7 +304,7 @@ def process_training_flows(self, last_number_of_flows_when_trained): "sbytes": 100, "dbytes": 67596, "appproto": "http", - "label": "Benign", + "ground_truth_label": "Benign", "module_labels": { "flowalerts-long-connection": "Benign" }, From d27350f5678356eda2dfdea7722c4a2567a3a93f Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:58:28 +0000 Subject: [PATCH 415/455] Rename variable --- modules/flowmldetection/flowmldetection.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 5ea48fbc40..ff68b8a270 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -310,10 +310,9 @@ def process_training_flows(self, last_number_of_flows_when_trained): }, } ) - # If there are enough flows, we dont insert them anymore # Convert to pandas df - df_flows = pd.DataFrame(flows) + df_flows = pd.DataFrame(new_flows) # Process features df_flows = self.process_features(df_flows) @@ -321,7 +320,6 @@ def process_training_flows(self, last_number_of_flows_when_trained): # Update the flow to the processed version self.flows = df_flows except Exception: - # Stop the timer self.print("Error in process_flows()") self.print(traceback.format_exc(), 0, 1) From 4242689cf0a9b71ba877668080c5f7907d944d45 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:00:32 +0000 Subject: [PATCH 416/455] Fix dummy flow label --- modules/flowmldetection/flowmldetection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index ff68b8a270..6b41b40298 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -356,7 +356,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "dir_", "endtime", "flow_source", - "ground_truth_label", # todo now we can use them + "ground_truth_label", "detailed_ground_truth_label", ] # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. From 6d561e03770607761204e82b027fc8f167c0887e Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:00:47 +0000 Subject: [PATCH 417/455] Pass values to train function --- modules/flowmldetection/flowmldetection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 6b41b40298..4d66aab855 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -521,9 +521,9 @@ def main(self): ) # Process all flows in the DB and make them ready # for pandas - self.process_training_flows() + self.process_training_flows(self.last_number_of_flows_when_trained) # Train an algorithm - self.train() + self.train(sum_labeled_flows) self.last_number_of_flows_when_trained = sum_labeled_flows elif self.mode == "test": From 50d892127da4c1bbaf150997363c3cc9b1d41f9a Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:01:47 +0000 Subject: [PATCH 418/455] import os --- modules/flowmldetection/flowmldetection.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 4d66aab855..766178e127 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -10,6 +10,7 @@ import json import traceback import warnings +import os from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils From a7cf82be948b4ff673f189d62d89276b1b385471 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:03:53 +0000 Subject: [PATCH 419/455] Delete old comments --- modules/flowmldetection/flowmldetection.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 766178e127..6c3bfc1275 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -28,10 +28,6 @@ Method, ) -# Only for debbuging -# from matplotlib import pyplot as plt - - # This horrible hack is only to stop sklearn from printing those warnings def warn(*args, **kwargs): pass From 06add4106a0c833a368dad445a094a0a76f11f3d Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:13:22 +0000 Subject: [PATCH 420/455] Fix plots --- modules/flowmldetection/plot_train_score.py | 48 ++++++++++++++++----- 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py index 0b5b5b72ba..359df04eff 100644 --- a/modules/flowmldetection/plot_train_score.py +++ b/modules/flowmldetection/plot_train_score.py @@ -2,6 +2,8 @@ import matplotlib.pyplot as plt import re import sys +import argparse +import os def plot_log_data(file_path): # Read the log data from the file @@ -24,33 +26,59 @@ def plot_log_data(file_path): "Score": float }) + # Get the directory of the log file to store the plot in the same folder + dir_name = os.path.dirname(file_path) + plot_file = os.path.join(dir_name, 'log_data_plot_with_two_scales.png') + # Plotting the values fig, ax1 = plt.subplots(figsize=(10, 6)) - # Plotting Score on the left y-axis + # Plotting Score on the left y-axis (with proper scaling from 0 to 1) ax1.plot(df.index, df["Score"], label="Score", color='tab:blue') ax1.set_xlabel('Index') ax1.set_ylabel('Score', color='tab:blue') + ax1.set_ylim(0, 1) # Set y-axis for Score from 0 to 1 ax1.tick_params(axis='y', labelcolor='tab:blue') - # Create the second y-axis for the Total labels + # Create the second y-axis for the Background, Benign, Malicious, Total labels ax2 = ax1.twinx() + ax2.plot(df.index, df["Background"], label="Background", color='tab:green', linestyle='--') + ax2.plot(df.index, df["Benign"], label="Benign", color='tab:orange', linestyle='--') + ax2.plot(df.index, df["Malicious"], label="Malicious", color='tab:pink', linestyle='--') ax2.plot(df.index, df["Total labels"], label="Total labels", color='tab:red') - ax2.set_ylabel('Total labels', color='tab:red') + ax2.set_ylabel('Background, Benign, Malicious, Total labels', color='tab:red') + + # Set appropriate scale for right y-axis based on the data + ax2.set_ylim(0, df[["Background", "Benign", "Malicious", "Total labels"]].max().max()) ax2.tick_params(axis='y', labelcolor='tab:red') # Adding title and legend plt.title('Log Data Visualization') fig.tight_layout() - # Save plot to a PNG file - plt.savefig('log_data_plot_with_two_scales.png') + # Adding the legend with increased space for readability + ax1.legend(loc='upper left', bbox_to_anchor=(1, 1), fontsize='small') + ax2.legend(loc='upper left', bbox_to_anchor=(1, 0.7), fontsize='small') + + # Increase right margin for better readability of legend + plt.subplots_adjust(right=0.75) + + # Save plot to the same folder as the log file + plt.savefig(plot_file) # Display the plot plt.show() -# Make sure the file path is passed as an argument -if len(sys.argv) < 2: - print("Please provide the path to the log file as a parameter.") -else: - plot_log_data(sys.argv[1]) +def main(): + # Parse command-line arguments + parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.") + parser.add_argument('log_file', metavar='log_file', type=str, help="Path to the log file") + + # Handle -h / --help + args = parser.parse_args() + + # Call the function to process the log file + plot_log_data(args.log_file) + +if __name__ == "__main__": + main() From f5160524451637eb0ad20db0b277395d0683f368 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:14:58 +0000 Subject: [PATCH 421/455] Fix plot --- modules/flowmldetection/plot_train_score.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py index 359df04eff..c7f374a7fe 100644 --- a/modules/flowmldetection/plot_train_score.py +++ b/modules/flowmldetection/plot_train_score.py @@ -40,18 +40,21 @@ def plot_log_data(file_path): ax1.set_ylim(0, 1) # Set y-axis for Score from 0 to 1 ax1.tick_params(axis='y', labelcolor='tab:blue') - # Create the second y-axis for the Background, Benign, Malicious, Total labels + # Create the second y-axis for the Background, Benign, Malicious ax2 = ax1.twinx() ax2.plot(df.index, df["Background"], label="Background", color='tab:green', linestyle='--') ax2.plot(df.index, df["Benign"], label="Benign", color='tab:orange', linestyle='--') ax2.plot(df.index, df["Malicious"], label="Malicious", color='tab:pink', linestyle='--') - ax2.plot(df.index, df["Total labels"], label="Total labels", color='tab:red') - ax2.set_ylabel('Background, Benign, Malicious, Total labels', color='tab:red') + ax2.set_ylabel('Background, Benign, Malicious', color='tab:red') # Set appropriate scale for right y-axis based on the data - ax2.set_ylim(0, df[["Background", "Benign", "Malicious", "Total labels"]].max().max()) + ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max()) ax2.tick_params(axis='y', labelcolor='tab:red') + # Annotating Total labels as text on the plot + for i, value in enumerate(df["Total labels"]): + ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom') + # Adding title and legend plt.title('Log Data Visualization') fig.tight_layout() From d1b2bd882e7718d8923436b5485fe0e5398b4383 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:16:23 +0000 Subject: [PATCH 422/455] Fix plot --- modules/flowmldetection/plot_train_score.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py index c7f374a7fe..4099c47c1e 100644 --- a/modules/flowmldetection/plot_train_score.py +++ b/modules/flowmldetection/plot_train_score.py @@ -42,10 +42,10 @@ def plot_log_data(file_path): # Create the second y-axis for the Background, Benign, Malicious ax2 = ax1.twinx() - ax2.plot(df.index, df["Background"], label="Background", color='tab:green', linestyle='--') - ax2.plot(df.index, df["Benign"], label="Benign", color='tab:orange', linestyle='--') - ax2.plot(df.index, df["Malicious"], label="Malicious", color='tab:pink', linestyle='--') - ax2.set_ylabel('Background, Benign, Malicious', color='tab:red') + ax2.plot(df.index, df["Background"], label="Background Labels", color='tab:green', linestyle='--') + ax2.plot(df.index, df["Benign"], label="Benign Labels", color='tab:orange', linestyle='--') + ax2.plot(df.index, df["Malicious"], label="Malicious Labels", color='tab:pink', linestyle='--') + ax2.set_ylabel('Background, Benign, Malicious Labels', color='tab:red') # Set appropriate scale for right y-axis based on the data ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max()) @@ -56,7 +56,7 @@ def plot_log_data(file_path): ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom') # Adding title and legend - plt.title('Log Data Visualization') + plt.title('Training performance') fig.tight_layout() # Adding the legend with increased space for readability From ba0e9f1a8cc05c044b76810c1e9fa164492732a5 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:24:43 +0000 Subject: [PATCH 423/455] Fix plot --- modules/flowmldetection/plot_train_score.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py index 4099c47c1e..8437e968ac 100644 --- a/modules/flowmldetection/plot_train_score.py +++ b/modules/flowmldetection/plot_train_score.py @@ -59,12 +59,12 @@ def plot_log_data(file_path): plt.title('Training performance') fig.tight_layout() - # Adding the legend with increased space for readability - ax1.legend(loc='upper left', bbox_to_anchor=(1, 1), fontsize='small') - ax2.legend(loc='upper left', bbox_to_anchor=(1, 0.7), fontsize='small') + # Move both legends further to the right + ax1.legend(loc='upper right', bbox_to_anchor=(1.26, 1), fontsize='small', ncol=1) + ax2.legend(loc='upper right', bbox_to_anchor=(1.4, 0.95), fontsize='small', ncol=1) # Increase right margin for better readability of legend - plt.subplots_adjust(right=0.75) + plt.subplots_adjust(right=0.7) # Save plot to the same folder as the log file plt.savefig(plot_file) From e089bec8ae86ab1fb938a03b08430b6eace488e2 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:02:34 +0000 Subject: [PATCH 424/455] Plot testing performance from a log --- .../plot_testing_performance.py | 116 ++++-------------- 1 file changed, 24 insertions(+), 92 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index 6865415cdf..a38c7f0598 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -1,7 +1,6 @@ import matplotlib.pyplot as plt import sys import numpy as np -import argparse def process_file(file_path): # Initialize the counters for the values @@ -50,108 +49,41 @@ def process_file(file_path): return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values -def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, experiment_number): - # Separate the values into two groups based on their proximity to 0 or 1 - close_to_0 = { - 'FPR': [], 'FNR': [] - } - close_to_1 = { - 'TNR': [], 'TPR': [], 'F1': [], 'accuracy': [], 'precision': [], 'MCC': [], 'recall': [] - } - - # Categorize the metrics into two groups - for i in range(len(FPR_values)): - close_to_0['FPR'].append(FPR_values[i]) - close_to_0['FNR'].append(FNR_values[i]) - - close_to_1['TNR'].append(TNR_values[i]) - close_to_1['TPR'].append(TPR_values[i]) - close_to_1['F1'].append(F1_values[i]) - close_to_1['accuracy'].append(accuracy_values[i]) - close_to_1['precision'].append(precision_values[i]) - close_to_1['MCC'].append(MCC_values[i]) - close_to_1['recall'].append(recall_values[i]) - - # Plot metrics for values close to 0 (linear scale) - plot_single_group(close_to_0, f'performance_metrics_testing_close_to_0_experiment_{experiment_number}.png', experiment_number, is_close_to_0=True) - - # Plot metrics for values close to 1 (log scale) - plot_single_group(close_to_1, f'performance_metrics_testing_close_to_1_experiment_{experiment_number}.png', experiment_number, is_close_to_0=False) - - # Print the final values - print("\nFinal Metric Values for Experiment", experiment_number) - print(f"Final FPR: {FPR_values[-1]:.4f}") - print(f"Final FNR: {FNR_values[-1]:.4f}") - print(f"Final TNR: {TNR_values[-1]:.4f}") - print(f"Final TPR: {TPR_values[-1]:.4f}") - print(f"Final F1 Score: {F1_values[-1]:.4f}") - print(f"Final Accuracy: {accuracy_values[-1]:.4f}") - print(f"Final Precision: {precision_values[-1]:.4f}") - print(f"Final MCC: {MCC_values[-1]:.4f}") - print(f"Final Recall: {recall_values[-1]:.4f}") - -def plot_single_group(metrics_dict, output_filename, experiment_number, is_close_to_0=False): +def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values): + # Create the plot plt.figure(figsize=(12, 8)) - # Only plot the metrics that exist in the dictionary - if 'FPR' in metrics_dict: - plt.plot(metrics_dict['FPR'], label='False Positive Rate (FPR)', marker='o') - if 'FNR' in metrics_dict: - plt.plot(metrics_dict['FNR'], label='False Negative Rate (FNR)', marker='o') - if 'TNR' in metrics_dict: - plt.plot(metrics_dict['TNR'], label='True Negative Rate (TNR)', marker='o') - if 'TPR' in metrics_dict: - plt.plot(metrics_dict['TPR'], label='True Positive Rate (TPR)', marker='o') - if 'F1' in metrics_dict: - plt.plot(metrics_dict['F1'], label='F1 Score', marker='o') - if 'accuracy' in metrics_dict: - plt.plot(metrics_dict['accuracy'], label='Accuracy', marker='o') - if 'precision' in metrics_dict: - plt.plot(metrics_dict['precision'], label='Precision', marker='o') - if 'MCC' in metrics_dict: - plt.plot(metrics_dict['MCC'], label='Matthews Correlation Coefficient (MCC)', marker='o') - if 'recall' in metrics_dict: - plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o') - - # If the plot is close to 1, apply log scale - if not is_close_to_0: - plt.yscale('log') - - # If the plot is close to 0, set dynamic Y-ticks based on the min/max values of the series - if is_close_to_0: - min_val = min(min(metrics_dict['FPR']), min(metrics_dict['FNR'])) - max_val = max(max(metrics_dict['FPR']), max(metrics_dict['FNR'])) - - # Avoid log(0), so set the minimum limit a little higher than zero - if min_val == 0: - min_val = 1e-4 # Avoid zero values on the logarithmic scale - - plt.ylim(min_val, max_val) # Set Y-axis limits based on the data range - plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=60)) # Set ticks logarithmically - - # Add the experiment number to the plot title + # Plot each metric + plt.plot(FPR_values, label='False Positive Rate (FPR)', marker='o') + plt.plot(FNR_values, label='False Negative Rate (FNR)', marker='o') + plt.plot(TNR_values, label='True Negative Rate (TNR)', marker='o') + plt.plot(TPR_values, label='True Positive Rate (TPR)', marker='o') + plt.plot(F1_values, label='F1 Score', marker='o') + plt.plot(accuracy_values, label='Accuracy', marker='o') + plt.plot(precision_values, label='Precision', marker='o') + plt.plot(MCC_values, label='Matthews Correlation Coefficient (MCC)', marker='o') + plt.plot(recall_values, label='Recall (TPR)', marker='o') + + # Add labels and title plt.xlabel('Index') plt.ylabel('Metric Value') - plt.title(f'Experiment {experiment_number} - Evaluation Metrics Over Time') + plt.title('Evaluation Metrics Over Time') + + # Add a legend plt.legend() - # Save the plot - plt.savefig(output_filename) + # Save the plot as a PNG file + plt.savefig('metrics_plot.png') plt.close() def main(): - # Set up argument parsing - parser = argparse.ArgumentParser(description='Plot testing performance metrics.') - parser.add_argument('-f', '--file', type=str, required=True, help='Path to the testing performance log file') - parser.add_argument('-e', '--experiment', type=str, required=True, help='Experiment number') - - args = parser.parse_args() - - file_path = args.file - experiment_number = args.experiment + if len(sys.argv) != 2: + print("Usage: python script.py ") + sys.exit(1) + file_path = sys.argv[1] FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path) - plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, experiment_number) + plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values) if __name__ == "__main__": main() From 499f08bdbda9d16604b33df6e0b60c54cdec709d Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:04:32 +0000 Subject: [PATCH 425/455] Fix the plot --- modules/flowmldetection/plot_testing_performance.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index a38c7f0598..fac0acd64a 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -64,16 +64,19 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu plt.plot(MCC_values, label='Matthews Correlation Coefficient (MCC)', marker='o') plt.plot(recall_values, label='Recall (TPR)', marker='o') + # Set logarithmic scale on the y-axis + plt.yscale('log') + # Add labels and title plt.xlabel('Index') - plt.ylabel('Metric Value') - plt.title('Evaluation Metrics Over Time') + plt.ylabel('Metric Value (Log Scale)') + plt.title('Evaluation Metrics Over Time (Log Scale)') # Add a legend plt.legend() # Save the plot as a PNG file - plt.savefig('metrics_plot.png') + plt.savefig('metrics_plot_log_scale.png') plt.close() def main(): From 9007dfbdaccdaaa852e6c1e30e93746fb6052478 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:12:40 +0000 Subject: [PATCH 426/455] Fix the plots --- .../plot_testing_performance.py | 76 ++++++++++++++----- 1 file changed, 55 insertions(+), 21 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index fac0acd64a..5581c72cd4 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -50,33 +50,66 @@ def process_file(file_path): return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values): - # Create the plot - plt.figure(figsize=(12, 8)) + # Separate the values into two groups based on their proximity to 0 or 1 + close_to_0 = { + 'FPR': [], 'FNR': [] + } + close_to_1 = { + 'TNR': [], 'TPR': [], 'F1': [], 'accuracy': [], 'precision': [], 'MCC': [], 'recall': [] + } - # Plot each metric - plt.plot(FPR_values, label='False Positive Rate (FPR)', marker='o') - plt.plot(FNR_values, label='False Negative Rate (FNR)', marker='o') - plt.plot(TNR_values, label='True Negative Rate (TNR)', marker='o') - plt.plot(TPR_values, label='True Positive Rate (TPR)', marker='o') - plt.plot(F1_values, label='F1 Score', marker='o') - plt.plot(accuracy_values, label='Accuracy', marker='o') - plt.plot(precision_values, label='Precision', marker='o') - plt.plot(MCC_values, label='Matthews Correlation Coefficient (MCC)', marker='o') - plt.plot(recall_values, label='Recall (TPR)', marker='o') + # Categorize the metrics into two groups + for i in range(len(FPR_values)): + close_to_0['FPR'].append(FPR_values[i]) + close_to_0['FNR'].append(FNR_values[i]) + + close_to_1['TNR'].append(TNR_values[i]) + close_to_1['TPR'].append(TPR_values[i]) + close_to_1['F1'].append(F1_values[i]) + close_to_1['accuracy'].append(accuracy_values[i]) + close_to_1['precision'].append(precision_values[i]) + close_to_1['MCC'].append(MCC_values[i]) + close_to_1['recall'].append(recall_values[i]) + + # Plot metrics for values close to 0 + plot_single_group(close_to_0, 'metrics_plot_close_to_0.png') - # Set logarithmic scale on the y-axis - plt.yscale('log') + # Plot metrics for values close to 1 + plot_single_group(close_to_1, 'metrics_plot_close_to_1.png') + +def plot_single_group(metrics_dict, output_filename): + plt.figure(figsize=(12, 8)) - # Add labels and title + # Only plot the metrics that exist in the dictionary + if 'FPR' in metrics_dict: + plt.plot(metrics_dict['FPR'], label='False Positive Rate (FPR)', marker='o') + if 'FNR' in metrics_dict: + plt.plot(metrics_dict['FNR'], label='False Negative Rate (FNR)', marker='o') + if 'TNR' in metrics_dict: + plt.plot(metrics_dict['TNR'], label='True Negative Rate (TNR)', marker='o') + if 'TPR' in metrics_dict: + plt.plot(metrics_dict['TPR'], label='True Positive Rate (TPR)', marker='o') + if 'F1' in metrics_dict: + plt.plot(metrics_dict['F1'], label='F1 Score', marker='o') + if 'accuracy' in metrics_dict: + plt.plot(metrics_dict['accuracy'], label='Accuracy', marker='o') + if 'precision' in metrics_dict: + plt.plot(metrics_dict['precision'], label='Precision', marker='o') + if 'MCC' in metrics_dict: + plt.plot(metrics_dict['MCC'], label='Matthews Correlation Coefficient (MCC)', marker='o') + if 'recall' in metrics_dict: + plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o') + + # Apply log scale by default + plt.yscale('log') + plt.xlabel('Index') - plt.ylabel('Metric Value (Log Scale)') - plt.title('Evaluation Metrics Over Time (Log Scale)') - - # Add a legend + plt.ylabel('Metric Value') + plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})') plt.legend() - # Save the plot as a PNG file - plt.savefig('metrics_plot_log_scale.png') + # Save the plot + plt.savefig(output_filename) plt.close() def main(): @@ -85,6 +118,7 @@ def main(): sys.exit(1) file_path = sys.argv[1] + FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path) plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values) From fb2e163811d92a22203ad14e5462c74c8514c6cf Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:16:50 +0000 Subject: [PATCH 427/455] Fix plot --- .../plot_testing_performance.py | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index 5581c72cd4..8f9e12cd86 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -72,12 +72,24 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu close_to_1['recall'].append(recall_values[i]) # Plot metrics for values close to 0 - plot_single_group(close_to_0, 'metrics_plot_close_to_0.png') + plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True) # Plot metrics for values close to 1 plot_single_group(close_to_1, 'metrics_plot_close_to_1.png') -def plot_single_group(metrics_dict, output_filename): + # Print the final values + print("\nFinal Metric Values:") + print(f"Final FPR: {FPR_values[-1]:.4f}") + print(f"Final FNR: {FNR_values[-1]:.4f}") + print(f"Final TNR: {TNR_values[-1]:.4f}") + print(f"Final TPR: {TPR_values[-1]:.4f}") + print(f"Final F1 Score: {F1_values[-1]:.4f}") + print(f"Final Accuracy: {accuracy_values[-1]:.4f}") + print(f"Final Precision: {precision_values[-1]:.4f}") + print(f"Final MCC: {MCC_values[-1]:.4f}") + print(f"Final Recall: {recall_values[-1]:.4f}") + +def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): plt.figure(figsize=(12, 8)) # Only plot the metrics that exist in the dictionary @@ -103,6 +115,12 @@ def plot_single_group(metrics_dict, output_filename): # Apply log scale by default plt.yscale('log') + # If the plot is close to 0, set custom ticks + if is_close_to_0: + # Manually set more Y-ticks for better visibility + plt.ylim(0.0001, 1) # Set Y-axis limits between 0.0001 and 1 + plt.yticks([0.0001, 0.001, 0.01, 0.1, 1], ['0.0001', '0.001', '0.01', '0.1', '1']) # Adjust Y-ticks + plt.xlabel('Index') plt.ylabel('Metric Value') plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})') From acac48b8feccf08958d19f68d0375bb4bb7e6df1 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:20:22 +0000 Subject: [PATCH 428/455] Fix plots --- modules/flowmldetection/flowmldetection.py | 709 +++++---------------- 1 file changed, 143 insertions(+), 566 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 6c3bfc1275..37f0761109 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -1,566 +1,143 @@ -# SPDX-FileCopyrightText: 2021 Sebastian Garcia -from typing import Optional - -# SPDX-License-Identifier: GPL-2.0-only -import numpy -from sklearn.linear_model import SGDClassifier -from sklearn.preprocessing import StandardScaler -import pickle -import pandas as pd -import json -import traceback -import warnings -import os - -from slips_files.common.parsers.config_parser import ConfigParser -from slips_files.common.slips_utils import utils -from slips_files.common.abstracts.module import IModule -from slips_files.core.structures.evidence import ( - Evidence, - ProfileID, - TimeWindow, - Attacker, - ThreatLevel, - EvidenceType, - IoCType, - Direction, - Victim, - Method, -) - -# This horrible hack is only to stop sklearn from printing those warnings -def warn(*args, **kwargs): - pass - - -warnings.warn = warn - - -class FlowMLDetection(IModule): - # Name: short name of the module. Do not use spaces - name = "Flow ML Detection" - description = ( - "Train or test a Machine Learning model to detect malicious flows" - ) - authors = ["Sebastian Garcia"] - - def init(self): - # Subscribe to the channel - self.c1 = self.db.subscribe("new_flow") - self.channels = {"new_flow": self.c1} - self.fieldseparator = self.db.get_field_separator() - # Set the output queue of our database instance - # Read the configuration - self.read_configuration() - # Minum amount of new labels needed to start the train - self.minimum_labels_to_start_train = 50 - # Minum amount of new labels needed to retrain - self.minimum_labels_to_retrain = 50 - # The number of flows when last trained. Used internally only to know - # when to retrain - self.last_number_of_flows_when_trained = 0 - # The scaler trained during training and to use during testing - self.scaler = StandardScaler() - self.model_path = "./modules/flowmldetection/model.bin" - self.scaler_path = "./modules/flowmldetection/scaler.bin" - - # Initialize the training log file - self.training_log_path = "./modules/flowmldetection/training.log" - with open(self.training_log_path, "w") as log_file: - log_file.write("Training Log Initialized\n") - - def read_configuration(self): - conf = ConfigParser() - self.mode = conf.get_ml_mode() - # This is the global label in the configuration, - # in case the flows do not have a label themselves - self.label = conf.label() - - def write_to_training_log(self, message: str): - """ - Write a message to the training log file. - """ - try: - with open(self.training_log_path, "a") as log_file: - log_file.write(message + "\n") - except Exception as e: - self.print(f"Error writing to training log: {e}", 0, 1) - - def train(self, sum_labeled_flows): - """ - Train a model based on the flows we receive and the labels - """ - try: - # Create X_flow with the current flows minus the label - X_flow = self.flows.drop("ground_truth_label", axis=1) - # Drop the detailed labels - X_flow = X_flow.drop("detailed_ground_truth_label", axis=1) - # Drop the module_labels - X_flow = X_flow.drop("module_labels", axis=1) - # Create y_flow with the label - y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label) - - # Normalize this batch of data so far. This can get progressivle slow - X_flow = self.scaler.fit_transform(X_flow) - - # Count the number of labels of each type in this epoc - epoch_label_counts = { - "Background": (y_flow == "Background").sum(), - "Malicious": (y_flow == "Malicious").sum(), - "Benign": (y_flow == "Benign").sum(), - } - - # Train - try: - # Online incremental learning - self.clf.partial_fit( - X_flow, y_flow, classes=["Background", "Malicious", "Benign"] - ) - except Exception: - self.print("Error while calling clf.train()") - self.print(traceback.format_exc(), 0, 1) - - # See score so far in training - score = self.clf.score(X_flow, y_flow) - - #self.print(f" Training Score: {score}", 1, 0) - #self.print(f' Model Parameters: {self.clf.coef_}', 1, 0) - - # Store the models on disk - self.store_model() - - # Log training information - self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}") - #self.write_to_training_log(f"Model parameters: {self.clf.coef_}") - except Exception: - self.print("Error in train().", 0, 1) - self.print(traceback.format_exc(), 0, 1) - self.write_to_training_log("Error occurred during training.") - - def process_features(self, dataset): - """ - Discards some features of the dataset and can create new. - Clean the dataset - """ - try: - # Discard some type of flows that dont have ports - to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""] - for proto in to_discard: - dataset = dataset[dataset.proto != proto] - - # If te proto is in the list to delete and there is only one flow, then the dataset will be empty - if dataset.empty: - # DataFrame is empty now, so return empty - return dataset - - # For now, discard these - to_drop = [ - "appproto", - "daddr", - "saddr", - "starttime", - "type_", - "smac", - "dmac", - "history", - "uid", - "dir_", - "endtime", - "flow_source", - ] - for field in to_drop: - try: - dataset = dataset.drop(field, axis=1) - except (ValueError, KeyError): - pass - - # When flows are read from Slips sqlite, - # the state is not transformed to 'Established' or - # 'Not Established', it is still 'S0' and others - # So transform here - dataset["state"] = dataset.apply( - lambda row: self.db.get_final_state_from_flags( - row["state"], (row["spkts"] + row["dpkts"]) - ), - axis=1, - ) - - # Convert state to categorical - dataset.state = dataset.state.str.replace( - r"(^.*Not Established.*$)", "0", regex=True - ) - dataset.state = dataset.state.str.replace( - r"(^.*Established.*$)", "1", regex=True - ) - - # Convert categories to floats - dataset.state = dataset.state.astype("float64") - - # Convert proto to categorical. For now we only have few states, so we can hardcode... - # We dont use the data to create categories because in testing mode - # we dont see all the protocols - # Also we dont store the Categorizer because the user can retrain - # with its own data. - dataset.proto = dataset.proto.str.lower() - dataset.proto = dataset.proto.str.replace( - r"(^.*tcp.*$)", "0", regex=True - ) - dataset.proto = dataset.proto.str.replace( - r"(^.*udp.*$)", "1", regex=True - ) - dataset.proto = dataset.proto.str.replace( - r"(^.*icmp.*$)", "2", regex=True - ) - dataset.proto = dataset.proto.str.replace( - r"(^.*icmp-ipv6.*$)", "3", regex=True - ) - dataset.proto = dataset.proto.str.replace( - r"(^.*arp.*$)", "4", regex=True - ) - - dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"] - dataset["pkts"] = dataset["spkts"] + dataset["dpkts"] - - fields_to_convert_to_float = [ - dataset.proto, - dataset.dport, - dataset.sport, - dataset.dur, - dataset.pkts, - dataset.spkts, - dataset.allbytes, - dataset.sbytes, - dataset.state, - ] - for field in fields_to_convert_to_float: - try: - field = field.astype("float64") - except (ValueError, AttributeError): - pass - - return dataset - except Exception: - # Stop the timer - self.print("Error in process_features()") - self.print(traceback.format_exc(), 0, 1) - - def process_training_flows(self, last_number_of_flows_when_trained): - """ - Process only the new flows in the DB since the last training. - Store the pandas df in self.flows - """ - try: - # Ensure the index is an integer - if last_number_of_flows_when_trained is None: - last_number_of_flows_when_trained = 0 - else: - last_number_of_flows_when_trained = int(last_number_of_flows_when_trained) - - # We get all the flows so far - flows = self.db.get_all_flows() - # Only process new flows since last training - new_flows = flows[last_number_of_flows_when_trained:] - - # Check how many **different** labels are in the DB - labels = self.db.get_labels() - if len(labels) == 1: - # Insert fake flows for both classes if needed - new_flows.append( - { - "starttime": 1594417039.029793, - "dur": "1.9424750804901123", - "saddr": "10.7.10.101", - "sport": "49733", - "daddr": "40.70.224.145", - "dport": "443", - "proto": "tcp", - "state": "SF", - "spkts": 17, - "dpkts": 27, - "sbytes": 25517, - "dbytes": 17247, - "appproto": "ssl", - "ground_truth_label": "Malicious", - "module_labels": { - "flowalerts-long-connection": "Malicious" - }, - } - ) - new_flows.append( - { - "starttime": 1382355032.706468, - "dur": "10.896695", - "saddr": "147.32.83.52", - "sport": "47956", - "daddr": "80.242.138.72", - "dport": "80", - "proto": "tcp", - "state": "SF", - "spkts": 1, - "dpkts": 0, - "sbytes": 100, - "dbytes": 67596, - "appproto": "http", - "ground_truth_label": "Benign", - "module_labels": { - "flowalerts-long-connection": "Benign" - }, - } - ) - - # Convert to pandas df - df_flows = pd.DataFrame(new_flows) - - # Process features - df_flows = self.process_features(df_flows) - - # Update the flow to the processed version - self.flows = df_flows - except Exception: - self.print("Error in process_flows()") - self.print(traceback.format_exc(), 0, 1) - - def process_flow(self, flow_to_process: dict): - """ - Process one flow. Only used during detection in testing - returns the pandas df with the processed flow - """ - try: - # Convert the flow to a pandas dataframe - raw_flow = pd.DataFrame(flow_to_process, index=[0]) - dflow = self.process_features(raw_flow) - if dflow.empty: - return None - # Update the flow to the processed version - return dflow - except Exception: - # Stop the timer - self.print("Error in process_flow()") - self.print(traceback.format_exc(), 0, 1) - - def detect(self, x_flow) -> Optional[numpy.ndarray]: - """ - Detects the given flow with the current model stored - and returns the predection array - """ - try: - # clean the flow - fields_to_drop = [ - "label", - "module_labels", - "uid", - "history", - "dir_", - "endtime", - "flow_source", - "ground_truth_label", - "detailed_ground_truth_label", - ] - # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. - # Error - ''' [Flow ML Detection] Error in detect() while processing - dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes - 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 - The feature names should match those that were passed during fit. - Feature names unseen at fit time: - - bytes - ''' - - # IF we delete here the filed bytes the error is - # [Flow ML Detection] Error in detect() while processing - # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes - # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 - # The feature names should match those that were passed during fit. - # Feature names must be in the same order as they were in fit. - - for field in fields_to_drop: - try: - x_flow = x_flow.drop(field, axis=1) - except (KeyError, ValueError): - pass - # Scale the flow - x_flow: numpy.ndarray = self.scaler.transform(x_flow) - pred: numpy.ndarray = self.clf.predict(x_flow) - return pred - except Exception as e: - self.print( - f"Error in detect() while processing " f"\n{x_flow}\n{e}" - ) - self.print(traceback.format_exc(), 0, 1) - - def store_model(self): - """ - Store the trained model on disk - """ - self.print("Storing the trained model and scaler on disk.", 0, 2) - with open(self.model_path, "wb") as f: - data = pickle.dumps(self.clf) - f.write(data) - with open(self.scaler_path, "wb") as g: - data = pickle.dumps(self.scaler) - g.write(data) - - def read_model(self): - """ - Read the trained model from disk - """ - try: - self.print("Reading the trained model from disk.", 0, 2) - with open(self.model_path, "rb") as f: - self.clf = pickle.load(f) - self.print("Reading the trained scaler from disk.", 0, 2) - with open(self.scaler_path, "rb") as g: - self.scaler = pickle.load(g) - except FileNotFoundError: - # If there is no model, create one empty - self.print( - "There was no model. " "Creating a new empty model.", 0, 2 - ) - self.clf = SGDClassifier( - warm_start=True, loss="hinge", penalty="l1" - ) - except EOFError: - self.print( - "Error reading model from disk. " - "Creating a new empty model.", - 0, - 2, - ) - self.clf = SGDClassifier( - warm_start=True, loss="hinge", penalty="l1" - ) - - def set_evidence_malicious_flow(self, flow: dict, twid: str): - confidence: float = 0.1 - description = ( - f"Flow with malicious characteristics by ML. Src IP" - f" {flow['saddr']}:{flow['sport']} to " - f"{flow['daddr']}:{flow['dport']}" - ) - twid_number = int(twid.replace("timewindow", "")) - evidence: Evidence = Evidence( - evidence_type=EvidenceType.MALICIOUS_FLOW, - attacker=Attacker( - direction=Direction.SRC, - ioc_type=IoCType.IP, - value=flow["saddr"], - ), - victim=Victim( - direction=Direction.DST, - ioc_type=IoCType.IP, - value=flow["daddr"], - ), - threat_level=ThreatLevel.LOW, - confidence=confidence, - description=description, - profile=ProfileID(ip=flow["saddr"]), - timewindow=TimeWindow(twid_number), - uid=[flow["uid"]], - timestamp=flow["starttime"], - method=Method.AI, - src_port=flow["sport"], - dst_port=flow["dport"], - ) - - self.db.set_evidence(evidence) - - def shutdown_gracefully(self): - # Confirm that the module is done processing - if self.mode == "train": - self.store_model() - - def pre_main(self): - utils.drop_root_privs() - # Load the model - self.read_model() - - def main(self): - if msg := self.get_msg("new_flow"): - # When a new flow arrives - msg = json.loads(msg["data"]) - self.twid = msg["twid"] - self.profileid = msg["profileid"] - self.flow = msg["flow"] - # These following extra fields are expected in testing. update the original - # flow dict to have them - self.flow.update( - { - "state": msg["interpreted_state"], - "label": msg["label"], - "module_labels": msg["module_labels"], - } - ) - - if self.mode == "train": - # We are training - - # Is the amount in the DB of labels enough to retrain? - # Use labeled flows - labels = self.db.get_labels() - sum_labeled_flows = sum(i[1] for i in labels) - - # The min labels to retrain is the min number of flows - # we should have seen so far in this capture to start training - # This is so we dont _start_ training with only 1 flow - - # Once we are over the start minimum, the second condition is - # to force to retrain every a minimum_labels_to_retrain number - # of flows. So we dont retrain every 1 flow. - if ( - sum_labeled_flows >= self.minimum_labels_to_start_train - ): - if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain): - # So for example we retrain every 50 labels and only when - # we have at least 50 labels - self.print( - f"Training the model with the last group of " - f"flows and labels. Total flows: {sum_labeled_flows}." - ) - # Process all flows in the DB and make them ready - # for pandas - self.process_training_flows(self.last_number_of_flows_when_trained) - # Train an algorithm - self.train(sum_labeled_flows) - self.last_number_of_flows_when_trained = sum_labeled_flows - - elif self.mode == "test": - # We are testing, which means using the model to detect - processed_flow = self.process_flow(self.flow) - - # After processing the flow, it may happen that we - # delete icmp/arp/etc so the dataframe can be empty - if processed_flow is not None and not processed_flow.empty: - # Predict - pred: numpy.ndarray = self.detect(processed_flow) - if not pred: - # an error occurred - return - - label = self.flow["label"] - if label and label != "unknown" and label != pred[0]: - # If the user specified a label in test mode, - # and the label is diff from the prediction, - # print in debug mode - self.print( - f"Predicted {pred[0]} for ground-truth label" - f' {label}. Flow {self.flow["saddr"]}:' - f'{self.flow["sport"]} ->' - f' {self.flow["daddr"]}:' - f'{self.flow["dport"]}/' - f'{self.flow["proto"]}', - 0, - 3, - ) - if pred[0] == "Malicious": - # Generate an alert - self.set_evidence_malicious_flow(self.flow, self.twid) - self.print( - f"Prediction {pred[0]} for label {label}" - f' flow {self.flow["saddr"]}:' - f'{self.flow["sport"]} -> ' - f'{self.flow["daddr"]}:' - f'{self.flow["dport"]}/' - f'{self.flow["proto"]}', - 0, - 2, - ) +import matplotlib.pyplot as plt +import sys +import numpy as np + +def process_file(file_path): + # Initialize the counters for the values + FPR_values = [] + FNR_values = [] + TNR_values = [] + TPR_values = [] + F1_values = [] + accuracy_values = [] + precision_values = [] + MCC_values = [] + recall_values = [] + + # Read the file and extract the data + with open(file_path, 'r') as file: + for line in file: + if "TP:" in line: + # Extract the values from the line + parts = line.split(',') + TP = int(parts[0].split(':')[1].strip()) + TN = int(parts[1].split(':')[1].strip()) + FP = int(parts[2].split(':')[1].strip()) + FN = int(parts[3].split(':')[1].strip()) + + # Calculate metrics + FPR = FP / (FP + TN) if (FP + TN) != 0 else 0 + FNR = FN / (FN + TP) if (FN + TP) != 0 else 0 + TNR = TN / (TN + FP) if (TN + FP) != 0 else 0 + TPR = TP / (TP + FN) if (TP + FN) != 0 else 0 + Precision = TP / (TP + FP) if (TP + FP) != 0 else 0 + Recall = TPR # Recall is the same as TPR + F1 = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) != 0 else 0 + Accuracy = (TP + TN) / (TP + TN + FP + FN) + MCC = ((TP * TN) - (FP * FN)) / np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) if ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) != 0 else 0 + + # Append the values to the respective lists + FPR_values.append(FPR) + FNR_values.append(FNR) + TNR_values.append(TNR) + TPR_values.append(TPR) + F1_values.append(F1) + accuracy_values.append(Accuracy) + precision_values.append(Precision) + MCC_values.append(MCC) + recall_values.append(Recall) + + return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values + +def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values): + # Separate the values into two groups based on their proximity to 0 or 1 + close_to_0 = { + 'FPR': [], 'FNR': [] + } + close_to_1 = { + 'TNR': [], 'TPR': [], 'F1': [], 'accuracy': [], 'precision': [], 'MCC': [], 'recall': [] + } + + # Categorize the metrics into two groups + for i in range(len(FPR_values)): + close_to_0['FPR'].append(FPR_values[i]) + close_to_0['FNR'].append(FNR_values[i]) + + close_to_1['TNR'].append(TNR_values[i]) + close_to_1['TPR'].append(TPR_values[i]) + close_to_1['F1'].append(F1_values[i]) + close_to_1['accuracy'].append(accuracy_values[i]) + close_to_1['precision'].append(precision_values[i]) + close_to_1['MCC'].append(MCC_values[i]) + close_to_1['recall'].append(recall_values[i]) + + # Plot metrics for values close to 0 + plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True) + + # Plot metrics for values close to 1 + plot_single_group(close_to_1, 'metrics_plot_close_to_1.png') + + # Print the final values + print("\nFinal Metric Values:") + print(f"Final FPR: {FPR_values[-1]:.4f}") + print(f"Final FNR: {FNR_values[-1]:.4f}") + print(f"Final TNR: {TNR_values[-1]:.4f}") + print(f"Final TPR: {TPR_values[-1]:.4f}") + print(f"Final F1 Score: {F1_values[-1]:.4f}") + print(f"Final Accuracy: {accuracy_values[-1]:.4f}") + print(f"Final Precision: {precision_values[-1]:.4f}") + print(f"Final MCC: {MCC_values[-1]:.4f}") + print(f"Final Recall: {recall_values[-1]:.4f}") + +def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): + plt.figure(figsize=(12, 8)) + + # Only plot the metrics that exist in the dictionary + if 'FPR' in metrics_dict: + plt.plot(metrics_dict['FPR'], label='False Positive Rate (FPR)', marker='o') + if 'FNR' in metrics_dict: + plt.plot(metrics_dict['FNR'], label='False Negative Rate (FNR)', marker='o') + if 'TNR' in metrics_dict: + plt.plot(metrics_dict['TNR'], label='True Negative Rate (TNR)', marker='o') + if 'TPR' in metrics_dict: + plt.plot(metrics_dict['TPR'], label='True Positive Rate (TPR)', marker='o') + if 'F1' in metrics_dict: + plt.plot(metrics_dict['F1'], label='F1 Score', marker='o') + if 'accuracy' in metrics_dict: + plt.plot(metrics_dict['accuracy'], label='Accuracy', marker='o') + if 'precision' in metrics_dict: + plt.plot(metrics_dict['precision'], label='Precision', marker='o') + if 'MCC' in metrics_dict: + plt.plot(metrics_dict['MCC'], label='Matthews Correlation Coefficient (MCC)', marker='o') + if 'recall' in metrics_dict: + plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o') + + # Apply log scale by default + plt.yscale('log') + + # If the plot is close to 0, set custom ticks + if is_close_to_0: + # Add more ticks between 0 and 1 (using a logarithmic scale) + plt.yticks([0.01, 0.1, 1, 10, 100], ['0.01', '0.1', '1', '10', '100']) + + plt.xlabel('Index') + plt.ylabel('Metric Value') + plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})') + plt.legend() + + # Save the plot + plt.savefig(output_filename) + plt.close() + +def main(): + if len(sys.argv) != 2: + print("Usage: python script.py ") + sys.exit(1) + + file_path = sys.argv[1] + + FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path) + plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values) + +if __name__ == "__main__": + main() From 41961660beaf2d95a10273bdebceae4388fafd95 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:20:52 +0000 Subject: [PATCH 429/455] Fix plots --- .../plot_testing_performance.py | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index 8f9e12cd86..69b8c96a8c 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -71,11 +71,11 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu close_to_1['MCC'].append(MCC_values[i]) close_to_1['recall'].append(recall_values[i]) - # Plot metrics for values close to 0 + # Plot metrics for values close to 0 (linear scale) plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True) - # Plot metrics for values close to 1 - plot_single_group(close_to_1, 'metrics_plot_close_to_1.png') + # Plot metrics for values close to 1 (log scale) + plot_single_group(close_to_1, 'metrics_plot_close_to_1.png', is_close_to_0=False) # Print the final values print("\nFinal Metric Values:") @@ -112,14 +112,21 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): if 'recall' in metrics_dict: plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o') - # Apply log scale by default - plt.yscale('log') + # If the plot is close to 1, apply log scale + if not is_close_to_0: + plt.yscale('log') - # If the plot is close to 0, set custom ticks + # If the plot is close to 0, set dynamic Y-ticks based on the min/max values of the series if is_close_to_0: - # Manually set more Y-ticks for better visibility - plt.ylim(0.0001, 1) # Set Y-axis limits between 0.0001 and 1 - plt.yticks([0.0001, 0.001, 0.01, 0.1, 1], ['0.0001', '0.001', '0.01', '0.1', '1']) # Adjust Y-ticks + min_val = min(min(metrics_dict['FPR']), min(metrics_dict['FNR'])) + max_val = max(max(metrics_dict['FPR']), max(metrics_dict['FNR'])) + + # Avoid log(0), so set the minimum limit a little higher than zero + if min_val == 0: + min_val = 1e-4 # Avoid zero values on the logarithmic scale + + plt.ylim(min_val, max_val) # Set Y-axis limits based on the data range + plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6)) # Set ticks logarithmically plt.xlabel('Index') plt.ylabel('Metric Value') From dcd73e24811c9ebd2e4aadfea719b851736d72ab Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:23:02 +0000 Subject: [PATCH 430/455] Fix plots --- modules/flowmldetection/plot_testing_performance.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index 69b8c96a8c..de4ada38b3 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -123,10 +123,10 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): # Avoid log(0), so set the minimum limit a little higher than zero if min_val == 0: - min_val = 1e-4 # Avoid zero values on the logarithmic scale + min_val = 1e-8 # Avoid zero values on the logarithmic scale plt.ylim(min_val, max_val) # Set Y-axis limits based on the data range - plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6)) # Set ticks logarithmically + plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=60)) # Set ticks logarithmically plt.xlabel('Index') plt.ylabel('Metric Value') From 499fe19c08b34469a0f7826d614ceababc9d0849 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:25:58 +0000 Subject: [PATCH 431/455] Change plot names --- modules/flowmldetection/plot_testing_performance.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index de4ada38b3..1b4152c6eb 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -72,10 +72,10 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu close_to_1['recall'].append(recall_values[i]) # Plot metrics for values close to 0 (linear scale) - plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True) + plot_single_group(close_to_0, 'performance_metrics_testing_close_to_0.png', is_close_to_0=True) # Plot metrics for values close to 1 (log scale) - plot_single_group(close_to_1, 'metrics_plot_close_to_1.png', is_close_to_0=False) + plot_single_group(close_to_1, 'performnace_metrics_teting_close_to_1.png', is_close_to_0=False) # Print the final values print("\nFinal Metric Values:") From 8735210db117c14006ef382bf21051b90cd6c01c Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:26:09 +0000 Subject: [PATCH 432/455] Rename file --- .../flowmldetection/plot_train_performance.py | 130 +++++++----------- modules/flowmldetection/plot_train_score.py | 87 ------------ 2 files changed, 53 insertions(+), 164 deletions(-) delete mode 100644 modules/flowmldetection/plot_train_score.py diff --git a/modules/flowmldetection/plot_train_performance.py b/modules/flowmldetection/plot_train_performance.py index 304f0f4ead..80e13e9515 100644 --- a/modules/flowmldetection/plot_train_performance.py +++ b/modules/flowmldetection/plot_train_performance.py @@ -4,108 +4,84 @@ import sys import argparse import os -import matplotlib.ticker as ticker -def plot_log_data(file_path, experiment_number): +def plot_log_data(file_path): # Read the log data from the file with open(file_path, 'r') as file: log_data = file.read() - # Regex pattern for the new log format - pattern = ( - r"Total labels: ([\d\.]+), Background: (\d+). Benign: (\d+). Malicious: (\d+). Metrics: " - r"FPR=([\d\.]+), TNR=([\d\.]+), TPR=([\d\.]+), FNR=([\d\.]+), " - r"F1=([\d\.]+), Precision=([\d\.]+), Accuracy=([\d\.]+), MCC=([\d\.]+), Recall=([\d\.]+)\." - ) + # Define regex pattern to extract relevant data from each line + pattern = r"Background: (\d+). Benign: (\d+). Malicious: (\d+). Total labels: (\d+\.\d+). Score: (\d+\.\d+)" # Parse the log file data = re.findall(pattern, log_data) # Convert data to a DataFrame - columns = [ - "Total labels", "Background", "Benign", "Malicious", - "FPR", "TNR", "TPR", "FNR", "F1", "Precision", "Accuracy", "MCC", "Recall" - ] - df = pd.DataFrame(data, columns=columns) + df = pd.DataFrame(data, columns=["Background", "Benign", "Malicious", "Total labels", "Score"]) df = df.astype({ - "Total labels": float, "Background": int, "Benign": int, "Malicious": int, - "FPR": float, - "TNR": float, - "TPR": float, - "FNR": float, - "F1": float, - "Precision": float, - "Accuracy": float, - "MCC": float, - "Recall": float, + "Total labels": float, + "Score": float }) + # Get the directory of the log file to store the plot in the same folder dir_name = os.path.dirname(file_path) + plot_file = os.path.join(dir_name, 'performance_metrics_training.png') + + # Plotting the values + fig, ax1 = plt.subplots(figsize=(10, 6)) - # --- Plot 1: Number of labels (linear scale, no total labels) --- - fig1, ax1 = plt.subplots(figsize=(10, 6)) - ax1.plot(df.index, df["Background"], label="Background", color='black') - ax1.plot(df.index, df["Benign"], label="Benign", color='cyan') - ax1.plot(df.index, df["Malicious"], label="Malicious", color='magenta') + # Plotting Score on the left y-axis (with proper scaling from 0 to 1) + ax1.plot(df.index, df["Score"], label="Score", color='tab:blue') ax1.set_xlabel('Index') - ax1.set_ylabel('Label Counts') - ax1.set_title(f'Label Counts - Experiment {experiment_number}') - ax1.legend() - ax1.yaxis.set_major_locator(ticker.MaxNLocator(70)) - ax1.xaxis.set_major_locator(ticker.MaxNLocator(50)) - plt.tight_layout() - plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_labels.png')) - - # --- Plot 2: FNR and FPR (log scale) --- - fig2, ax2 = plt.subplots(figsize=(10, 6)) - ax2.plot(df.index, df["FNR"], label="FNR", color='red') - ax2.plot(df.index, df["FPR"], label="FPR", color='blue') - ax2.set_xlabel('Index') - ax2.set_ylabel('Rate') - ax2.set_yscale('log') - ax2.set_title(f'FNR and FPR - Experiment {experiment_number}') - ax2.legend() - ax2.yaxis.set_major_locator(ticker.MaxNLocator(100)) - ax2.xaxis.set_major_locator(ticker.MaxNLocator(50)) - plt.tight_layout() - plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_fnr_fpr.png')) - - # --- Plot 3: Other metrics (log scale) --- - fig3, ax3 = plt.subplots(figsize=(12, 7)) - metrics_rest = ["TNR", "TPR", "F1", "Precision", "Accuracy", "MCC", "Recall"] - colors_rest = [ - 'tab:blue', 'tab:green', 'tab:purple', 'tab:brown', - 'tab:gray', 'tab:pink', 'tab:olive' - ] - for metric, color in zip(metrics_rest, colors_rest): - ax3.plot(df.index, df[metric], label=metric, color=color) - ax3.set_xlabel('Index') - ax3.set_ylabel('Metric Value') - ax3.set_yscale('log') - ax3.set_title(f'Performance Metrics (except FNR/FPR) - Experiment {experiment_number}') - ax3.legend() - ax3.yaxis.set_major_locator(ticker.MaxNLocator(50)) - ax3.xaxis.set_major_locator(ticker.MaxNLocator(50)) - plt.tight_layout() - plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_other_metrics.png')) + ax1.set_ylabel('Score', color='tab:blue') + ax1.set_ylim(0, 1) # Set y-axis for Score from 0 to 1 + ax1.tick_params(axis='y', labelcolor='tab:blue') - plt.show() + # Create the second y-axis for the Background, Benign, Malicious + ax2 = ax1.twinx() + ax2.plot(df.index, df["Background"], label="Background Labels", color='tab:green', linestyle='--') + ax2.plot(df.index, df["Benign"], label="Benign Labels", color='tab:orange', linestyle='--') + ax2.plot(df.index, df["Malicious"], label="Malicious Labels", color='tab:pink', linestyle='--') + ax2.set_ylabel('Background, Benign, Malicious Labels', color='tab:red') + + # Set appropriate scale for right y-axis based on the data + ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max()) + ax2.tick_params(axis='y', labelcolor='tab:red') + + # Annotating Total labels as text on the plot + for i, value in enumerate(df["Total labels"]): + ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom') + + # Adding title and legend + plt.title('Training performance') + fig.tight_layout() - # --- Print final values in terminal --- - print("\nFinal values at last training step:") - for col in ["Total labels", "Background", "Benign", "Malicious", - "FPR", "TNR", "TPR", "FNR", "F1", "Precision", "Accuracy", "MCC", "Recall"]: - print(f"{col}: {df[col].iloc[-1]}") + # Move both legends further to the right + ax1.legend(loc='upper right', bbox_to_anchor=(1.26, 1), fontsize='small', ncol=1) + ax2.legend(loc='upper right', bbox_to_anchor=(1.4, 0.95), fontsize='small', ncol=1) + + # Increase right margin for better readability of legend + plt.subplots_adjust(right=0.7) + + # Save plot to the same folder as the log file + plt.savefig(plot_file) + + # Display the plot + plt.show() def main(): + # Parse command-line arguments parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.") - parser.add_argument('-f', '--file', metavar='log_file', type=str, required=True, help="Path to the log file") - parser.add_argument('-e', '--experiment', metavar='experiment_number', type=str, required=True, help="Experiment number to add to the filename") + parser.add_argument('log_file', metavar='log_file', type=str, help="Path to the log file") + + # Handle -h / --help args = parser.parse_args() - plot_log_data(args.file, args.experiment) + + # Call the function to process the log file + plot_log_data(args.log_file) if __name__ == "__main__": main() diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py deleted file mode 100644 index 8437e968ac..0000000000 --- a/modules/flowmldetection/plot_train_score.py +++ /dev/null @@ -1,87 +0,0 @@ -import pandas as pd -import matplotlib.pyplot as plt -import re -import sys -import argparse -import os - -def plot_log_data(file_path): - # Read the log data from the file - with open(file_path, 'r') as file: - log_data = file.read() - - # Define regex pattern to extract relevant data from each line - pattern = r"Background: (\d+). Benign: (\d+). Malicious: (\d+). Total labels: (\d+\.\d+). Score: (\d+\.\d+)" - - # Parse the log file - data = re.findall(pattern, log_data) - - # Convert data to a DataFrame - df = pd.DataFrame(data, columns=["Background", "Benign", "Malicious", "Total labels", "Score"]) - df = df.astype({ - "Background": int, - "Benign": int, - "Malicious": int, - "Total labels": float, - "Score": float - }) - - # Get the directory of the log file to store the plot in the same folder - dir_name = os.path.dirname(file_path) - plot_file = os.path.join(dir_name, 'log_data_plot_with_two_scales.png') - - # Plotting the values - fig, ax1 = plt.subplots(figsize=(10, 6)) - - # Plotting Score on the left y-axis (with proper scaling from 0 to 1) - ax1.plot(df.index, df["Score"], label="Score", color='tab:blue') - ax1.set_xlabel('Index') - ax1.set_ylabel('Score', color='tab:blue') - ax1.set_ylim(0, 1) # Set y-axis for Score from 0 to 1 - ax1.tick_params(axis='y', labelcolor='tab:blue') - - # Create the second y-axis for the Background, Benign, Malicious - ax2 = ax1.twinx() - ax2.plot(df.index, df["Background"], label="Background Labels", color='tab:green', linestyle='--') - ax2.plot(df.index, df["Benign"], label="Benign Labels", color='tab:orange', linestyle='--') - ax2.plot(df.index, df["Malicious"], label="Malicious Labels", color='tab:pink', linestyle='--') - ax2.set_ylabel('Background, Benign, Malicious Labels', color='tab:red') - - # Set appropriate scale for right y-axis based on the data - ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max()) - ax2.tick_params(axis='y', labelcolor='tab:red') - - # Annotating Total labels as text on the plot - for i, value in enumerate(df["Total labels"]): - ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom') - - # Adding title and legend - plt.title('Training performance') - fig.tight_layout() - - # Move both legends further to the right - ax1.legend(loc='upper right', bbox_to_anchor=(1.26, 1), fontsize='small', ncol=1) - ax2.legend(loc='upper right', bbox_to_anchor=(1.4, 0.95), fontsize='small', ncol=1) - - # Increase right margin for better readability of legend - plt.subplots_adjust(right=0.7) - - # Save plot to the same folder as the log file - plt.savefig(plot_file) - - # Display the plot - plt.show() - -def main(): - # Parse command-line arguments - parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.") - parser.add_argument('log_file', metavar='log_file', type=str, help="Path to the log file") - - # Handle -h / --help - args = parser.parse_args() - - # Call the function to process the log file - plot_log_data(args.log_file) - -if __name__ == "__main__": - main() From a454bd7b3fca49d80a02d05783b2637b57101d9c Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:31:32 +0000 Subject: [PATCH 433/455] Recover good flowmldetection deleted by mistake --- modules/flowmldetection/flowmldetection.py | 709 ++++++++++++++++----- 1 file changed, 566 insertions(+), 143 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 37f0761109..5e4e9aa462 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -1,143 +1,566 @@ -import matplotlib.pyplot as plt -import sys -import numpy as np - -def process_file(file_path): - # Initialize the counters for the values - FPR_values = [] - FNR_values = [] - TNR_values = [] - TPR_values = [] - F1_values = [] - accuracy_values = [] - precision_values = [] - MCC_values = [] - recall_values = [] - - # Read the file and extract the data - with open(file_path, 'r') as file: - for line in file: - if "TP:" in line: - # Extract the values from the line - parts = line.split(',') - TP = int(parts[0].split(':')[1].strip()) - TN = int(parts[1].split(':')[1].strip()) - FP = int(parts[2].split(':')[1].strip()) - FN = int(parts[3].split(':')[1].strip()) - - # Calculate metrics - FPR = FP / (FP + TN) if (FP + TN) != 0 else 0 - FNR = FN / (FN + TP) if (FN + TP) != 0 else 0 - TNR = TN / (TN + FP) if (TN + FP) != 0 else 0 - TPR = TP / (TP + FN) if (TP + FN) != 0 else 0 - Precision = TP / (TP + FP) if (TP + FP) != 0 else 0 - Recall = TPR # Recall is the same as TPR - F1 = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) != 0 else 0 - Accuracy = (TP + TN) / (TP + TN + FP + FN) - MCC = ((TP * TN) - (FP * FN)) / np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) if ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) != 0 else 0 - - # Append the values to the respective lists - FPR_values.append(FPR) - FNR_values.append(FNR) - TNR_values.append(TNR) - TPR_values.append(TPR) - F1_values.append(F1) - accuracy_values.append(Accuracy) - precision_values.append(Precision) - MCC_values.append(MCC) - recall_values.append(Recall) - - return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values - -def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values): - # Separate the values into two groups based on their proximity to 0 or 1 - close_to_0 = { - 'FPR': [], 'FNR': [] - } - close_to_1 = { - 'TNR': [], 'TPR': [], 'F1': [], 'accuracy': [], 'precision': [], 'MCC': [], 'recall': [] - } - - # Categorize the metrics into two groups - for i in range(len(FPR_values)): - close_to_0['FPR'].append(FPR_values[i]) - close_to_0['FNR'].append(FNR_values[i]) - - close_to_1['TNR'].append(TNR_values[i]) - close_to_1['TPR'].append(TPR_values[i]) - close_to_1['F1'].append(F1_values[i]) - close_to_1['accuracy'].append(accuracy_values[i]) - close_to_1['precision'].append(precision_values[i]) - close_to_1['MCC'].append(MCC_values[i]) - close_to_1['recall'].append(recall_values[i]) - - # Plot metrics for values close to 0 - plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True) - - # Plot metrics for values close to 1 - plot_single_group(close_to_1, 'metrics_plot_close_to_1.png') - - # Print the final values - print("\nFinal Metric Values:") - print(f"Final FPR: {FPR_values[-1]:.4f}") - print(f"Final FNR: {FNR_values[-1]:.4f}") - print(f"Final TNR: {TNR_values[-1]:.4f}") - print(f"Final TPR: {TPR_values[-1]:.4f}") - print(f"Final F1 Score: {F1_values[-1]:.4f}") - print(f"Final Accuracy: {accuracy_values[-1]:.4f}") - print(f"Final Precision: {precision_values[-1]:.4f}") - print(f"Final MCC: {MCC_values[-1]:.4f}") - print(f"Final Recall: {recall_values[-1]:.4f}") - -def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): - plt.figure(figsize=(12, 8)) - - # Only plot the metrics that exist in the dictionary - if 'FPR' in metrics_dict: - plt.plot(metrics_dict['FPR'], label='False Positive Rate (FPR)', marker='o') - if 'FNR' in metrics_dict: - plt.plot(metrics_dict['FNR'], label='False Negative Rate (FNR)', marker='o') - if 'TNR' in metrics_dict: - plt.plot(metrics_dict['TNR'], label='True Negative Rate (TNR)', marker='o') - if 'TPR' in metrics_dict: - plt.plot(metrics_dict['TPR'], label='True Positive Rate (TPR)', marker='o') - if 'F1' in metrics_dict: - plt.plot(metrics_dict['F1'], label='F1 Score', marker='o') - if 'accuracy' in metrics_dict: - plt.plot(metrics_dict['accuracy'], label='Accuracy', marker='o') - if 'precision' in metrics_dict: - plt.plot(metrics_dict['precision'], label='Precision', marker='o') - if 'MCC' in metrics_dict: - plt.plot(metrics_dict['MCC'], label='Matthews Correlation Coefficient (MCC)', marker='o') - if 'recall' in metrics_dict: - plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o') - - # Apply log scale by default - plt.yscale('log') - - # If the plot is close to 0, set custom ticks - if is_close_to_0: - # Add more ticks between 0 and 1 (using a logarithmic scale) - plt.yticks([0.01, 0.1, 1, 10, 100], ['0.01', '0.1', '1', '10', '100']) - - plt.xlabel('Index') - plt.ylabel('Metric Value') - plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})') - plt.legend() - - # Save the plot - plt.savefig(output_filename) - plt.close() - -def main(): - if len(sys.argv) != 2: - print("Usage: python script.py ") - sys.exit(1) - - file_path = sys.argv[1] - - FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path) - plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values) - -if __name__ == "__main__": - main() +# SPDX-FileCopyrightText: 2021 Sebastian Garcia +from typing import Optional + +# SPDX-License-Identifier: GPL-2.0-only +import numpy +from sklearn.linear_model import SGDClassifier +from sklearn.preprocessing import StandardScaler +import pickle +import pandas as pd +import json +import traceback +import warnings +import os + +from slips_files.common.parsers.config_parser import ConfigParser +from slips_files.common.slips_utils import utils +from slips_files.common.abstracts.module import IModule +from slips_files.core.structures.evidence import ( + Evidence, + ProfileID, + TimeWindow, + Attacker, + ThreatLevel, + EvidenceType, + IoCType, + Direction, + Victim, + Method, +) + +# This horrible hack is only to stop sklearn from printing those warnings +def warn(*args, **kwargs): + pass + + +warnings.warn = warn + + +class FlowMLDetection(IModule): + # Name: short name of the module. Do not use spaces + name = "Flow ML Detection" + description = ( + "Train or test a Machine Learning model to detect malicious flows" + ) + authors = ["Sebastian Garcia"] + + def init(self): + # Subscribe to the channel + self.c1 = self.db.subscribe("new_flow") + self.channels = {"new_flow": self.c1} + self.fieldseparator = self.db.get_field_separator() + # Set the output queue of our database instance + # Read the configuration + self.read_configuration() + # Minum amount of new labels needed to start the train + self.minimum_labels_to_start_train = 50 + # Minum amount of new labels needed to retrain + self.minimum_labels_to_retrain = 50 + # The number of flows when last trained. Used internally only to know + # when to retrain + self.last_number_of_flows_when_trained = 0 + # The scaler trained during training and to use during testing + self.scaler = StandardScaler() + self.model_path = "./modules/flowmldetection/model.bin" + self.scaler_path = "./modules/flowmldetection/scaler.bin" + + # Initialize the training log file + self.training_log_path = "./modules/flowmldetection/training.log" + with open(self.training_log_path, "w") as log_file: + log_file.write("Training Log Initialized\n") + + def read_configuration(self): + conf = ConfigParser() + self.mode = conf.get_ml_mode() + # This is the global label in the configuration, + # in case the flows do not have a label themselves + self.label = conf.label() + + def write_to_training_log(self, message: str): + """ + Write a message to the training log file. + """ + try: + with open(self.training_log_path, "a") as log_file: + log_file.write(message + "\n") + except Exception as e: + self.print(f"Error writing to training log: {e}", 0, 1) + + def train(self, sum_labeled_flows): + """ + Train a model based on the flows we receive and the labels + """ + try: + # Create X_flow with the current flows minus the label + X_flow = self.flows.drop("ground_truth_label", axis=1) + # Drop the detailed labels + X_flow = X_flow.drop("detailed_ground_truth_label", axis=1) + # Drop the module_labels + X_flow = X_flow.drop("module_labels", axis=1) + # Create y_flow with the label + y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label) + + # Normalize this batch of data so far. This can get progressivle slow + X_flow = self.scaler.fit_transform(X_flow) + + # Count the number of labels of each type in this epoc + epoch_label_counts = { + "Background": (y_flow == "Background").sum(), + "Malicious": (y_flow == "Malicious").sum(), + "Benign": (y_flow == "Benign").sum(), + } + + # Train + try: + # Online incremental learning + self.clf.partial_fit( + X_flow, y_flow, classes=["Background", "Malicious", "Benign"] + ) + except Exception: + self.print("Error while calling clf.train()") + self.print(traceback.format_exc(), 0, 1) + + # See score so far in training + score = self.clf.score(X_flow, y_flow) + + #self.print(f" Training Score: {score}", 1, 0) + #self.print(f' Model Parameters: {self.clf.coef_}', 1, 0) + + # Store the models on disk + self.store_model() + + # Log training information + self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}") + #self.write_to_training_log(f"Model parameters: {self.clf.coef_}") + except Exception: + self.print("Error in train().", 0, 1) + self.print(traceback.format_exc(), 0, 1) + self.write_to_training_log("Error occurred during training.") + + def process_features(self, dataset): + """ + Discards some features of the dataset and can create new. + Clean the dataset + """ + try: + # Discard some type of flows that dont have ports + to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""] + for proto in to_discard: + dataset = dataset[dataset.proto != proto] + + # If te proto is in the list to delete and there is only one flow, then the dataset will be empty + if dataset.empty: + # DataFrame is empty now, so return empty + return dataset + + # For now, discard these + to_drop = [ + "appproto", + "daddr", + "saddr", + "starttime", + "type_", + "smac", + "dmac", + "history", + "uid", + "dir_", + "endtime", + "flow_source", + ] + for field in to_drop: + try: + dataset = dataset.drop(field, axis=1) + except (ValueError, KeyError): + pass + + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others + # So transform here + dataset["state"] = dataset.apply( + lambda row: self.db.get_final_state_from_flags( + row["state"], (row["spkts"] + row["dpkts"]) + ), + axis=1, + ) + + # Convert state to categorical + dataset.state = dataset.state.str.replace( + r"(^.*Not Established.*$)", "0", regex=True + ) + dataset.state = dataset.state.str.replace( + r"(^.*Established.*$)", "1", regex=True + ) + + # Convert categories to floats + dataset.state = dataset.state.astype("float64") + + # Convert proto to categorical. For now we only have few states, so we can hardcode... + # We dont use the data to create categories because in testing mode + # we dont see all the protocols + # Also we dont store the Categorizer because the user can retrain + # with its own data. + dataset.proto = dataset.proto.str.lower() + dataset.proto = dataset.proto.str.replace( + r"(^.*tcp.*$)", "0", regex=True + ) + dataset.proto = dataset.proto.str.replace( + r"(^.*udp.*$)", "1", regex=True + ) + dataset.proto = dataset.proto.str.replace( + r"(^.*icmp.*$)", "2", regex=True + ) + dataset.proto = dataset.proto.str.replace( + r"(^.*icmp-ipv6.*$)", "3", regex=True + ) + dataset.proto = dataset.proto.str.replace( + r"(^.*arp.*$)", "4", regex=True + ) + + dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"] + dataset["pkts"] = dataset["spkts"] + dataset["dpkts"] + + fields_to_convert_to_float = [ + dataset.proto, + dataset.dport, + dataset.sport, + dataset.dur, + dataset.pkts, + dataset.spkts, + dataset.allbytes, + dataset.sbytes, + dataset.state, + ] + for field in fields_to_convert_to_float: + try: + field = field.astype("float64") + except (ValueError, AttributeError): + pass + + return dataset + except Exception: + # Stop the timer + self.print("Error in process_features()") + self.print(traceback.format_exc(), 0, 1) + + def process_training_flows(self, last_number_of_flows_when_trained): + """ + Process only the new flows in the DB since the last training. + Store the pandas df in self.flows + """ + try: + # Ensure the index is an integer + if last_number_of_flows_when_trained is None: + last_number_of_flows_when_trained = 0 + else: + last_number_of_flows_when_trained = int(last_number_of_flows_when_trained) + + # We get all the flows so far + flows = self.db.get_all_flows() + # Only process new flows since last training + new_flows = flows[last_number_of_flows_when_trained:] + + # Check how many **different** labels are in the DB + labels = self.db.get_labels() + if len(labels) == 1: + # Insert fake flows for both classes if needed + new_flows.append( + { + "starttime": 1594417039.029793, + "dur": "1.9424750804901123", + "saddr": "10.7.10.101", + "sport": "49733", + "daddr": "40.70.224.145", + "dport": "443", + "proto": "tcp", + "state": "SF", + "spkts": 17, + "dpkts": 27, + "sbytes": 25517, + "dbytes": 17247, + "appproto": "ssl", + "ground_truth_label": "Malicious", + "module_labels": { + "flowalerts-long-connection": "Malicious" + }, + } + ) + new_flows.append( + { + "starttime": 1382355032.706468, + "dur": "10.896695", + "saddr": "147.32.83.52", + "sport": "47956", + "daddr": "80.242.138.72", + "dport": "80", + "proto": "tcp", + "state": "SF", + "spkts": 1, + "dpkts": 0, + "sbytes": 100, + "dbytes": 67596, + "appproto": "http", + "ground_truth_label": "Benign", + "module_labels": { + "flowalerts-long-connection": "Benign" + }, + } + ) + + # Convert to pandas df + df_flows = pd.DataFrame(new_flows) + + # Process features + df_flows = self.process_features(df_flows) + + # Update the flow to the processed version + self.flows = df_flows + except Exception: + self.print("Error in process_flows()") + self.print(traceback.format_exc(), 0, 1) + + def process_flow(self, flow_to_process: dict): + """ + Process one flow. Only used during detection in testing + returns the pandas df with the processed flow + """ + try: + # Convert the flow to a pandas dataframe + raw_flow = pd.DataFrame(flow_to_process, index=[0]) + dflow = self.process_features(raw_flow) + if dflow.empty: + return None + # Update the flow to the processed version + return dflow + except Exception: + # Stop the timer + self.print("Error in process_flow()") + self.print(traceback.format_exc(), 0, 1) + + def detect(self, x_flow) -> Optional[numpy.ndarray]: + """ + Detects the given flow with the current model stored + and returns the predection array + """ + try: + # clean the flow + fields_to_drop = [ + "label", + "module_labels", + "uid", + "history", + "dir_", + "endtime", + "flow_source", + "ground_truth_label", + "detailed_ground_truth_label", + ] + # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. + # Error + ''' [Flow ML Detection] Error in detect() while processing + dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes + 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 + The feature names should match those that were passed during fit. + Feature names unseen at fit time: + - bytes + ''' + + # IF we delete here the filed bytes the error is + # [Flow ML Detection] Error in detect() while processing + # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes + # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 + # The feature names should match those that were passed during fit. + # Feature names must be in the same order as they were in fit. + + for field in fields_to_drop: + try: + x_flow = x_flow.drop(field, axis=1) + except (KeyError, ValueError): + pass + # Scale the flow + x_flow: numpy.ndarray = self.scaler.transform(x_flow) + pred: numpy.ndarray = self.clf.predict(x_flow) + return pred + except Exception as e: + self.print( + f"Error in detect() while processing " f"\n{x_flow}\n{e}" + ) + self.print(traceback.format_exc(), 0, 1) + + def store_model(self): + """ + Store the trained model on disk + """ + self.print("Storing the trained model and scaler on disk.", 0, 2) + with open(self.model_path, "wb") as f: + data = pickle.dumps(self.clf) + f.write(data) + with open(self.scaler_path, "wb") as g: + data = pickle.dumps(self.scaler) + g.write(data) + + def read_model(self): + """ + Read the trained model from disk + """ + try: + self.print("Reading the trained model from disk.", 0, 2) + with open(self.model_path, "rb") as f: + self.clf = pickle.load(f) + self.print("Reading the trained scaler from disk.", 0, 2) + with open(self.scaler_path, "rb") as g: + self.scaler = pickle.load(g) + except FileNotFoundError: + # If there is no model, create one empty + self.print( + "There was no model. " "Creating a new empty model.", 0, 2 + ) + self.clf = SGDClassifier( + warm_start=True, loss="hinge", penalty="l1" + ) + except EOFError: + self.print( + "Error reading model from disk. " + "Creating a new empty model.", + 0, + 2, + ) + self.clf = SGDClassifier( + warm_start=True, loss="hinge", penalty="l1" + ) + + def set_evidence_malicious_flow(self, flow: dict, twid: str): + confidence: float = 0.1 + description = ( + f"Flow with malicious characteristics by ML. Src IP" + f" {flow['saddr']}:{flow['sport']} to " + f"{flow['daddr']}:{flow['dport']}" + ) + twid_number = int(twid.replace("timewindow", "")) + evidence: Evidence = Evidence( + evidence_type=EvidenceType.MALICIOUS_FLOW, + attacker=Attacker( + direction=Direction.SRC, + ioc_type=IoCType.IP, + value=flow["saddr"], + ), + victim=Victim( + direction=Direction.DST, + ioc_type=IoCType.IP, + value=flow["daddr"], + ), + threat_level=ThreatLevel.LOW, + confidence=confidence, + description=description, + profile=ProfileID(ip=flow["saddr"]), + timewindow=TimeWindow(twid_number), + uid=[flow["uid"]], + timestamp=flow["starttime"], + method=Method.AI, + src_port=flow["sport"], + dst_port=flow["dport"], + ) + + self.db.set_evidence(evidence) + + def shutdown_gracefully(self): + # Confirm that the module is done processing + if self.mode == "train": + self.store_model() + + def pre_main(self): + utils.drop_root_privs() + # Load the model + self.read_model() + + def main(self): + if msg := self.get_msg("new_flow"): + # When a new flow arrives + msg = json.loads(msg["data"]) + self.twid = msg["twid"] + self.profileid = msg["profileid"] + self.flow = msg["flow"] + # These following extra fields are expected in testing. update the original + # flow dict to have them + self.flow.update( + { + "state": msg["interpreted_state"], + "label": msg["label"], + "module_labels": msg["module_labels"], + } + ) + + if self.mode == "train": + # We are training + + # Is the amount in the DB of labels enough to retrain? + # Use labeled flows + labels = self.db.get_labels() + sum_labeled_flows = sum(i[1] for i in labels) + + # The min labels to retrain is the min number of flows + # we should have seen so far in this capture to start training + # This is so we dont _start_ training with only 1 flow + + # Once we are over the start minimum, the second condition is + # to force to retrain every a minimum_labels_to_retrain number + # of flows. So we dont retrain every 1 flow. + if ( + sum_labeled_flows >= self.minimum_labels_to_start_train + ): + if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain): + # So for example we retrain every 50 labels and only when + # we have at least 50 labels + self.print( + f"Training the model with the last group of " + f"flows and labels. Total flows: {sum_labeled_flows}." + ) + # Process all flows in the DB and make them ready + # for pandas + self.process_training_flows(self.last_number_of_flows_when_trained) + # Train an algorithm + self.train(sum_labeled_flows) + self.last_number_of_flows_when_trained = sum_labeled_flows + + elif self.mode == "test": + # We are testing, which means using the model to detect + processed_flow = self.process_flow(self.flow) + + # After processing the flow, it may happen that we + # delete icmp/arp/etc so the dataframe can be empty + if processed_flow is not None and not processed_flow.empty: + # Predict + pred: numpy.ndarray = self.detect(processed_flow) + if not pred: + # an error occurred + return + + label = self.flow["label"] + if label and label != "unknown" and label != pred[0]: + # If the user specified a label in test mode, + # and the label is diff from the prediction, + # print in debug mode + self.print( + f"Predicted {pred[0]} for ground-truth label" + f' {label}. Flow {self.flow["saddr"]}:' + f'{self.flow["sport"]} ->' + f' {self.flow["daddr"]}:' + f'{self.flow["dport"]}/' + f'{self.flow["proto"]}', + 0, + 3, + ) + if pred[0] == "Malicious": + # Generate an alert + self.set_evidence_malicious_flow(self.flow, self.twid) + self.print( + f"Prediction {pred[0]} for label {label}" + f' flow {self.flow["saddr"]}:' + f'{self.flow["sport"]} -> ' + f'{self.flow["daddr"]}:' + f'{self.flow["dport"]}/' + f'{self.flow["proto"]}', + 0, + 2, + ) \ No newline at end of file From 3da80024964515c0df1aee115d68a9c73cba1c7e Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:31:43 +0000 Subject: [PATCH 434/455] Fix plot test --- modules/flowmldetection/plot_testing_performance.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index 1b4152c6eb..977a68b2d5 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -75,7 +75,7 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu plot_single_group(close_to_0, 'performance_metrics_testing_close_to_0.png', is_close_to_0=True) # Plot metrics for values close to 1 (log scale) - plot_single_group(close_to_1, 'performnace_metrics_teting_close_to_1.png', is_close_to_0=False) + plot_single_group(close_to_1, 'performance_metrics_testing_close_to_1.png', is_close_to_0=False) # Print the final values print("\nFinal Metric Values:") @@ -123,10 +123,10 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): # Avoid log(0), so set the minimum limit a little higher than zero if min_val == 0: - min_val = 1e-8 # Avoid zero values on the logarithmic scale + min_val = 1e-4 # Avoid zero values on the logarithmic scale plt.ylim(min_val, max_val) # Set Y-axis limits based on the data range - plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=60)) # Set ticks logarithmically + plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6)) # Set ticks logarithmically plt.xlabel('Index') plt.ylabel('Metric Value') From d4e2666af9c2454ebbffd2dbc7f338c99bfc63a5 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:50:33 +0000 Subject: [PATCH 435/455] Add testing code to evaluate performance. It is optional with a varible --- modules/flowmldetection/flowmldetection.py | 60 +++++++++++++++------- 1 file changed, 42 insertions(+), 18 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 5e4e9aa462..b17a1baaf0 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -526,36 +526,21 @@ def main(self): elif self.mode == "test": # We are testing, which means using the model to detect processed_flow = self.process_flow(self.flow) - # After processing the flow, it may happen that we # delete icmp/arp/etc so the dataframe can be empty if processed_flow is not None and not processed_flow.empty: + original_label = processed_flow["ground_truth_label"].iloc[0] # Predict pred: numpy.ndarray = self.detect(processed_flow) if not pred: # an error occurred return - label = self.flow["label"] - if label and label != "unknown" and label != pred[0]: - # If the user specified a label in test mode, - # and the label is diff from the prediction, - # print in debug mode - self.print( - f"Predicted {pred[0]} for ground-truth label" - f' {label}. Flow {self.flow["saddr"]}:' - f'{self.flow["sport"]} ->' - f' {self.flow["daddr"]}:' - f'{self.flow["dport"]}/' - f'{self.flow["proto"]}', - 0, - 3, - ) if pred[0] == "Malicious": # Generate an alert self.set_evidence_malicious_flow(self.flow, self.twid) self.print( - f"Prediction {pred[0]} for label {label}" + f"Prediction {pred[0]} for label {original_label}" f' flow {self.flow["saddr"]}:' f'{self.flow["sport"]} -> ' f'{self.flow["daddr"]}:' @@ -563,4 +548,43 @@ def main(self): f'{self.flow["proto"]}', 0, 2, - ) \ No newline at end of file + ) + + # So you can disable this code easily. Since it is used only for evaluating a testing + log_testing_data = True + if log_testing_data: + # Initialize counters if not already done + if not hasattr(self, 'tp'): + self.tp = 0 + if not hasattr(self, 'tn'): + self.tn = 0 + if not hasattr(self, 'fp'): + self.fp = 0 + if not hasattr(self, 'fn'): + self.fn = 0 + + + # Update counters based on predictions and labels + if pred[0] == "Malicious" and original_label == "Malicious": + self.tp += 1 + elif pred[0] == "Benign" and original_label == "Benign": + self.tn += 1 + elif pred[0] == "Malicious" and original_label == "Benign": + self.fp += 1 + elif pred[0] == "Benign" and original_label == "Malicious": + self.fn += 1 + + testing_log_path = "./modules/flowmldetection/testing_performance.log" + try: + with open(testing_log_path, "a") as log_file: + log_file.write("Testing Performance Log Initialized\n") + # Log the testing performance metrics + log_file.write(f"TP: {self.tp}, TN: {self.tn}, FP: {self.fp}, FN: {self.fn}\n") + + # Log the original flow for false positives and false negatives + if pred[0] == "Malicious" and original_label == "Benign": + log_file.write(f"False Positive Flow: {self.flow}\n") + elif pred[0] == "Benign" and original_label == "Malicious": + log_file.write(f"False Negative Flow: {self.flow}\n") + except Exception as e: + self.print(f"Error initializing testing performance log: {e}", 0, 1) \ No newline at end of file From 5d2d84a80cf2a77f160bc5cb16a46ae9700ff9a0 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 19:04:00 +0000 Subject: [PATCH 436/455] Fix plots --- .../plot_testing_performance.py | 30 +++++++++++-------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index 977a68b2d5..6865415cdf 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -1,6 +1,7 @@ import matplotlib.pyplot as plt import sys import numpy as np +import argparse def process_file(file_path): # Initialize the counters for the values @@ -49,7 +50,7 @@ def process_file(file_path): return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values -def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values): +def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, experiment_number): # Separate the values into two groups based on their proximity to 0 or 1 close_to_0 = { 'FPR': [], 'FNR': [] @@ -72,13 +73,13 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu close_to_1['recall'].append(recall_values[i]) # Plot metrics for values close to 0 (linear scale) - plot_single_group(close_to_0, 'performance_metrics_testing_close_to_0.png', is_close_to_0=True) + plot_single_group(close_to_0, f'performance_metrics_testing_close_to_0_experiment_{experiment_number}.png', experiment_number, is_close_to_0=True) # Plot metrics for values close to 1 (log scale) - plot_single_group(close_to_1, 'performance_metrics_testing_close_to_1.png', is_close_to_0=False) + plot_single_group(close_to_1, f'performance_metrics_testing_close_to_1_experiment_{experiment_number}.png', experiment_number, is_close_to_0=False) # Print the final values - print("\nFinal Metric Values:") + print("\nFinal Metric Values for Experiment", experiment_number) print(f"Final FPR: {FPR_values[-1]:.4f}") print(f"Final FNR: {FNR_values[-1]:.4f}") print(f"Final TNR: {TNR_values[-1]:.4f}") @@ -89,7 +90,7 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu print(f"Final MCC: {MCC_values[-1]:.4f}") print(f"Final Recall: {recall_values[-1]:.4f}") -def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): +def plot_single_group(metrics_dict, output_filename, experiment_number, is_close_to_0=False): plt.figure(figsize=(12, 8)) # Only plot the metrics that exist in the dictionary @@ -126,11 +127,12 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): min_val = 1e-4 # Avoid zero values on the logarithmic scale plt.ylim(min_val, max_val) # Set Y-axis limits based on the data range - plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6)) # Set ticks logarithmically + plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=60)) # Set ticks logarithmically + # Add the experiment number to the plot title plt.xlabel('Index') plt.ylabel('Metric Value') - plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})') + plt.title(f'Experiment {experiment_number} - Evaluation Metrics Over Time') plt.legend() # Save the plot @@ -138,14 +140,18 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): plt.close() def main(): - if len(sys.argv) != 2: - print("Usage: python script.py ") - sys.exit(1) + # Set up argument parsing + parser = argparse.ArgumentParser(description='Plot testing performance metrics.') + parser.add_argument('-f', '--file', type=str, required=True, help='Path to the testing performance log file') + parser.add_argument('-e', '--experiment', type=str, required=True, help='Experiment number') + + args = parser.parse_args() - file_path = sys.argv[1] + file_path = args.file + experiment_number = args.experiment FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path) - plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values) + plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, experiment_number) if __name__ == "__main__": main() From e400c0354f3c7ce82739100a48e394c026b02514 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 19:14:51 +0000 Subject: [PATCH 437/455] Fix train plot --- .../flowmldetection/plot_train_performance.py | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/modules/flowmldetection/plot_train_performance.py b/modules/flowmldetection/plot_train_performance.py index 80e13e9515..244df13d28 100644 --- a/modules/flowmldetection/plot_train_performance.py +++ b/modules/flowmldetection/plot_train_performance.py @@ -5,7 +5,7 @@ import argparse import os -def plot_log_data(file_path): +def plot_log_data(file_path, experiment_number): # Read the log data from the file with open(file_path, 'r') as file: log_data = file.read() @@ -28,7 +28,8 @@ def plot_log_data(file_path): # Get the directory of the log file to store the plot in the same folder dir_name = os.path.dirname(file_path) - plot_file = os.path.join(dir_name, 'performance_metrics_training.png') + # Append experiment number to the filename + plot_file = os.path.join(dir_name, f'performance_metrics_training_{experiment_number}.png') # Plotting the values fig, ax1 = plt.subplots(figsize=(10, 6)) @@ -55,18 +56,18 @@ def plot_log_data(file_path): for i, value in enumerate(df["Total labels"]): ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom') - # Adding title and legend - plt.title('Training performance') + # Adding title and legend with experiment number in title + plt.title(f'Training performance - Experiment {experiment_number}') fig.tight_layout() # Move both legends further to the right - ax1.legend(loc='upper right', bbox_to_anchor=(1.26, 1), fontsize='small', ncol=1) - ax2.legend(loc='upper right', bbox_to_anchor=(1.4, 0.95), fontsize='small', ncol=1) + ax1.legend(loc='upper right', bbox_to_anchor=(1.3, 1), fontsize='small', ncol=1) + ax2.legend(loc='upper right', bbox_to_anchor=(1.3, 0.85), fontsize='small', ncol=1) # Increase right margin for better readability of legend - plt.subplots_adjust(right=0.7) + plt.subplots_adjust(right=0.75) - # Save plot to the same folder as the log file + # Save plot to the same folder as the log file with experiment number in filename plt.savefig(plot_file) # Display the plot @@ -75,13 +76,14 @@ def plot_log_data(file_path): def main(): # Parse command-line arguments parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.") - parser.add_argument('log_file', metavar='log_file', type=str, help="Path to the log file") + parser.add_argument('-f', '--file', metavar='log_file', type=str, required=True, help="Path to the log file") + parser.add_argument('-e', '--experiment', metavar='experiment_number', type=str, required=True, help="Experiment number to add to the filename") # Handle -h / --help args = parser.parse_args() # Call the function to process the log file - plot_log_data(args.log_file) + plot_log_data(args.file, args.experiment) if __name__ == "__main__": main() From 8983a7f529e987e11dc915513179f0b1620e3f64 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 21:14:48 +0000 Subject: [PATCH 438/455] Fix plots --- .../flowmldetection/plot_train_performance.py | 122 ++++++++++-------- 1 file changed, 71 insertions(+), 51 deletions(-) diff --git a/modules/flowmldetection/plot_train_performance.py b/modules/flowmldetection/plot_train_performance.py index 244df13d28..5212dfeeaf 100644 --- a/modules/flowmldetection/plot_train_performance.py +++ b/modules/flowmldetection/plot_train_performance.py @@ -4,85 +4,105 @@ import sys import argparse import os +import matplotlib.ticker as ticker def plot_log_data(file_path, experiment_number): # Read the log data from the file with open(file_path, 'r') as file: log_data = file.read() - # Define regex pattern to extract relevant data from each line - pattern = r"Background: (\d+). Benign: (\d+). Malicious: (\d+). Total labels: (\d+\.\d+). Score: (\d+\.\d+)" + # Regex pattern for the new log format + pattern = ( + r"Total labels: ([\d\.]+), Background: (\d+). Benign: (\d+). Malicious: (\d+). Metrics: " + r"FPR=([\d\.]+), TNR=([\d\.]+), TPR=([\d\.]+), FNR=([\d\.]+), " + r"F1=([\d\.]+), Precision=([\d\.]+), Accuracy=([\d\.]+), MCC=([\d\.]+), Recall=([\d\.]+)\." + ) # Parse the log file data = re.findall(pattern, log_data) # Convert data to a DataFrame - df = pd.DataFrame(data, columns=["Background", "Benign", "Malicious", "Total labels", "Score"]) + columns = [ + "Total labels", "Background", "Benign", "Malicious", + "FPR", "TNR", "TPR", "FNR", "F1", "Precision", "Accuracy", "MCC", "Recall" + ] + df = pd.DataFrame(data, columns=columns) df = df.astype({ + "Total labels": float, "Background": int, "Benign": int, "Malicious": int, - "Total labels": float, - "Score": float + "FPR": float, + "TNR": float, + "TPR": float, + "FNR": float, + "F1": float, + "Precision": float, + "Accuracy": float, + "MCC": float, + "Recall": float, }) - # Get the directory of the log file to store the plot in the same folder dir_name = os.path.dirname(file_path) - # Append experiment number to the filename - plot_file = os.path.join(dir_name, f'performance_metrics_training_{experiment_number}.png') - - # Plotting the values - fig, ax1 = plt.subplots(figsize=(10, 6)) - # Plotting Score on the left y-axis (with proper scaling from 0 to 1) - ax1.plot(df.index, df["Score"], label="Score", color='tab:blue') + # --- Plot 1: Number of labels (linear scale, no total labels) --- + fig1, ax1 = plt.subplots(figsize=(10, 6)) + ax1.plot(df.index, df["Background"], label="Background", color='black') + ax1.plot(df.index, df["Benign"], label="Benign", color='cyan') + ax1.plot(df.index, df["Malicious"], label="Malicious", color='magenta') ax1.set_xlabel('Index') - ax1.set_ylabel('Score', color='tab:blue') - ax1.set_ylim(0, 1) # Set y-axis for Score from 0 to 1 - ax1.tick_params(axis='y', labelcolor='tab:blue') - - # Create the second y-axis for the Background, Benign, Malicious - ax2 = ax1.twinx() - ax2.plot(df.index, df["Background"], label="Background Labels", color='tab:green', linestyle='--') - ax2.plot(df.index, df["Benign"], label="Benign Labels", color='tab:orange', linestyle='--') - ax2.plot(df.index, df["Malicious"], label="Malicious Labels", color='tab:pink', linestyle='--') - ax2.set_ylabel('Background, Benign, Malicious Labels', color='tab:red') - - # Set appropriate scale for right y-axis based on the data - ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max()) - ax2.tick_params(axis='y', labelcolor='tab:red') - - # Annotating Total labels as text on the plot - for i, value in enumerate(df["Total labels"]): - ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom') - - # Adding title and legend with experiment number in title - plt.title(f'Training performance - Experiment {experiment_number}') - fig.tight_layout() + ax1.set_ylabel('Label Counts') + # No log scale here + ax1.set_title(f'Label Counts - Experiment {experiment_number}') + ax1.legend() + ax1.yaxis.set_major_locator(ticker.MaxNLocator(70)) + plt.tight_layout() + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_labels.png')) + + # --- Plot 2: FNR and FPR (log scale) --- + fig2, ax2 = plt.subplots(figsize=(10, 6)) + ax2.plot(df.index, df["FNR"], label="FNR", color='red') + ax2.plot(df.index, df["FPR"], label="FPR", color='blue') + ax2.set_xlabel('Index') + ax2.set_ylabel('Rate') + ax2.set_yscale('log') + ax2.set_title(f'FNR and FPR - Experiment {experiment_number}') + ax2.legend() + ax2.yaxis.set_major_locator(ticker.MaxNLocator(100)) + plt.tight_layout() + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_fnr_fpr.png')) + + # --- Plot 3: Other metrics (log scale) --- + fig3, ax3 = plt.subplots(figsize=(12, 7)) + metrics_rest = ["TNR", "TPR", "F1", "Precision", "Accuracy", "MCC", "Recall"] + colors_rest = [ + 'tab:blue', 'tab:green', 'tab:purple', 'tab:brown', + 'tab:gray', 'tab:pink', 'tab:olive' + ] + for metric, color in zip(metrics_rest, colors_rest): + ax3.plot(df.index, df[metric], label=metric, color=color) + ax3.set_xlabel('Index') + ax3.set_ylabel('Metric Value') + ax3.set_yscale('log') + ax3.set_title(f'Performance Metrics (except FNR/FPR) - Experiment {experiment_number}') + ax3.legend() + ax3.yaxis.set_major_locator(ticker.MaxNLocator(50)) + plt.tight_layout() + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_other_metrics.png')) - # Move both legends further to the right - ax1.legend(loc='upper right', bbox_to_anchor=(1.3, 1), fontsize='small', ncol=1) - ax2.legend(loc='upper right', bbox_to_anchor=(1.3, 0.85), fontsize='small', ncol=1) - - # Increase right margin for better readability of legend - plt.subplots_adjust(right=0.75) - - # Save plot to the same folder as the log file with experiment number in filename - plt.savefig(plot_file) - - # Display the plot plt.show() + # --- Print final values in terminal --- + print("\nFinal values at last training step:") + for col in ["Total labels", "Background", "Benign", "Malicious", + "FPR", "TNR", "TPR", "FNR", "F1", "Precision", "Accuracy", "MCC", "Recall"]: + print(f"{col}: {df[col].iloc[-1]}") + def main(): - # Parse command-line arguments parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.") parser.add_argument('-f', '--file', metavar='log_file', type=str, required=True, help="Path to the log file") parser.add_argument('-e', '--experiment', metavar='experiment_number', type=str, required=True, help="Experiment number to add to the filename") - - # Handle -h / --help args = parser.parse_args() - - # Call the function to process the log file plot_log_data(args.file, args.experiment) if __name__ == "__main__": From 4cca7685112dc012940248c7e647a56806fb5b83 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 21:16:01 +0000 Subject: [PATCH 439/455] Add performance metrics to the training evaluation --- modules/flowmldetection/flowmldetection.py | 58 +++++++++++++++++----- 1 file changed, 46 insertions(+), 12 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index b17a1baaf0..2c60cd4034 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -10,7 +10,16 @@ import json import traceback import warnings -import os +from sklearn.metrics import classification_report, confusion_matrix +from sklearn.metrics import ( + confusion_matrix, + f1_score, + precision_score, + accuracy_score, + matthews_corrcoef, + recall_score, +) + from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils @@ -86,21 +95,21 @@ def write_to_training_log(self, message: str): except Exception as e: self.print(f"Error writing to training log: {e}", 0, 1) - def train(self, sum_labeled_flows): + def train(self, sum_labeled_flows, last_number_of_flows_when_trained): """ Train a model based on the flows we receive and the labels """ try: + # Create y_flow with the label + y_flow = numpy.full(self.flows.shape[0], self.flows.ground_truth_label) # Create X_flow with the current flows minus the label X_flow = self.flows.drop("ground_truth_label", axis=1) # Drop the detailed labels X_flow = X_flow.drop("detailed_ground_truth_label", axis=1) # Drop the module_labels X_flow = X_flow.drop("module_labels", axis=1) - # Create y_flow with the label - y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label) - # Normalize this batch of data so far. This can get progressivle slow + # Normalize this batch of data so far. This can get progressively slow X_flow = self.scaler.fit_transform(X_flow) # Count the number of labels of each type in this epoc @@ -120,18 +129,43 @@ def train(self, sum_labeled_flows): self.print("Error while calling clf.train()") self.print(traceback.format_exc(), 0, 1) - # See score so far in training - score = self.clf.score(X_flow, y_flow) + # Predict on the training data + y_pred = self.clf.predict(X_flow) - #self.print(f" Training Score: {score}", 1, 0) - #self.print(f' Model Parameters: {self.clf.coef_}', 1, 0) + # For metrics, let's focus on Malicious vs Benign (ignore Background) + mask = (y_flow == "Malicious") | (y_flow == "Benign") + y_true_bin = y_flow[mask] + y_pred_bin = y_pred[mask] + + # Map to binary: Malicious=1, Benign=0 + y_true_bin = numpy.where(y_true_bin == "Malicious", 1, 0) + y_pred_bin = numpy.where(y_pred_bin == "Malicious", 1, 0) + + # Compute confusion matrix: tn, fp, fn, tp + tn, fp, fn, tp = confusion_matrix(y_true_bin, y_pred_bin, labels=[0,1]).ravel() if len(set(y_true_bin)) > 1 else (0,0,0,0) + + # Compute metrics + FPR = fp / (fp + tn) if (fp + tn) > 0 else 0 + TNR = tn / (tn + fp) if (tn + fp) > 0 else 0 + TPR = tp / (tp + fn) if (tp + fn) > 0 else 0 + FNR = fn / (fn + tp) if (fn + tp) > 0 else 0 + F1 = f1_score(y_true_bin, y_pred_bin, zero_division=0) + PREC = precision_score(y_true_bin, y_pred_bin, zero_division=0) + ACCU = accuracy_score(y_true_bin, y_pred_bin) + MCC = matthews_corrcoef(y_true_bin, y_pred_bin) if len(set(y_true_bin)) > 1 else 0 + RECALL = recall_score(y_true_bin, y_pred_bin, zero_division=0) # Store the models on disk self.store_model() # Log training information - self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}") - #self.write_to_training_log(f"Model parameters: {self.clf.coef_}") + self.write_to_training_log( + f"Total labels: {sum_labeled_flows}, " + f"Background: {epoch_label_counts['Background']}. " + f"Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. " + f"Metrics: FPR={FPR:.4f}, TNR={TNR:.4f}, TPR={TPR:.4f}, FNR={FNR:.4f}, " + f"F1={F1:.4f}, Precision={PREC:.4f}, Accuracy={ACCU:.4f}, MCC={MCC:.4f}, Recall={RECALL:.4f}." + ) except Exception: self.print("Error in train().", 0, 1) self.print(traceback.format_exc(), 0, 1) @@ -520,7 +554,7 @@ def main(self): # for pandas self.process_training_flows(self.last_number_of_flows_when_trained) # Train an algorithm - self.train(sum_labeled_flows) + self.train(sum_labeled_flows, self.last_number_of_flows_when_trained) self.last_number_of_flows_when_trained = sum_labeled_flows elif self.mode == "test": From addd26bc0cf43e5426fd63b5dd73962c78b898dd Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sun, 4 May 2025 12:50:46 +0000 Subject: [PATCH 440/455] Fix experiment names --- modules/flowmldetection/plot_train_performance.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/modules/flowmldetection/plot_train_performance.py b/modules/flowmldetection/plot_train_performance.py index 5212dfeeaf..304f0f4ead 100644 --- a/modules/flowmldetection/plot_train_performance.py +++ b/modules/flowmldetection/plot_train_performance.py @@ -52,12 +52,12 @@ def plot_log_data(file_path, experiment_number): ax1.plot(df.index, df["Malicious"], label="Malicious", color='magenta') ax1.set_xlabel('Index') ax1.set_ylabel('Label Counts') - # No log scale here ax1.set_title(f'Label Counts - Experiment {experiment_number}') ax1.legend() ax1.yaxis.set_major_locator(ticker.MaxNLocator(70)) + ax1.xaxis.set_major_locator(ticker.MaxNLocator(50)) plt.tight_layout() - plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_labels.png')) + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_labels.png')) # --- Plot 2: FNR and FPR (log scale) --- fig2, ax2 = plt.subplots(figsize=(10, 6)) @@ -69,8 +69,9 @@ def plot_log_data(file_path, experiment_number): ax2.set_title(f'FNR and FPR - Experiment {experiment_number}') ax2.legend() ax2.yaxis.set_major_locator(ticker.MaxNLocator(100)) + ax2.xaxis.set_major_locator(ticker.MaxNLocator(50)) plt.tight_layout() - plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_fnr_fpr.png')) + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_fnr_fpr.png')) # --- Plot 3: Other metrics (log scale) --- fig3, ax3 = plt.subplots(figsize=(12, 7)) @@ -87,8 +88,9 @@ def plot_log_data(file_path, experiment_number): ax3.set_title(f'Performance Metrics (except FNR/FPR) - Experiment {experiment_number}') ax3.legend() ax3.yaxis.set_major_locator(ticker.MaxNLocator(50)) + ax3.xaxis.set_major_locator(ticker.MaxNLocator(50)) plt.tight_layout() - plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_other_metrics.png')) + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_other_metrics.png')) plt.show() From 01a6450fcf21b60387711cf5d2dc55800aabd5dc Mon Sep 17 00:00:00 2001 From: alya Date: Mon, 5 May 2025 15:24:12 +0300 Subject: [PATCH 441/455] test_profiler: update unit tests --- tests/test_profiler.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_profiler.py b/tests/test_profiler.py index 36733d2b8c..465bc5922b 100644 --- a/tests/test_profiler.py +++ b/tests/test_profiler.py @@ -467,7 +467,6 @@ def test_read_configuration( mock_conf.local_whitelist_path.return_value = "path/to/whitelist" mock_conf.ts_format.return_value = "unixtimestamp" mock_conf.analysis_direction.return_value = "all" - mock_conf.label.return_value = "malicious" mock_conf.get_tw_width_as_float.return_value = 1.0 mock_conf.client_ips.return_value = ["192.168.1.1", "10.0.0.1"] @@ -476,7 +475,6 @@ def test_read_configuration( assert profiler.local_whitelist_path == "path/to/whitelist" assert profiler.timeformat == "unixtimestamp" assert profiler.analysis_direction == "all" - assert profiler.label == "malicious" assert profiler.width == 1.0 assert profiler.client_ips == ["192.168.1.1", "10.0.0.1"] From 99a276f9caae1a8621146209d2bfdefa756a0297 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Mon, 5 May 2025 16:43:05 +0000 Subject: [PATCH 442/455] Fix that the training and testing logs files were appened instead of rewritten --- modules/flowmldetection/flowmldetection.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 2c60cd4034..9a920b4e25 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -90,7 +90,7 @@ def write_to_training_log(self, message: str): Write a message to the training log file. """ try: - with open(self.training_log_path, "a") as log_file: + with open(self.training_log_path, "w") as log_file: log_file.write(message + "\n") except Exception as e: self.print(f"Error writing to training log: {e}", 0, 1) @@ -610,8 +610,7 @@ def main(self): testing_log_path = "./modules/flowmldetection/testing_performance.log" try: - with open(testing_log_path, "a") as log_file: - log_file.write("Testing Performance Log Initialized\n") + with open(testing_log_path, "w") as log_file: # Log the testing performance metrics log_file.write(f"TP: {self.tp}, TN: {self.tn}, FP: {self.fp}, FN: {self.fn}\n") From cb22b3103a300fce293bf9ab34355d774f6a2b5d Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Mon, 5 May 2025 22:45:16 +0000 Subject: [PATCH 443/455] Fix an issue of storing the new log files --- modules/flowmldetection/flowmldetection.py | 49 ++++++++++------------ 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 9a920b4e25..9139066f08 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -72,11 +72,19 @@ def init(self): self.scaler = StandardScaler() self.model_path = "./modules/flowmldetection/model.bin" self.scaler_path = "./modules/flowmldetection/scaler.bin" - - # Initialize the training log file - self.training_log_path = "./modules/flowmldetection/training.log" - with open(self.training_log_path, "w") as log_file: - log_file.write("Training Log Initialized\n") + self.init_log_file() + + def init_log_file(self): + """ + Init the log file for training or testing + """ + if self.mode == "train": + # Initialize the training log file + self.log_path = "./modules/flowmldetection/training.log" + elif self.mode == "test": + # Initialize the testing log file + self.log_path = "./modules/flowmldetection/testing.log" + self.log_file = open(self.log_path, "w") def read_configuration(self): conf = ConfigParser() @@ -85,15 +93,14 @@ def read_configuration(self): # in case the flows do not have a label themselves self.label = conf.label() - def write_to_training_log(self, message: str): + def write_to_log(self, message: str): """ - Write a message to the training log file. + Write a message to the local log file. """ try: - with open(self.training_log_path, "w") as log_file: - log_file.write(message + "\n") + self.log_file.write(message + "\n") except Exception as e: - self.print(f"Error writing to training log: {e}", 0, 1) + self.print(f"Error writing to log: {e}", 0, 1) def train(self, sum_labeled_flows, last_number_of_flows_when_trained): """ @@ -159,7 +166,7 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained): self.store_model() # Log training information - self.write_to_training_log( + self.write_to_log( f"Total labels: {sum_labeled_flows}, " f"Background: {epoch_label_counts['Background']}. " f"Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. " @@ -169,7 +176,7 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained): except Exception: self.print("Error in train().", 0, 1) self.print(traceback.format_exc(), 0, 1) - self.write_to_training_log("Error occurred during training.") + self.write_to_log("Error occurred during training.") def process_features(self, dataset): """ @@ -597,7 +604,6 @@ def main(self): if not hasattr(self, 'fn'): self.fn = 0 - # Update counters based on predictions and labels if pred[0] == "Malicious" and original_label == "Malicious": self.tp += 1 @@ -605,19 +611,10 @@ def main(self): self.tn += 1 elif pred[0] == "Malicious" and original_label == "Benign": self.fp += 1 + self.write_to_log(f"False Positive Flow: {self.flow}") elif pred[0] == "Benign" and original_label == "Malicious": self.fn += 1 + self.write_to_log(f"False Negative Flow: {self.flow}") - testing_log_path = "./modules/flowmldetection/testing_performance.log" - try: - with open(testing_log_path, "w") as log_file: - # Log the testing performance metrics - log_file.write(f"TP: {self.tp}, TN: {self.tn}, FP: {self.fp}, FN: {self.fn}\n") - - # Log the original flow for false positives and false negatives - if pred[0] == "Malicious" and original_label == "Benign": - log_file.write(f"False Positive Flow: {self.flow}\n") - elif pred[0] == "Benign" and original_label == "Malicious": - log_file.write(f"False Negative Flow: {self.flow}\n") - except Exception as e: - self.print(f"Error initializing testing performance log: {e}", 0, 1) \ No newline at end of file + # Log the testing performance metrics + self.write_to_log(f"TP: {self.tp}, TN: {self.tn}, FP: {self.fp}, FN: {self.fn}") \ No newline at end of file From e0cc7c2f946a8fb4db664bbbc42422e6c54458a7 Mon Sep 17 00:00:00 2001 From: alya Date: Sat, 10 May 2025 16:21:08 +0300 Subject: [PATCH 444/455] enable/ disable training and testing.log with a param in the config file --- .secrets.baseline | 6 +- config/slips.yaml | 3 + modules/flowmldetection/flowmldetection.py | 140 +++++++++++++------- modules/riskiq/riskiq.py | 2 +- modules/update_manager/update_manager.py | 2 +- slips_files/common/parsers/config_parser.py | 7 +- 6 files changed, 109 insertions(+), 51 deletions(-) diff --git a/.secrets.baseline b/.secrets.baseline index fc1ac4872e..aa5615109c 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -149,14 +149,14 @@ "filename": "config/slips.yaml", "hashed_secret": "4cac50cee3ad8e462728e711eac3e670753d5016", "is_verified": false, - "line_number": 223 + "line_number": 226 }, { "type": "Secret Keyword", "filename": "config/slips.yaml", "hashed_secret": "d033e22ae348aeb5660fc2140aec35850c4da997", "is_verified": false, - "line_number": 393 + "line_number": 396 } ], "dataset/test14-malicious-zeek-dir/http.log": [ @@ -7192,5 +7192,5 @@ } ] }, - "generated_at": "2025-05-08T14:51:28Z" + "generated_at": "2025-05-10T13:18:46Z" } diff --git a/config/slips.yaml b/config/slips.yaml index 1b73e7b549..ac2010e6b4 100644 --- a/config/slips.yaml +++ b/config/slips.yaml @@ -213,6 +213,9 @@ flowmldetection: # You should have trained at least once with 'Normal' data and once with # 'Malicious' data in order for the test to work. mode: test + # creates an extra log file called training.log/testing.log in the + # ouptput dir with performance metrics depending on the mode. + create_performance_metrics_log_files: False ############################# virustotal: diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 9139066f08..2a515d0cfa 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -10,9 +10,8 @@ import json import traceback import warnings -from sklearn.metrics import classification_report, confusion_matrix +from sklearn.metrics import confusion_matrix from sklearn.metrics import ( - confusion_matrix, f1_score, precision_score, accuracy_score, @@ -37,6 +36,7 @@ Method, ) + # This horrible hack is only to stop sklearn from printing those warnings def warn(*args, **kwargs): pass @@ -73,7 +73,7 @@ def init(self): self.model_path = "./modules/flowmldetection/model.bin" self.scaler_path = "./modules/flowmldetection/scaler.bin" self.init_log_file() - + def init_log_file(self): """ Init the log file for training or testing @@ -92,11 +92,16 @@ def read_configuration(self): # This is the global label in the configuration, # in case the flows do not have a label themselves self.label = conf.label() + self.enable_logs: bool = conf.create_performance_metrics_log_files() def write_to_log(self, message: str): """ - Write a message to the local log file. + Write a message to the local log file if + create_performance_metrics_log_files is enabled in slips.yaml """ + if not self.enable_logs: + return + try: self.log_file.write(message + "\n") except Exception as e: @@ -108,7 +113,9 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained): """ try: # Create y_flow with the label - y_flow = numpy.full(self.flows.shape[0], self.flows.ground_truth_label) + y_flow = numpy.full( + self.flows.shape[0], self.flows.ground_truth_label + ) # Create X_flow with the current flows minus the label X_flow = self.flows.drop("ground_truth_label", axis=1) # Drop the detailed labels @@ -130,7 +137,9 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained): try: # Online incremental learning self.clf.partial_fit( - X_flow, y_flow, classes=["Background", "Malicious", "Benign"] + X_flow, + y_flow, + classes=["Background", "Malicious", "Benign"], ) except Exception: self.print("Error while calling clf.train()") @@ -149,7 +158,11 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained): y_pred_bin = numpy.where(y_pred_bin == "Malicious", 1, 0) # Compute confusion matrix: tn, fp, fn, tp - tn, fp, fn, tp = confusion_matrix(y_true_bin, y_pred_bin, labels=[0,1]).ravel() if len(set(y_true_bin)) > 1 else (0,0,0,0) + tn, fp, fn, tp = ( + confusion_matrix(y_true_bin, y_pred_bin, labels=[0, 1]).ravel() + if len(set(y_true_bin)) > 1 + else (0, 0, 0, 0) + ) # Compute metrics FPR = fp / (fp + tn) if (fp + tn) > 0 else 0 @@ -159,7 +172,11 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained): F1 = f1_score(y_true_bin, y_pred_bin, zero_division=0) PREC = precision_score(y_true_bin, y_pred_bin, zero_division=0) ACCU = accuracy_score(y_true_bin, y_pred_bin) - MCC = matthews_corrcoef(y_true_bin, y_pred_bin) if len(set(y_true_bin)) > 1 else 0 + MCC = ( + matthews_corrcoef(y_true_bin, y_pred_bin) + if len(set(y_true_bin)) > 1 + else 0 + ) RECALL = recall_score(y_true_bin, y_pred_bin, zero_division=0) # Store the models on disk @@ -189,7 +206,8 @@ def process_features(self, dataset): for proto in to_discard: dataset = dataset[dataset.proto != proto] - # If te proto is in the list to delete and there is only one flow, then the dataset will be empty + # If te proto is in the list to delete and there is only one flow, + # then the dataset will be empty if dataset.empty: # DataFrame is empty now, so return empty return dataset @@ -295,7 +313,9 @@ def process_training_flows(self, last_number_of_flows_when_trained): if last_number_of_flows_when_trained is None: last_number_of_flows_when_trained = 0 else: - last_number_of_flows_when_trained = int(last_number_of_flows_when_trained) + last_number_of_flows_when_trained = int( + last_number_of_flows_when_trained + ) # We get all the flows so far flows = self.db.get_all_flows() @@ -399,21 +419,21 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: ] # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. # Error - ''' [Flow ML Detection] Error in detect() while processing - dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes - 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 - The feature names should match those that were passed during fit. - Feature names unseen at fit time: - - bytes - ''' + """ [Flow ML Detection] Error in detect() while processing + dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes + 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 + The feature names should match those that were passed during fit. + Feature names unseen at fit time: + - bytes + """ # IF we delete here the filed bytes the error is - # [Flow ML Detection] Error in detect() while processing - # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes - # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 - # The feature names should match those that were passed during fit. - # Feature names must be in the same order as they were in fit. - + # [Flow ML Detection] Error in detect() while processing + # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes + # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 + # The feature names should match those that were passed during fit. + # Feature names must be in the same order as they were in fit. + for field in fields_to_drop: try: x_flow = x_flow.drop(field, axis=1) @@ -540,17 +560,19 @@ def main(self): labels = self.db.get_labels() sum_labeled_flows = sum(i[1] for i in labels) - # The min labels to retrain is the min number of flows + # The min labels to retrain is the min number of flows # we should have seen so far in this capture to start training # This is so we dont _start_ training with only 1 flow - # Once we are over the start minimum, the second condition is + # Once we are over the start minimum, the second condition is # to force to retrain every a minimum_labels_to_retrain number # of flows. So we dont retrain every 1 flow. - if ( - sum_labeled_flows >= self.minimum_labels_to_start_train - ): - if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain): + if sum_labeled_flows >= self.minimum_labels_to_start_train: + if ( + sum_labeled_flows + - self.last_number_of_flows_when_trained + >= self.minimum_labels_to_retrain + ): # So for example we retrain every 50 labels and only when # we have at least 50 labels self.print( @@ -559,10 +581,17 @@ def main(self): ) # Process all flows in the DB and make them ready # for pandas - self.process_training_flows(self.last_number_of_flows_when_trained) + self.process_training_flows( + self.last_number_of_flows_when_trained + ) # Train an algorithm - self.train(sum_labeled_flows, self.last_number_of_flows_when_trained) - self.last_number_of_flows_when_trained = sum_labeled_flows + self.train( + sum_labeled_flows, + self.last_number_of_flows_when_trained, + ) + self.last_number_of_flows_when_trained = ( + sum_labeled_flows + ) elif self.mode == "test": # We are testing, which means using the model to detect @@ -570,7 +599,9 @@ def main(self): # After processing the flow, it may happen that we # delete icmp/arp/etc so the dataframe can be empty if processed_flow is not None and not processed_flow.empty: - original_label = processed_flow["ground_truth_label"].iloc[0] + original_label = processed_flow["ground_truth_label"].iloc[ + 0 + ] # Predict pred: numpy.ndarray = self.detect(processed_flow) if not pred: @@ -591,30 +622,49 @@ def main(self): 2, ) - # So you can disable this code easily. Since it is used only for evaluating a testing + # So you can disable this code easily. Since it is used + # only for evaluating a testing log_testing_data = True if log_testing_data: # Initialize counters if not already done - if not hasattr(self, 'tp'): + if not hasattr(self, "tp"): self.tp = 0 - if not hasattr(self, 'tn'): + if not hasattr(self, "tn"): self.tn = 0 - if not hasattr(self, 'fp'): + if not hasattr(self, "fp"): self.fp = 0 - if not hasattr(self, 'fn'): + if not hasattr(self, "fn"): self.fn = 0 # Update counters based on predictions and labels - if pred[0] == "Malicious" and original_label == "Malicious": + if ( + pred[0] == "Malicious" + and original_label == "Malicious" + ): self.tp += 1 - elif pred[0] == "Benign" and original_label == "Benign": + elif ( + pred[0] == "Benign" and original_label == "Benign" + ): self.tn += 1 - elif pred[0] == "Malicious" and original_label == "Benign": + elif ( + pred[0] == "Malicious" + and original_label == "Benign" + ): self.fp += 1 - self.write_to_log(f"False Positive Flow: {self.flow}") - elif pred[0] == "Benign" and original_label == "Malicious": + self.write_to_log( + f"False Positive Flow: {self.flow}" + ) + elif ( + pred[0] == "Benign" + and original_label == "Malicious" + ): self.fn += 1 - self.write_to_log(f"False Negative Flow: {self.flow}") + self.write_to_log( + f"False Negative Flow: {self.flow}" + ) # Log the testing performance metrics - self.write_to_log(f"TP: {self.tp}, TN: {self.tn}, FP: {self.fp}, FN: {self.fn}") \ No newline at end of file + self.write_to_log( + f"TP: {self.tp}, TN: {self.tn}," + f" FP: {self.fp}, FN: {self.fn}" + ) diff --git a/modules/riskiq/riskiq.py b/modules/riskiq/riskiq.py index 5abf2ddb19..7b5653997e 100644 --- a/modules/riskiq/riskiq.py +++ b/modules/riskiq/riskiq.py @@ -25,7 +25,7 @@ def init(self): def read_configuration(self): conf = ConfigParser() - risk_iq_credentials_path = conf.RiskIQ_credentials_path() + risk_iq_credentials_path = conf.risk_iq_credentials_path() try: with open(risk_iq_credentials_path, "r") as f: self.riskiq_email = f.readline().replace("\n", "") diff --git a/modules/update_manager/update_manager.py b/modules/update_manager/update_manager.py index ba8106aa5c..b791bfc137 100644 --- a/modules/update_manager/update_manager.py +++ b/modules/update_manager/update_manager.py @@ -119,7 +119,7 @@ def read_riskiq_creds(risk_iq_credentials_path): self.ssl_feeds_path = conf.ssl_feeds() self.ssl_feeds = self.get_feed_details(self.ssl_feeds_path) - risk_iq_credentials_path = conf.RiskIQ_credentials_path() + risk_iq_credentials_path = conf.risk_iq_credentials_path() read_riskiq_creds(risk_iq_credentials_path) self.riskiq_update_period = conf.riskiq_update_period() diff --git a/slips_files/common/parsers/config_parser.py b/slips_files/common/parsers/config_parser.py index 40f1b044bc..e208f78816 100644 --- a/slips_files/common/parsers/config_parser.py +++ b/slips_files/common/parsers/config_parser.py @@ -418,7 +418,12 @@ def data_exfiltration_threshold(self): def get_ml_mode(self): return self.read_configuration("flowmldetection", "mode", "test") - def RiskIQ_credentials_path(self): + def create_performance_metrics_log_files(self) -> bool: + return self.read_configuration( + "flowmldetection", "create_performance_metrics_log_files", False + ) + + def risk_iq_credentials_path(self): return self.read_configuration( "threatintelligence", "RiskIQ_credentials_path", "" ) From adcbafd997d538cf7d8041f6317dd48f3cef0f54 Mon Sep 17 00:00:00 2001 From: alya Date: Sat, 10 May 2025 16:23:58 +0300 Subject: [PATCH 445/455] dont create an empty logfile when create_performance_metrics_log_files is set to false --- modules/flowmldetection/flowmldetection.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 2a515d0cfa..9305197d3e 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -78,6 +78,9 @@ def init_log_file(self): """ Init the log file for training or testing """ + if not self.enable_logs: + return + if self.mode == "train": # Initialize the training log file self.log_path = "./modules/flowmldetection/training.log" From c45e77594002748fdd1e2c5ddd559c92416eb3f5 Mon Sep 17 00:00:00 2001 From: alya Date: Sat, 10 May 2025 16:29:30 +0300 Subject: [PATCH 446/455] when enabled, create testing.log or training.log in the current output dir --- modules/flowmldetection/flowmldetection.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 9305197d3e..f618195bce 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: GPL-2.0-only import numpy +import os from sklearn.linear_model import SGDClassifier from sklearn.preprocessing import StandardScaler import pickle @@ -83,10 +84,10 @@ def init_log_file(self): if self.mode == "train": # Initialize the training log file - self.log_path = "./modules/flowmldetection/training.log" + self.log_path = os.path.join(self.output_dir, "training.log") elif self.mode == "test": # Initialize the testing log file - self.log_path = "./modules/flowmldetection/testing.log" + self.log_path = os.path.join(self.output_dir, "testing.log") self.log_file = open(self.log_path, "w") def read_configuration(self): From b2452494a0d32f394b5ddc15e5cb6afc47df2855 Mon Sep 17 00:00:00 2001 From: alya Date: Sat, 10 May 2025 16:43:32 +0300 Subject: [PATCH 447/455] Add an enum called labels with either Benign or Malicious so the labels are unified. --- modules/flowmldetection/flowmldetection.py | 65 +++++++++++----------- slips_files/core/structures/labels.py | 11 ++++ 2 files changed, 43 insertions(+), 33 deletions(-) create mode 100644 slips_files/core/structures/labels.py diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index f618195bce..e828058ee4 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -19,11 +19,10 @@ matthews_corrcoef, recall_score, ) - - from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils from slips_files.common.abstracts.module import IModule +from slips_files.core.structures.labels import Label from slips_files.core.structures.evidence import ( Evidence, ProfileID, @@ -45,6 +44,10 @@ def warn(*args, **kwargs): warnings.warn = warn +BACKGROUND = Label.BACKGROUND.name +BENIGN = Label.BENIGN.name +MALICIOUS = Label.MALICIOUS.name + class FlowMLDetection(IModule): # Name: short name of the module. Do not use spaces @@ -132,9 +135,9 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained): # Count the number of labels of each type in this epoc epoch_label_counts = { - "Background": (y_flow == "Background").sum(), - "Malicious": (y_flow == "Malicious").sum(), - "Benign": (y_flow == "Benign").sum(), + BACKGROUND: (y_flow == BACKGROUND).sum(), + MALICIOUS: (y_flow == MALICIOUS).sum(), + BENIGN: (y_flow == BENIGN).sum(), } # Train @@ -143,7 +146,7 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained): self.clf.partial_fit( X_flow, y_flow, - classes=["Background", "Malicious", "Benign"], + classes=[BACKGROUND, MALICIOUS, BENIGN], ) except Exception: self.print("Error while calling clf.train()") @@ -153,13 +156,13 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained): y_pred = self.clf.predict(X_flow) # For metrics, let's focus on Malicious vs Benign (ignore Background) - mask = (y_flow == "Malicious") | (y_flow == "Benign") + mask = (y_flow == MALICIOUS) | (y_flow == BENIGN) y_true_bin = y_flow[mask] y_pred_bin = y_pred[mask] # Map to binary: Malicious=1, Benign=0 - y_true_bin = numpy.where(y_true_bin == "Malicious", 1, 0) - y_pred_bin = numpy.where(y_pred_bin == "Malicious", 1, 0) + y_true_bin = numpy.where(y_true_bin == MALICIOUS, 1, 0) + y_pred_bin = numpy.where(y_pred_bin == MALICIOUS, 1, 0) # Compute confusion matrix: tn, fp, fn, tp tn, fp, fn, tp = ( @@ -190,9 +193,12 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained): self.write_to_log( f"Total labels: {sum_labeled_flows}, " f"Background: {epoch_label_counts['Background']}. " - f"Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. " - f"Metrics: FPR={FPR:.4f}, TNR={TNR:.4f}, TPR={TPR:.4f}, FNR={FNR:.4f}, " - f"F1={F1:.4f}, Precision={PREC:.4f}, Accuracy={ACCU:.4f}, MCC={MCC:.4f}, Recall={RECALL:.4f}." + f"Benign: {epoch_label_counts['Benign']}. " + f"Malicious: {epoch_label_counts[MALICIOUS]}. " + f"Metrics: FPR={FPR:.4f}, TNR={TNR:.4f}, " + f"TPR={TPR:.4f}, FNR={FNR:.4f}, " + f"F1={F1:.4f}, Precision={PREC:.4f}, " + f"Accuracy={ACCU:.4f}, MCC={MCC:.4f}, Recall={RECALL:.4f}." ) except Exception: self.print("Error in train().", 0, 1) @@ -345,9 +351,9 @@ def process_training_flows(self, last_number_of_flows_when_trained): "sbytes": 25517, "dbytes": 17247, "appproto": "ssl", - "ground_truth_label": "Malicious", + "ground_truth_label": MALICIOUS, "module_labels": { - "flowalerts-long-connection": "Malicious" + "flowalerts-long-connection": MALICIOUS }, } ) @@ -366,9 +372,9 @@ def process_training_flows(self, last_number_of_flows_when_trained): "sbytes": 100, "dbytes": 67596, "appproto": "http", - "ground_truth_label": "Benign", + "ground_truth_label": BENIGN, "module_labels": { - "flowalerts-long-connection": "Benign" + "flowalerts-long-connection": BENIGN }, } ) @@ -421,7 +427,8 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "ground_truth_label", "detailed_ground_truth_label", ] - # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. + # For argus binetflows this fails because ther is a field calle + # bytes that was not in other flows. It should be called allbytes. # Error """ [Flow ML Detection] Error in detect() while processing dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes @@ -546,8 +553,8 @@ def main(self): self.twid = msg["twid"] self.profileid = msg["profileid"] self.flow = msg["flow"] - # These following extra fields are expected in testing. update the original - # flow dict to have them + # These following extra fields are expected in testing. + # update the original flow dict to have them self.flow.update( { "state": msg["interpreted_state"], @@ -612,7 +619,7 @@ def main(self): # an error occurred return - if pred[0] == "Malicious": + if pred[0] == MALICIOUS: # Generate an alert self.set_evidence_malicious_flow(self.flow, self.twid) self.print( @@ -642,26 +649,18 @@ def main(self): # Update counters based on predictions and labels if ( - pred[0] == "Malicious" - and original_label == "Malicious" + pred[0] == MALICIOUS + and original_label == MALICIOUS ): self.tp += 1 - elif ( - pred[0] == "Benign" and original_label == "Benign" - ): + elif pred[0] == BENIGN and original_label == BENIGN: self.tn += 1 - elif ( - pred[0] == "Malicious" - and original_label == "Benign" - ): + elif pred[0] == MALICIOUS and original_label == BENIGN: self.fp += 1 self.write_to_log( f"False Positive Flow: {self.flow}" ) - elif ( - pred[0] == "Benign" - and original_label == "Malicious" - ): + elif pred[0] == BENIGN and original_label == MALICIOUS: self.fn += 1 self.write_to_log( f"False Negative Flow: {self.flow}" diff --git a/slips_files/core/structures/labels.py b/slips_files/core/structures/labels.py new file mode 100644 index 0000000000..b1dc64234e --- /dev/null +++ b/slips_files/core/structures/labels.py @@ -0,0 +1,11 @@ +from enum import Enum + + +class Label(Enum): + """ + label of flows should be one of the following + """ + + MALICIOUS = "Malicious" + BENIGN = "Benign" + BACKGROUND = "Background" From 31a49bdefd4bbb1cfe7834b59dbfc9e137a66418 Mon Sep 17 00:00:00 2001 From: alya Date: Mon, 12 May 2025 20:57:25 +0300 Subject: [PATCH 448/455] set the config label as the GT label if not founf in the given file --- modules/flowmldetection/flowmldetection.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index e828058ee4..c2b184cb10 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -98,7 +98,7 @@ def read_configuration(self): self.mode = conf.get_ml_mode() # This is the global label in the configuration, # in case the flows do not have a label themselves - self.label = conf.label() + self.ground_truth_config_label = conf.label() self.enable_logs: bool = conf.create_performance_metrics_log_files() def write_to_log(self, message: str): @@ -610,9 +610,15 @@ def main(self): # After processing the flow, it may happen that we # delete icmp/arp/etc so the dataframe can be empty if processed_flow is not None and not processed_flow.empty: - original_label = processed_flow["ground_truth_label"].iloc[ - 0 - ] + try: + original_label = processed_flow[ + "ground_truth_label" + ].iloc[0] + except KeyError: + # If there are no labels in the flows, the default + # label should be the one in the config file. + original_label = self.ground_truth_config_label + # Predict pred: numpy.ndarray = self.detect(processed_flow) if not pred: From a6ad940c2b134f1dd220e07f4f2d16419d545f08 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Tue, 20 May 2025 11:13:27 +0000 Subject: [PATCH 449/455] By default train and store logs --- config/slips.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config/slips.yaml b/config/slips.yaml index ac2010e6b4..635df7f918 100644 --- a/config/slips.yaml +++ b/config/slips.yaml @@ -212,10 +212,10 @@ flowmldetection: # training the models, to test in unknown data. # You should have trained at least once with 'Normal' data and once with # 'Malicious' data in order for the test to work. - mode: test + mode: train # creates an extra log file called training.log/testing.log in the # ouptput dir with performance metrics depending on the mode. - create_performance_metrics_log_files: False + create_performance_metrics_log_files: True ############################# virustotal: From c7ab0a2c2ee14ddc5b009bc4011095b0ae2044f4 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Tue, 20 May 2025 11:13:56 +0000 Subject: [PATCH 450/455] Fix the labels to .value --- modules/flowmldetection/flowmldetection.py | 28 ++++------------------ 1 file changed, 5 insertions(+), 23 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index c2b184cb10..4ef661146e 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -44,9 +44,9 @@ def warn(*args, **kwargs): warnings.warn = warn -BACKGROUND = Label.BACKGROUND.name -BENIGN = Label.BENIGN.name -MALICIOUS = Label.MALICIOUS.name +BACKGROUND = Label.BACKGROUND.value +BENIGN = Label.BENIGN.value +MALICIOUS = Label.MALICIOUS.value class FlowMLDetection(IModule): @@ -287,7 +287,7 @@ def process_features(self, dataset): r"(^.*arp.*$)", "4", regex=True ) - dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"] + dataset["bytes"] = dataset["sbytes"] + dataset["dbytes"] dataset["pkts"] = dataset["spkts"] + dataset["dpkts"] fields_to_convert_to_float = [ @@ -297,7 +297,7 @@ def process_features(self, dataset): dataset.dur, dataset.pkts, dataset.spkts, - dataset.allbytes, + dataset.bytes, dataset.sbytes, dataset.state, ] @@ -427,24 +427,6 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "ground_truth_label", "detailed_ground_truth_label", ] - # For argus binetflows this fails because ther is a field calle - # bytes that was not in other flows. It should be called allbytes. - # Error - """ [Flow ML Detection] Error in detect() while processing - dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes - 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 - The feature names should match those that were passed during fit. - Feature names unseen at fit time: - - bytes - """ - - # IF we delete here the filed bytes the error is - # [Flow ML Detection] Error in detect() while processing - # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes - # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 - # The feature names should match those that were passed during fit. - # Feature names must be in the same order as they were in fit. - for field in fields_to_drop: try: x_flow = x_flow.drop(field, axis=1) From 992496bc616e00620b7f37f5043c7941da5b9505 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Thu, 29 May 2025 11:32:43 +0000 Subject: [PATCH 451/455] Fix plot testing bug with zeros --- modules/flowmldetection/plot_testing_performance.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index 6865415cdf..dc649b5996 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -121,13 +121,15 @@ def plot_single_group(metrics_dict, output_filename, experiment_number, is_close if is_close_to_0: min_val = min(min(metrics_dict['FPR']), min(metrics_dict['FNR'])) max_val = max(max(metrics_dict['FPR']), max(metrics_dict['FNR'])) - + # Avoid log(0), so set the minimum limit a little higher than zero if min_val == 0: min_val = 1e-4 # Avoid zero values on the logarithmic scale plt.ylim(min_val, max_val) # Set Y-axis limits based on the data range - plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=60)) # Set ticks logarithmically + # Ensure ticks are within the valid range + if min_val > 0 and max_val > 0: + plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6)) # Set ticks logarithmically # Add the experiment number to the plot title plt.xlabel('Index') From fe02fc0e4e476f560e96ad1d2d2d9d99333b0854 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Fri, 11 Jul 2025 09:53:56 +0000 Subject: [PATCH 452/455] Improve the testing of performance --- .../plot_testing_performance.py | 87 ++++++++++++------- 1 file changed, 55 insertions(+), 32 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index dc649b5996..f0f9b8f2d0 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -15,42 +15,59 @@ def process_file(file_path): MCC_values = [] recall_values = [] + # Counters for error tracking + total_lines = 0 + error_lines = 0 + unusual_lines = 0 + # Read the file and extract the data with open(file_path, 'r') as file: for line in file: + total_lines += 1 if "TP:" in line: - # Extract the values from the line - parts = line.split(',') - TP = int(parts[0].split(':')[1].strip()) - TN = int(parts[1].split(':')[1].strip()) - FP = int(parts[2].split(':')[1].strip()) - FN = int(parts[3].split(':')[1].strip()) - - # Calculate metrics - FPR = FP / (FP + TN) if (FP + TN) != 0 else 0 - FNR = FN / (FN + TP) if (FN + TP) != 0 else 0 - TNR = TN / (TN + FP) if (TN + FP) != 0 else 0 - TPR = TP / (TP + FN) if (TP + FN) != 0 else 0 - Precision = TP / (TP + FP) if (TP + FP) != 0 else 0 - Recall = TPR # Recall is the same as TPR - F1 = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) != 0 else 0 - Accuracy = (TP + TN) / (TP + TN + FP + FN) - MCC = ((TP * TN) - (FP * FN)) / np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) if ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) != 0 else 0 + try: + # Extract the values from the line + parts = line.split(',') + TP = int(parts[0].split(':')[1].strip()) + TN = int(parts[1].split(':')[1].strip()) + FP = int(parts[2].split(':')[1].strip()) + FN = int(parts[3].split(':')[1].strip()) + + # Calculate metrics + FPR = FP / (FP + TN) if (FP + TN) != 0 else 0 + FNR = FN / (FN + TP) if (FN + TP) != 0 else 0 + TNR = TN / (TN + FP) if (TN + FP) != 0 else 0 + TPR = TP / (TP + FN) if (TP + FN) != 0 else 0 + Precision = TP / (TP + FP) if (TP + FP) != 0 else 0 + Recall = TPR # Recall is the same as TPR + F1 = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) != 0 else 0 + Accuracy = (TP + TN) / (TP + TN + FP + FN) + MCC = ((TP * TN) - (FP * FN)) / np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) if ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) != 0 else 0 - # Append the values to the respective lists - FPR_values.append(FPR) - FNR_values.append(FNR) - TNR_values.append(TNR) - TPR_values.append(TPR) - F1_values.append(F1) - accuracy_values.append(Accuracy) - precision_values.append(Precision) - MCC_values.append(MCC) - recall_values.append(Recall) - - return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values + # Append the values to the respective lists + FPR_values.append(FPR) + FNR_values.append(FNR) + TNR_values.append(TNR) + TPR_values.append(TPR) + F1_values.append(F1) + accuracy_values.append(Accuracy) + precision_values.append(Precision) + MCC_values.append(MCC) + recall_values.append(Recall) -def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, experiment_number): + except Exception as e: + error_lines += 1 + print(f"Error in line {total_lines}: {e}") + continue + + # Check for any unusual cases + if any(np.isnan([FPR, FNR, TNR, TPR, F1, Accuracy, Precision, MCC, Recall])): + unusual_lines += 1 + print(f"Unusual values in line {total_lines}: NaN values found") + + return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, total_lines, error_lines, unusual_lines + +def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, experiment_number, total_lines, error_lines, unusual_lines): # Separate the values into two groups based on their proximity to 0 or 1 close_to_0 = { 'FPR': [], 'FNR': [] @@ -90,6 +107,12 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu print(f"Final MCC: {MCC_values[-1]:.4f}") print(f"Final Recall: {recall_values[-1]:.4f}") + # Print summary statistics + print(f"\nSummary for Experiment {experiment_number}:") + print(f"Total lines read: {total_lines}") + print(f"Lines with errors: {error_lines}") + print(f"Unusual lines (NaN values): {unusual_lines}") + def plot_single_group(metrics_dict, output_filename, experiment_number, is_close_to_0=False): plt.figure(figsize=(12, 8)) @@ -152,8 +175,8 @@ def main(): file_path = args.file experiment_number = args.experiment - FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path) - plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, experiment_number) + FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, total_lines, error_lines, unusual_lines = process_file(file_path) + plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, experiment_number, total_lines, error_lines, unusual_lines) if __name__ == "__main__": main() From 480c398dc578d11863e23c8d6a11f22f920286e1 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Fri, 3 Oct 2025 08:30:48 +0000 Subject: [PATCH 453/455] Update temporary model after latest train --- modules/flowmldetection/model.bin | Bin 1124 -> 1376 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index a6648cf72179520975b0e9ad1164f7d574e87140..bb6d471c12e299775e308ecffe6231d20a68614d 100644 GIT binary patch delta 468 zcmV;_0W1FG2;d3@fCQCQ1+fJq0$BrvlxQQ21!Q!2aAlN}l#ByaF));gh=-I?lqgFB zi~&59PEJlsC`$xO2y~QUi9>io0001C0001E0001M0001I0001T0001Q0001W0001P z0001F0001!SOO%IcmgN^0F#{pG64gV&;lq47ytkO0002+Ka=?aE(8Nh3x<=?0w#Y# z2mt^9VH!esD)l#8~zU<=kitBd)tjQifd=%5y{8nf}gTK#q0ZbP0w1eN4Z6B^3D zdT>qBmRh|(&ibEGe#&}E6zhWwhH_|!Xzdg^FWtRK}z^k-B5~rHQKeI~2 z#*byUzf-~~nj}@lKeZDj7-4AfKMO=*3#Sg@KLd69ceIuFzi*DKYLgSvzsU|S%gVDl zGljIUu^^-@k!@qUochmOqo;15^Y- K6~ly+HUut&JHl=N delta 217 zcmV;~04D$73gid{fCQCU1hEAp0x$xFlxQQ21!Q!2aAlN}l#BvZIh2Wrhm=y3C`$v3 z0X&pWPEJcCO9V>@bd+L=Lr9Z>0wj~E0wj~s0xtmqlK}%L0T7ci11o zcI-ba#L|>%Hv~ZN`Y9Z*;7 TCnUg=+5}VtK<@DXlQIP^UwKv4 From d4675c0a4f4304e4bcb6720a4371da90560d38e7 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Fri, 3 Oct 2025 08:30:57 +0000 Subject: [PATCH 454/455] Update temporary scaler after latest train --- modules/flowmldetection/scaler.bin | Bin 890 -> 887 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin index 17115724b9536f6093f9d72f3b58a5c22c562a9a..a62890d0b0b019a02a84ad2b0ff861c378dba82a 100644 GIT binary patch delta 302 zcmeyx_MMHTfn{pWMivJ~M%Kw5jM+@g43oDq`iEzv)G}OnDCEEZ1*=!BT9qaG!XXp{ zd}l!DP-O>@_^MSJV08^!kAV4E{CA-GKG=hli<&-$^35hkGpU7NJYc>4w~{ebjDg`q zq0DhrK0U``nd5~&w|Y7Pxj^LV@Ytg7u?1f|$V{Qizq1{IqSrhQ9d~4af+CsYu?JKR zIfC8bRXq6ylScT-MS}l2>II=@f3QElNKZzGx!>VV#JT@}*UfQ|l(tvmOOtZwy)EWZ u@1^G8G%-@4Tl1*HUdK1qhbG;I>tj5<_++uf4Tp96OU;%{G@KmHtPB8KS8H7W delta 305 zcmV-10nYyS2KojBfCQCqu>?Q?1PEblY?DO+YXW-(ld=I%SGXcJ>b7ZMK%g&y^bA{P zKSt)3zw#dLK%p_vF&##uK+P0N9`1!QK(D}{{TAjcK;bH{z@TriKo{mo(f{JIK;I}} zgOE7xKYWP%NKmQ4K$1F*&U`#TlUM>KSKJP<820ERK@bAa4gpGMKR>OMx}hRxK{M=6 zv(jsGK{@t2q?-|%KnIzEbxq24Kq}lf4=@0pLA97c_0;#zK?}1s3)$hSKi+W2x_dU# zL4x+q5;T^nlhpz!SCCH4+`&DGK%YiiXq`-}KS?vLDu6YsK$Vk(#kQcaKz&31K1^&x zK*urfpL`fIKz5|BUcczYK%idWH|qP}K%c;h+h$bFKXB%NfD#hnK+$a-p+B8WlT-sE DDj1Bc From 9c6ae9162ad8df4f556165612aadb3b6f9a6ee24 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Wed, 5 Nov 2025 10:25:24 +0000 Subject: [PATCH 455/455] New model.bin model --- modules/flowmldetection/model.bin | Bin 1376 -> 1376 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index bb6d471c12e299775e308ecffe6231d20a68614d..5c305d38340560647ab0e165c12a91177a9d41c4 100644 GIT binary patch delta 312 zcmV-80muH}3g8N`j{|?BD=C{t`QN{#yJevcJj_2ybu_XJg4w^76fCY!tn9xXHX@t1 z{iHwPHgJQ>Y`VYL@|+q%rm{a3Ii`iVs`$Uu?0UhEDc-*yHfRqPN9R8_+6GX)Wy8O2 zC$u)|R@*-n0CRdz_4_}n-!$F1yw$%4YG6}T-r_$rb|o2CA>@BQ5AIQ~iIwfY$MbSJ zVV2mxXQf&Dc#If8uJsVkVUyiI|5cp@RGtvP;S0ro!nI!mH#zrC>~QO7Hf+Gf|1O zaKg*K)D2>CGBOF_zXq44Qa#zwla>T57vA?Iw?vHrz|C3+)w#UkztYwS^oJV3li37U K0+<++G6gQ4zMo+L delta 312 zcmV-80muH}3g8N`j{|>U8bWw0AmhK3i?+L93)VlYi}|pO``*9kpcb(jv+=)L{dL@K zL!LhbmE=zo8p^+Va81&dTD?Ec`m*bNZ1=ykQKa&fwB)~t2az1Ffa^bFA$Fn!da}Qq zF9bzVHs(LG;y;yeA^<>K zlM~Xv$uG;wvpNanzxN<$=R$T57of3qB|PQ=z@o=FAiCt=zkz|G>7%8Vli37U K0>gxpG6gPim7N*@