From 43ca52bccee44def9d203246a8223b29a4ec47cc Mon Sep 17 00:00:00 2001
From: yehezkr7
Date: Sun, 2 Feb 2025 15:05:40 +0200
Subject: [PATCH 01/12] Added the option to store results in the 'data' folder.

---
 EM/run_em.py                       | 43 +++++++++++++++++++++++++++++-
 EM/runfile_em_mt.py                |  3 +--
 conf/minimal-em-configuration.json | 11 +++++---
 test_em.py                         | 10 ++++++-
 4 files changed, 59 insertions(+), 8 deletions(-)

diff --git a/EM/run_em.py b/EM/run_em.py
index b13929f..d34e4ee 100644
--- a/EM/run_em.py
+++ b/EM/run_em.py
@@ -8,23 +8,63 @@
 from . import runfile_em_mt
 import numpy as np
 import pathlib
+import gzip
+import json
 
 # import shutil
 
+
+def produce_hpf(conf_file):
+    project_dir = ""
+
+    # Read configuration file and load properties
+    with open(conf_file) as f:
+        conf = json.load(f)
+
+    pops = conf.get("populations")
+    freq_data_dir = project_dir + conf.get("freq_data_dir")
+    output_dir = project_dir + conf.get("graph_files_path")
+    pop_ratio_dir = project_dir + conf.get("pops_count_file")
+
+    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
+
+    list_pop_count = []
+
+    for pop in pops:
+        in_freq_file = freq_data_dir + "/" + pop + ".freqs.gz"
+        with gzip.open(in_freq_file, "rb") as zf:
+            count_pop = 0
+            lines = [x.decode("utf8").strip() for x in zf.readlines()]
+            for hap_line in lines:
+                haplotype, count, freq = hap_line.split(",")
+                if haplotype == "Haplo":
+                    continue
+                if float(freq) == 0.0:
+                    continue
+                count_pop += float(count)
+        list_pop_count.append(count_pop)
+
+    sum_pops = sum(list_pop_count)
+
+    with open(pop_ratio_dir, "w") as pop_ratio_file:
+        for pop, ratio in zip(pops, list_pop_count):
+            pop_ratio_file.write("{},{},{}\n".format(pop, ratio, ratio / sum_pops))
+    print(f"Writing pop_counts_file to: {pop_ratio_dir}")
+
+
 def run_em_def(
     conf_file,
     sr_pop_name="all",
 ):
     project_dir = "../"
-    output_dir = "output/"
     # photos_dir = 'photos/'
     graph_generation_dir = "../imputation/graph_generation/"
 
     # Read configuration file and load properties
     with open(conf_file) as f:
         json_conf = json.load(f)
     graph_files_path = json_conf.get("graph_files_path")
+    output_dir = json_conf.get("output_dir", "output")
     config = {
         "imputation_input_file": json_conf.get("imputation_in_file"),
         "freq_file": json_conf.get("freq_file"),
@@ -145,3 +185,4 @@ def run_em_def(
     os.remove(config["info_nodes"])
 
     file_lo.write("loglikelihood " + str(logL) + "\n")
+    produce_hpf(conf_file)
diff --git a/EM/runfile_em_mt.py b/EM/runfile_em_mt.py
index 7a72327..288c19a 100644
--- a/EM/runfile_em_mt.py
+++ b/EM/runfile_em_mt.py
@@ -36,12 +36,11 @@ def run(
     project_dir = ""
 
     # project_dir = ""# "../"
-    output_dir = "output/"
 
     # Read configuration file and load properties
     with open(config_file) as f:
         json_conf = json.load(f)
-
+    output_dir = json_conf.get("output_dir", "output")
     graph_files_path = json_conf.get("graph_files_path")
     if graph_files_path[-1] != "/":
         graph_files_path += "/"
diff --git a/conf/minimal-em-configuration.json b/conf/minimal-em-configuration.json
index dfdff2d..f9d338d 100644
--- a/conf/minimal-em-configuration.json
+++ b/conf/minimal-em-configuration.json
@@ -13,7 +13,7 @@
     "DQB1": 4,
     "DRB1": 5
   },
-  "graph_files_path": "output/" ,
+  "graph_files_path": "data/" ,
   "node_csv_file": "nodes.csv",
   "edges_csv_file": "edges.csv",
   "info_node_csv_file": "info_node.csv",
@@ -30,7 +30,7 @@
 
   "imputation_out_hap_freq_filename": "don.hap.freqs",
 
-  "freq_file": "output/hpf.csv",
+  "freq_file": "data/hpf.csv",
 
   "priority": {
     "alpha": 0.4999999,
@@ -49,10 +49,13 @@
 
   "init_cutoff": 100,
   "max_iterations": 50,
 
-  "logLikelihood_file": "output/log_likelihood.txt",
"logLikelihood_file": "output/log_likelihood.txt", + "logLikelihood_file": "data/log_likelihood.txt", "memory_min_size": 100000000, "memory_max_size": 600000000, "num_threads": 1, - "run_just_SR_EM": false + "run_just_SR_EM": false, + "freq_data_dir": "data/freqs", + "pops_count_file": "data/pop_counts_file.txt", + "output_dir": "data/" } diff --git a/test_em.py b/test_em.py index f788841..a7ba253 100644 --- a/test_em.py +++ b/test_em.py @@ -22,10 +22,18 @@ # > http://www.opensource.org/licenses/lgpl-license.php # +import numpy as np +import json +# Workaround for NumPy 2.0 removing np.float_ +np.float_ = np.float64 + import os from EM.run_em import run_em_def if __name__ == "__main__": - os.makedirs(f"output", exist_ok=True) conf_file = "conf/minimal-em-configuration.json" + with open(conf_file) as f: + json_conf = json.load(f) + output_dir = json_conf.get("output_dir", "output") + os.makedirs(output_dir, exist_ok=True) run_em_def(conf_file) From d9a1aad9e90ca5b8539945c330024ba38e5de6e4 Mon Sep 17 00:00:00 2001 From: yehezkr7 Date: Sun, 16 Feb 2025 15:57:39 +0200 Subject: [PATCH 02/12] Backup before release as a package --- filter.py | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ index.py | 42 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+) create mode 100644 filter.py create mode 100644 index.py diff --git a/filter.py b/filter.py new file mode 100644 index 0000000..214a180 --- /dev/null +++ b/filter.py @@ -0,0 +1,58 @@ +import csv +import os +import shutil + + +def remove(input_csv_path, strings_to_remove): + # Determine directory and backup file path. + directory = os.path.dirname(input_csv_path) + backup_csv_path = os.path.join(directory, "donor_becup.csv") + + # Create a backup copy of donor.csv + shutil.copy(input_csv_path, backup_csv_path) + print(f"Backup created: {backup_csv_path}") + + # Dictionary to count removals for each substring. + removal_counts = {substr: 0 for substr in strings_to_remove} + + updated_rows = [] + + # Read the original donor.csv + with open(input_csv_path, newline='', encoding='utf-8') as infile: + reader = csv.reader(infile) + for row in reader: + # Process only if the row has at least 2 columns. + if len(row) >= 2: + # Split the second column by '^' + elements = row[1].split('^') + new_elements = [] + for element in elements: + remove_flag = False + # Check if element contains any substring from strings_to_remove. + for substr in strings_to_remove: + if substr in element: + removal_counts[substr] += 1 + remove_flag = True + if not remove_flag: + new_elements.append(element) + # Reassemble the second column. + row[1] = "^".join(new_elements) + # Add (modified or not) the row to our updated_rows list. + updated_rows.append(row) + + # Overwrite donor.csv with the updated rows. + with open(input_csv_path, "w", newline='', encoding='utf-8') as outfile: + writer = csv.writer(outfile) + writer.writerows(updated_rows) + + # Print summary of removals for each substring. 
+    print("\nRemoval summary:")
+    for substr, count in removal_counts.items():
+        print(f"{substr}: removed {count}")
+
+
+# Example usage:
+if __name__ == "__main__":
+    donor_csv_path = "data/subjects/donor.csv"
+    strings_to_remove = ['DRBX', 'DRB3','DRB4','DRB5','DQA1','DPA1','DPB1']
+    remove(donor_csv_path, strings_to_remove)
diff --git a/index.py b/index.py
new file mode 100644
index 0000000..104fbbf
--- /dev/null
+++ b/index.py
@@ -0,0 +1,42 @@
+import csv
+
+
+def update_first_column_with_numbering(input_csv_path):
+    """
+    Reads the CSV file from input_csv_path.
+    For each row, replaces the first column with a sequential number starting at 1.
+    Overwrites the original CSV with the updated rows.
+
+    For example, if the original rows are:
+        hjsdh,...
+        kuayj,...
+        kjaha,...
+        husd,...
+    They will become:
+        1,...
+        2,...
+        3,...
+        4,...
+    """
+    updated_rows = []
+
+    # Read all rows from the original CSV.
+    with open(input_csv_path, newline='', encoding='utf-8') as infile:
+        reader = csv.reader(infile)
+        for index, row in enumerate(reader, start=1):
+            if row:  # Only process non-empty rows
+                row[0] = str(index)
+            updated_rows.append(row)
+
+    # Overwrite the CSV with the updated rows.
+    with open(input_csv_path, "w", newline='', encoding='utf-8') as outfile:
+        writer = csv.writer(outfile)
+        writer.writerows(updated_rows)
+
+    print(f"Updated the first column with numbering in {input_csv_path}.")
+
+
+# Example usage:
+if __name__ == "__main__":
+    donor_csv_path = "data/subjects/donor.csv"
+    update_first_column_with_numbering(donor_csv_path)

From 35e70a6c19aa3767b72aa29dd5f48565e556508f Mon Sep 17 00:00:00 2001
From: yehezkr7
Date: Sun, 16 Feb 2025 15:59:46 +0200
Subject: [PATCH 03/12] Backup before release as a package

---
 EM/run_em.py                                |  23 +-
 EM/runfile_em_mt.py                         | 227 +++++++-------------
 conf/minimal-em-configuration.json          |  25 ++-
 data/{ct_mr_don_10.txt => donors/donor.txt} |   0
 4 files changed, 115 insertions(+), 160 deletions(-)
 rename data/{ct_mr_don_10.txt => donors/donor.txt} (100%)

diff --git a/EM/run_em.py b/EM/run_em.py
index d34e4ee..1ad90c4 100644
--- a/EM/run_em.py
+++ b/EM/run_em.py
@@ -10,9 +10,10 @@
 import pathlib
 import gzip
 import json
+np.float_ = np.float64
 
 # import shutil
-
+import os
 
 def produce_hpf(conf_file):
     project_dir = ""
@@ -64,7 +65,7 @@ def run_em_def(
     with open(conf_file) as f:
         json_conf = json.load(f)
     graph_files_path = json_conf.get("graph_files_path")
-    output_dir = json_conf.get("output_dir", "output")
+    output_dir = json_conf.get("output_dir", "data")
     config = {
         "imputation_input_file": json_conf.get("imputation_in_file"),
         "freq_file": json_conf.get("freq_file"),
@@ -179,10 +180,26 @@ def run_em_def(
         not_converge = False
     print("Log Likelihood: " + str(logL))
 
+    pop_counts_file_path = json_conf.get("pops_count_file")  # Use json_conf, not conf_file
+
+    # Ensure the directory exists
+    pop_counts_dir = os.path.dirname(pop_counts_file_path)
+    os.makedirs(pop_counts_dir, exist_ok=True)
+
+    # Open the file for writing (this will overwrite any existing file).
+    with open(pop_counts_file_path, "w") as f:
+        # Iterate over the populations and the corresponding count values.
+        # The second column is left blank as per your requirement.
+        for pop, norm in zip(json_conf.get("populations"), count_by_prob):
+            raw_count = norm * num_saples
+            f.write("{},{},{}\n".format(pop, raw_count, norm))
+
+    print("Pop counts file generated at: {}".format(pop_counts_file_path))
+
     os.remove(config["node_file"])
     os.remove(config["top_links_file"])
     os.remove(config["edges_file"])
     os.remove(config["info_nodes"])
 
     file_lo.write("loglikelihood " + str(logL) + "\n")
-    produce_hpf(conf_file)
+    #
diff --git a/EM/runfile_em_mt.py b/EM/runfile_em_mt.py
index 288c19a..7a5ea81 100644
--- a/EM/runfile_em_mt.py
+++ b/EM/runfile_em_mt.py
@@ -1,27 +1,19 @@
 import argparse
-
-# import cProfile
 import json
 import pathlib
-
+import glob
 import sys
 import os
-import numpy as np
-
-# import threading
-from multiprocessing import Process
-import string, math
-import copy
+from os.path import exists
 
+import numpy as np
+np.float_ = np.float64
 import multiprocessing
-
-# The default MP method is spawn, use "fork"
 multiprocessing.set_start_method("fork")
-
 from grim import grim
+import math
+import string, copy
 
-
-# Profiler start
 def run(
     plan_b,
     config_file,
@@ -32,22 +24,13 @@ def run(
     num_subjects=None,
     project_dir_graph="",
 ):
     project_dir = ""
-
-    # project_dir = ""# "../"
-
-    # Read configuration file and load properties
     with open(config_file) as f:
         json_conf = json.load(f)
-    output_dir = json_conf.get("output_dir", "output")
+    output_dir = json_conf.get("output_dir", "data")
     graph_files_path = json_conf.get("graph_files_path")
     if graph_files_path[-1] != "/":
         graph_files_path += "/"
-    # output_dir = json_conf.get("imuptation_out_path", "output")
-    # output_dir = "output/"
-    # Read configuration file and load properties
-
     config = {
         "planb": json_conf.get("planb", True),
         "pops": json_conf.get("populations"),
@@ -57,56 +40,31 @@ def run(
         "number_of_pop_results": json_conf.get("number_of_pop_results", 100),
         "output_MUUG": json_conf.get("output_MUUG", False),
         "output_haplotypes": json_conf.get("output_haplotypes", True),
-        "node_file": project_dir_graph
-        + graph_files_path
-        + json_conf.get("node_csv_file"),
-        "top_links_file": project_dir_graph
-        + graph_files_path
-        + json_conf.get("top_links_csv_file"),
-        "edges_file": project_dir_graph
-        + graph_files_path
-        + json_conf.get("edges_csv_file"),
+        "node_file": project_dir_graph + graph_files_path + json_conf.get("node_csv_file"),
+        "top_links_file": project_dir_graph + graph_files_path + json_conf.get("top_links_csv_file"),
+        "edges_file": project_dir_graph + graph_files_path + json_conf.get("edges_csv_file"),
         "imputation_input_file": json_conf.get("imputation_in_file"),
-        "imputation_out_umug_freq_file": output_dir
-        + json_conf.get("imputation_out_umug_freq_filename", "None"),
-        "imputation_out_umug_pops_file": output_dir
-        + json_conf.get("imputation_out_umug_pops_filename", "None"),
-        "imputation_out_hap_freq_file": output_dir
-        + json_conf.get("imputation_out_hap_freq_filename"),
-        "imputation_out_hap_pops_file": output_dir
-        + json_conf.get("imputation_out_hap_pops_filename", "None"),
-        "imputation_out_miss_file": output_dir
-        + json_conf.get("imputation_out_miss_filename", "miss.txt"),
-        "imputation_out_problem_file": output_dir
-        + json_conf.get("imputation_out_problem_filename", "problem.txt"),
+        "imputation_out_umug_freq_file": output_dir + json_conf.get("imputation_out_umug_freq_filename", "None"),
+        "imputation_out_umug_pops_file": output_dir + json_conf.get("imputation_out_umug_pops_filename", "None"),
+        "imputation_out_hap_freq_file": output_dir + json_conf.get("imputation_out_hap_freq_filename"),
json_conf.get("imputation_out_hap_freq_filename"), + "imputation_out_hap_pops_file": output_dir + json_conf.get("imputation_out_hap_pops_filename", "None"), + "imputation_out_miss_file": output_dir + json_conf.get("imputation_out_miss_filename", "miss.txt"), + "imputation_out_problem_file": output_dir + json_conf.get("imputation_out_problem_filename", "problem.txt"), "factor_missing_data": json_conf.get("factor_missing_data", 0.01), - "loci_map": json_conf.get( - "loci_map", {"A": "A", "B": "B", "C": "C", "DQB1": "Q", "DRB1": "R"} - ), - "loci_map": json_conf.get( - "loci_map", {"A": 1, "B": 3, "C": 2, "DQB1": 4, "DRB1": 5} - ), - "matrix_planb": json_conf.get( - "Plan_B_Matrix", - [ + "loci_map": json_conf.get("loci_map", {"A": 1, "B": 3, "C": 2, "DQB1": 4, "DRB1": 5}), + "matrix_planb": json_conf.get("Plan_B_Matrix", [ [[1, 2, 3, 4, 5]], [[1, 2, 3], [4, 5]], [[1], [2, 3], [4, 5]], [[1, 2, 3], [4], [5]], [[1], [2, 3], [4], [5]], [[1], [2], [3], [4], [5]], - ], - ), + ]), "pops_count_file": project_dir + json_conf.get("pops_count_file", ""), "use_pops_count_file": json_conf.get("pops_count_file", False), - "number_of_options_threshold": json_conf.get( - "number_of_options_threshold", 100000 - ), - "max_haplotypes_number_in_phase": json_conf.get( - "max_haplotypes_number_in_phase", 100 - ), - "bin_imputation_input_file": project_dir - + json_conf.get("bin_imputation_in_file", "None"), + "number_of_options_threshold": json_conf.get("number_of_options_threshold", 100000), + "max_haplotypes_number_in_phase": json_conf.get("max_haplotypes_number_in_phase", 100), + "bin_imputation_input_file": project_dir + json_conf.get("bin_imputation_in_file", "None"), "num_thread": json_conf.get("num_threads", 1), "nodes_for_plan_A": json_conf.get("Plan_A_Matrix", []), "save_mode": json_conf.get("save_space_mode", False), @@ -116,17 +74,12 @@ def run( for _, val in config["loci_map"].items(): all_loci_set.add(str(val)) config["full_loci"] = "".join(sorted(all_loci_set)) - config["imputation_out_hap_pops_file"] = ( - config["imputation_out_hap_pops_file"] + str(iteration) + ".txt" - ) + config["imputation_out_hap_pops_file"] = config["imputation_out_hap_pops_file"] + str(iteration) + ".txt" if pop: config["pops"] = pop - # Display the configurations we are using - print( - "****************************************************************************************************" - ) + print("****************************************************************************************************") print("Performing imputation based on:") print("\tPopulation: {}".format(config["pops"])) print("\tPriority: {}".format(config["priority"])) @@ -138,23 +91,11 @@ def run( print("\tTop Links File: {}".format(config["edges_file"])) print("\tInput File: {}".format(config["imputation_input_file"])) print("\tOutput UMUG Format: {}".format(config["output_MUUG"])) - print( - "\tOutput UMUG Freq Filename: {}".format( - config["imputation_out_umug_freq_file"] - ) - ) - print( - "\tOutput UMUG Pops Filename: {}".format( - config["imputation_out_umug_pops_file"] - ) - ) + print("\tOutput UMUG Freq Filename: {}".format(config["imputation_out_umug_freq_file"])) + print("\tOutput UMUG Pops Filename: {}".format(config["imputation_out_umug_pops_file"])) print("\tOutput Haplotype Format: {}".format(config["output_haplotypes"])) - print( - "\tOutput HAP Freq Filename: {}".format(config["imputation_out_hap_freq_file"]) - ) - print( - "\tOutput HAP Pops Filename: {}".format(config["imputation_out_hap_pops_file"]) - ) + print("\tOutput HAP Freq 
+    print("\tOutput HAP Pops Filename: {}".format(config["imputation_out_hap_pops_file"]))
     print("\tOutput Miss Filename: {}".format(config["imputation_out_miss_file"]))
     print("\tOutput Problem Filename: {}".format(config["imputation_out_problem_file"]))
     print("\tFactor Missing Data: {}".format(config["factor_missing_data"]))
     print("\tLoci Map: {}".format(config["loci_map"]))
     print("\tPlan B Matrix: {}".format(config["matrix_planb"]))
     print("\tPops Count File: {}".format(config["pops_count_file"]))
     print("\tUse Pops Count File: {}".format(config["use_pops_count_file"]))
-    print(
-        "\tNumber of Options Threshold: {}".format(
-            config["number_of_options_threshold"]
-        )
-    )
-    print(
-        "\tMax Number of haplotypes in phase: {}".format(
-            config["max_haplotypes_number_in_phase"]
-        )
-    )
+    print("\tNumber of Options Threshold: {}".format(config["number_of_options_threshold"]))
+    print("\tMax Number of haplotypes in phase: {}".format(config["max_haplotypes_number_in_phase"]))
     if config["nodes_for_plan_A"]:
         print("\tNodes in plan A: {}".format(config["nodes_for_plan_A"]))
         print("\tSave space mode: {}".format(config["save_mode"]))
-    print(
-        "****************************************************************************************************"
-    )
+    print("****************************************************************************************************")
 
-    # Perform imputation
+    # Create graph instance
     graph = grim.graph_instance(config)
 
     imputation_list = []
-
-    # Create output directory if it doesn't exist
     pathlib.Path(output_dir).mkdir(parents=False, exist_ok=True)
 
     input_file = config["imputation_input_file"]
     in_dir = os.path.dirname(input_file)
     if in_dir == "":
         in_dir = "."
-    # if project_dir_in_file != "":
-    #     in_dir = "splited_data_for_em"
-    #     pathlib.Path(in_dir).mkdir(parents=False, exist_ok=True)
 
     in_file_basename = os.path.basename(input_file)
     if not num_subjects:
@@ -199,66 +129,72 @@ def run(
     num_subjects = int(num_subjects.strip().split(" ")[0]) + 1
     print(num_subjects)
 
-    output_dir = config.get("graph_files_path", "output")
+    # Create output directory for split files
+    output_ct = "output_ct"
+    os.makedirs(output_ct, exist_ok=True)
+    # Use output_ct directory directly rather than combining with in_dir
+    split_prefix = os.path.join(output_ct, in_file_basename[0:2])
     split_cmd = (
         "split -l "
         + str(int(math.ceil(num_subjects / config["num_thread"])))
         + " "
        + input_file
         + " "
-        + in_dir
-        + "/"
-        + f"../{output_dir}/{in_file_basename[0:2]}"
+        + split_prefix
     )
     print(f"Split Command: {split_cmd}")
     os.system(split_cmd)
 
     alpha = string.ascii_lowercase
-    therads_list = list()
+    threads_list = []
     config_list = []
     for i in range(config["num_thread"]):
         imputation = grim.impute_instance(config, graph, count_by_prob=count_by_prob)
         imputation_list.append(imputation)
-        in_file = (
-            in_dir
-            + "/"
-            + f"../{output_dir}/{in_file_basename[0:2]}"
-            + alpha[int(i / 26)]
-            + alpha[int(i % 26)]
-        )
+        # Construct the split file name using the split_prefix
+        in_file = split_prefix + alpha[int(i / 26)] + alpha[int(i % 26)]
         print(in_file)
-        output_file = output_dir + "/" + os.path.basename(in_file) + "_out"
+        output_file = os.path.join(output_ct, os.path.basename(in_file) + "_out")
         config_list.append(copy.deepcopy(config))
         config_list[i]["imputation_input_file"] = in_file
         config_list[i]["imputation_out_hap_freq_file"] = output_file
 
     for i in range(config["num_thread"]):
-        t = Process(
+        t = multiprocessing.Process(
             target=imputation_list[i].impute_file,
-            args=(
-                config_list[i],
-                plan_b,
-                em_mr,
-                True,
-            ),
+            args=(config_list[i], plan_b, em_mr, True),
         )
-        therads_list.append(t)
-    for i in range(config["num_thread"]):
-        therads_list[i].start()
-    for i in range(config["num_thread"]):
-        therads_list[i].join()
-    for i in range(config["num_thread"]):
-        therads_list[i].terminate()
-
-    f_out = open(config["imputation_out_hap_freq_file"], "w")
-    for i in range(config["num_thread"]):
-
-        with open(config_list[i]["imputation_out_hap_freq_file"]) as f_t_out:
-            for line in f_t_out:
-                f_out.write(line)
-            f_t_out.close()
-        os.remove(config_list[i]["imputation_out_hap_freq_file"])
-
-    # Profiler end
-    # pr.disable()
-    # pr.print_stats(sort="time")
+        threads_list.append(t)
+    for t in threads_list:
+        t.start()
+    for t in threads_list:
+        t.join()
+    for t in threads_list:
+        t.terminate()
+
+    with open(config["imputation_out_hap_freq_file"], "w") as f_out:
+        for i in range(config["num_thread"]):
+            with open(config_list[i]["imputation_out_hap_freq_file"]) as f_t_out:
+                for line in f_t_out:
+                    f_out.write(line)
+                f_t_out.close()
+            os.remove(config_list[i]["imputation_out_hap_freq_file"])
+
+    print("Starting cleanup of temporary split files.")
+    temp_file_prefix = in_file_basename[0:2]
+    temp_files_pattern = os.path.join(output_ct, temp_file_prefix + "*")
+    temp_files = glob.glob(temp_files_pattern)
+    if temp_files:
+        for temp_file in temp_files:
+            print(f"Removing temporary file: {temp_file}")
+            try:
+                os.remove(temp_file)
+            except Exception as e:
+                print(f"Error removing file {temp_file}: {e}")
+        print("All temporary files removed.")
+    else:
+        print("No temporary files found with pattern:", temp_files_pattern)
+    try:
+        os.rmdir(output_ct)
+        print(f"Temporary directory '{output_ct}' removed.")
+    except Exception as e:
+        print(f"Error removing directory '{output_ct}': {e}")
diff --git a/conf/minimal-em-configuration.json b/conf/minimal-em-configuration.json
index f9d338d..2895865 100644
--- a/conf/minimal-em-configuration.json
+++ b/conf/minimal-em-configuration.json
@@ -1,7 +1,8 @@
 {
   "populations": [
-    "AAFA",
-    "CAU"
+    "CAU",
+    "AAFA"
+
   ],
 
   "FULL_LOCI": "ABCQR",
@@ -13,12 +14,13 @@
     "DQB1": 4,
     "DRB1": 5
   },
-  "graph_files_path": "data/" ,
-  "node_csv_file": "nodes.csv",
+  "graph_files_path": "data/" ,
+  "node_csv_file": "nodes.csv",
   "edges_csv_file": "edges.csv",
   "info_node_csv_file": "info_node.csv",
   "top_links_csv_file": "top_links.csv",
-  "imputation_in_file": "data/ct_mr_don_10.txt",
+  "imputation_in_file": "data/donors/donor.txt",
+  "ct_files": "data",
   "Plan_B_Matrix": [
     [[1, 2, 3, 4, 5]],
     [[1, 2, 3], [4, 5]],
@@ -30,9 +32,9 @@
 
   "imputation_out_hap_freq_filename": "don.hap.freqs",
 
-  "freq_file": "data/hpf.csv",
+  "freq_file": "output/hpf.csv",
 
-  "priority": {
+  "priority": {
     "alpha": 0.4999999,
     "eta": 0,
     "beta": 1e-7,
@@ -49,10 +51,16 @@
 
   "init_cutoff": 100,
   "max_iterations": 50,
-  "logLikelihood_file": "data/log_likelihood.txt",
+  "logLikelihood_file": "data/log_likelihood.txt",
   "memory_min_size": 100000000,
   "memory_max_size": 600000000,
   "num_threads": 1,
-  "run_just_SR_EM": false,
-  "freq_data_dir": "data/freqs",
-  "pops_count_file": "data/pop_counts_file.txt",
-  "output_dir": "data/"
+  "run_just_SR_EM": false,
+  "freq_data_dir": "data/freqs",
+  "pops_count_file": "output/pop_counts_file.txt",
+  "imputation_out_umug_freq_filename": "umug_freq",
+  "imputation_out_umug_pops_filename": "umug_pops",
+  "imputation_out_hap_pops_filename": "hap_pops",
+  "output_dir": "data/",
+  "output_ct": "output_ct"
 }
diff --git a/data/ct_mr_don_10.txt b/data/donors/donor.txt
similarity index 100%
rename from data/ct_mr_don_10.txt
rename to data/donors/donor.txt

From 8a1c32da61daee45fb909bcc2458d37578fe485b Mon Sep 17 00:00:00 2001
From: yehezkr7
Date: Sun, 16 Feb 2025 17:01:53 +0200
Subject: [PATCH 04/12] updated requirements

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 4112647..4050952 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,2 @@
 toml==0.10.2
-py-graph-imputation==0.0.12
+py-graph-imputation>=0.0.12

From 89b7529fdbb397e13afab411f05d2f963acb74cd Mon Sep 17 00:00:00 2001
From: yehezkr7
Date: Sun, 16 Feb 2025 19:28:23 +0200
Subject: [PATCH 05/12] hopefully now we could import and use test_em when this repo is pip installed in other repos

---
 MANIFEST.in | 17 +++++++++++++
 setup.py    | 72 ++++++++++++++++++++++-------------------------------
 2 files changed, 47 insertions(+), 42 deletions(-)
 create mode 100644 MANIFEST.in

diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..dbf0cda
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,17 @@
+include LICENSE
+include README.md
+include HISTORY.rst
+include requirements.txt
+include requirements-tests.txt
+
+# Include test_em.py explicitly
+include test_em.py
+
+# Include all files from the EM package and its subdirectories
+recursive-include EM *
+
+# Ensure all non-Python files (e.g., data files) inside `EM` are included
+recursive-include EM *.csv *.txt *.json *.yaml *.yml *.cfg
+
+# Include additional configuration files if applicable
+include setup.cfg
diff --git a/setup.py b/setup.py
index fb2202a..496bde9 100644
--- a/setup.py
+++ b/setup.py
@@ -1,50 +1,35 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-#
-# py-graph-em Graph EM
-# Copyright (c) 2021 Be The Match operated by National Marrow Donor Program. All Rights Reserved.
-#
-# This library is free software; you can redistribute it and/or modify it
-# under the terms of the GNU Lesser General Public License as published
-# by the Free Software Foundation; either version 3 of the License, or (at
-# your option) any later version.
-#
-# This library is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; with out even the implied warranty of MERCHANTABILITY or
-# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
-# License for more details.
-#
-# You should have received a copy of the GNU Lesser General Public License
-# along with this library; if not, write to the Free Software Foundation,
-# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
-#
-# > http://www.fsf.org/licensing/licenses/lgpl.html
-# > http://www.opensource.org/licenses/lgpl-license.php
-#
-
-
-"""The setup script."""
-
+import os
 from setuptools import setup, find_packages
 
-with open("README.md") as readme_file:
-    readme = readme_file.read()
+# Get the absolute path to this directory.
+here = os.path.abspath(os.path.dirname(__file__))
 
-with open("HISTORY.rst") as history_file:
-    history = history_file.read()
+# Read the contents of README.md for the long description.
+with open(os.path.join(here, "README.md"), encoding="utf-8") as f:
+    readme = f.read()
 
-with open("requirements.txt") as requirements_file:
-    requirements = requirements_file.read().split("\n")
+# Try to read HISTORY.rst; if it doesn't exist, use an empty string.
+try:
+    with open(os.path.join(here, "HISTORY.rst"), encoding="utf-8") as f:
+        history = f.read()
+except FileNotFoundError:
+    history = ""
 
-with open("requirements-tests.txt") as requirements_file:
-    test_requirements = requirements_file.read().split("\n")
+# Read the requirements and test requirements files.
+with open(os.path.join(here, "requirements.txt"), encoding="utf-8") as f:
+    requirements = f.read().splitlines()
+
+with open(os.path.join(here, "requirements-tests.txt"), encoding="utf-8") as f:
+    test_requirements = f.read().splitlines()
 
 setup(
-    name="py-graph-em",
-    version="0.0.3",
-    author="Pradeep Bashyal",
-    author_email="pbashyal@nmdp.org",
+    name="py-graph-EM",
+    version="0.0.1",
+    author="Regev Yehezkel Imra",
+    author_email="regevel2006@gmail.com",
     python_requires=">=3.8",
     classifiers=[
         "Development Status :: 2 - Pre-Alpha",
@@ -56,15 +41,18 @@
         "Programming Language :: Python :: 3.10",
     ],
     description="Graph Based EM",
-    install_requires=requirements,
-    license="LGPL 3.0",
     long_description=readme + "\n\n" + history,
     long_description_content_type="text/markdown",
-    include_package_data=True,
+    install_requires=requirements,
+    license="LGPL 3.0",
     keywords="Graph, EM",
-    packages=find_packages(include=["EM"]),
+    # Explicitly include the 'EM' package and any subpackages.
+    packages=find_packages(include=["EM", "EM.*"]),
+    include_package_data=True,
+    # Include test_em.py as a script so it gets installed in the environment’s bin/ folder.
+    scripts=["test_em.py"],
     test_suite="tests",
     tests_require=test_requirements,
-    url="https://github.com/nmdp-bioinformatics/py-graph-em",
+    url="https://github.com/Regev32/py-graph-em",
     zip_safe=False,
 )

From 2afefc342c4f774fec937c82f05199e148e1eee9 Mon Sep 17 00:00:00 2001
From: yehezkr7
Date: Sun, 16 Feb 2025 19:42:43 +0200
Subject: [PATCH 06/12] hopefully now we could import and use test_em when this repo is pip installed in other repos

---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index 496bde9..e3316c6 100644
--- a/setup.py
+++ b/setup.py
@@ -48,6 +48,7 @@
     keywords="Graph, EM",
     # Explicitly include the 'EM' package and any subpackages.
     packages=find_packages(include=["EM", "EM.*"]),
+    py_modules=["test_em"],
     include_package_data=True,
     # Include test_em.py as a script so it gets installed in the environment’s bin/ folder.
     scripts=["test_em.py"],

From 25a0ca4745fe5b1b9ee11c9a3428aae9f0851fa3 Mon Sep 17 00:00:00 2001
From: yehezkr7
Date: Sun, 16 Feb 2025 19:53:14 +0200
Subject: [PATCH 07/12] Fix setup.py to include test_em.py as a module

---
 setup.py | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/setup.py b/setup.py
index e3316c6..13ba1a0 100644
--- a/setup.py
+++ b/setup.py
@@ -4,21 +4,17 @@
 import os
 from setuptools import setup, find_packages
 
-# Get the absolute path to this directory.
 here = os.path.abspath(os.path.dirname(__file__))
 
-# Read the contents of README.md for the long description.
 with open(os.path.join(here, "README.md"), encoding="utf-8") as f:
     readme = f.read()
 
-# Try to read HISTORY.rst; if it doesn't exist, use an empty string.
 try:
     with open(os.path.join(here, "HISTORY.rst"), encoding="utf-8") as f:
         history = f.read()
 except FileNotFoundError:
     history = ""
 
-# Read the requirements and test requirements files.
 with open(os.path.join(here, "requirements.txt"), encoding="utf-8") as f:
     requirements = f.read().splitlines()
 
@@ -46,12 +42,11 @@
     install_requires=requirements,
     license="LGPL 3.0",
     keywords="Graph, EM",
-    # Explicitly include the 'EM' package and any subpackages.
+    # Explicitly include the 'EM' package and its subpackages.
     packages=find_packages(include=["EM", "EM.*"]),
-    py_modules=["test_em"],
     include_package_data=True,
-    # Include test_em.py as a script so it gets installed in the environment’s bin/ folder.
-    scripts=["test_em.py"],
+    # Remove the scripts argument and include test_em.py as a module:
+    py_modules=["test_em"],
     test_suite="tests",
     tests_require=test_requirements,
     url="https://github.com/Regev32/py-graph-em",

From fc614eba3cbd4acd397c2127eb04ce8fb12bacc8 Mon Sep 17 00:00:00 2001
From: yehezkr7
Date: Mon, 17 Feb 2025 12:11:29 +0200
Subject: [PATCH 08/12] Fix setup.py to include test_em.py as a module

---
 HISTORY.rst |  2 +-
 MANIFEST.in |  8 ++++----
 __init__.py |  0
 setup.py    |  5 ++---
 4 files changed, 7 insertions(+), 8 deletions(-)
 create mode 100644 __init__.py

diff --git a/HISTORY.rst b/HISTORY.rst
index 93c23ae..5fc601f 100644
--- a/HISTORY.rst
+++ b/HISTORY.rst
@@ -2,7 +2,7 @@
 History
 =======
 
-0.0.1 (2021-08-25)
+0.0.1 (2025-02-16)
 ------------------
 
 * First release on PyPI.
diff --git a/MANIFEST.in b/MANIFEST.in
index dbf0cda..429228c 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -4,14 +4,14 @@
 include HISTORY.rst
 include requirements.txt
 include requirements-tests.txt
 
-# Include test_em.py explicitly
+# Include test_em.py so that it's part of the source distribution.
 include test_em.py
 
-# Include all files from the EM package and its subdirectories
+# Include all files from the EM package and its subdirectories.
 recursive-include EM *
 
-# Ensure all non-Python files (e.g., data files) inside `EM` are included
+# Optionally, include specific non-Python file types from EM:
 recursive-include EM *.csv *.txt *.json *.yaml *.yml *.cfg
 
-# Include additional configuration files if applicable
+# Include additional configuration files if needed.
 include setup.cfg
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/setup.py b/setup.py
index 13ba1a0..2bd158f 100644
--- a/setup.py
+++ b/setup.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
 import os
@@ -42,10 +41,10 @@
     install_requires=requirements,
     license="LGPL 3.0",
     keywords="Graph, EM",
-    # Explicitly include the 'EM' package and its subpackages.
+    # Include the main package and its subpackages.
     packages=find_packages(include=["EM", "EM.*"]),
     include_package_data=True,
-    # Remove the scripts argument and include test_em.py as a module:
+    # Install test_em.py as an importable module.
     py_modules=["test_em"],
     test_suite="tests",
     tests_require=test_requirements,

From 5c2f693e9a378cfa391dcf803a382cec12e1a575 Mon Sep 17 00:00:00 2001
From: yehezkr7
Date: Tue, 18 Feb 2025 13:58:14 +0200
Subject: [PATCH 09/12] restored everything in order to pull

---
 MANIFEST.in | 17 -------------
 setup.py    | 69 +++++++++++++++++++++++++++++++++--------------------
 2 files changed, 43 insertions(+), 43 deletions(-)
 delete mode 100644 MANIFEST.in

diff --git a/MANIFEST.in b/MANIFEST.in
deleted file mode 100644
index 429228c..0000000
--- a/MANIFEST.in
+++ /dev/null
@@ -1,17 +0,0 @@
-include LICENSE
-include README.md
-include HISTORY.rst
-include requirements.txt
-include requirements-tests.txt
-
-# Include test_em.py so that it's part of the source distribution.
-include test_em.py
-
-# Include all files from the EM package and its subdirectories.
-recursive-include EM *
-
-# Optionally, include specific non-Python file types from EM:
-recursive-include EM *.csv *.txt *.json *.yaml *.yml *.cfg
-
-# Include additional configuration files if needed.
-include setup.cfg
diff --git a/setup.py b/setup.py
index 2bd158f..fb2202a 100644
--- a/setup.py
+++ b/setup.py
@@ -1,30 +1,50 @@
+#!/usr/bin/env python
 # -*- coding: utf-8 -*-
+#
+# py-graph-em Graph EM
+# Copyright (c) 2021 Be The Match operated by National Marrow Donor Program. All Rights Reserved.
+#
+# This library is free software; you can redistribute it and/or modify it
+# under the terms of the GNU Lesser General Public License as published
+# by the Free Software Foundation; either version 3 of the License, or (at
+# your option) any later version.
+#
+# This library is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; with out even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this library; if not, write to the Free Software Foundation,
+# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+#
+# > http://www.fsf.org/licensing/licenses/lgpl.html
+# > http://www.opensource.org/licenses/lgpl-license.php
+#
 
-import os
-from setuptools import setup, find_packages
 
-here = os.path.abspath(os.path.dirname(__file__))
+"""The setup script."""
 
-with open(os.path.join(here, "README.md"), encoding="utf-8") as f:
-    readme = f.read()
+from setuptools import setup, find_packages
 
-try:
-    with open(os.path.join(here, "HISTORY.rst"), encoding="utf-8") as f:
-        history = f.read()
-except FileNotFoundError:
-    history = ""
+with open("README.md") as readme_file:
+    readme = readme_file.read()
 
-with open(os.path.join(here, "requirements.txt"), encoding="utf-8") as f:
-    requirements = f.read().splitlines()
+with open("HISTORY.rst") as history_file:
+    history = history_file.read()
 
-with open(os.path.join(here, "requirements-tests.txt"), encoding="utf-8") as f:
-    test_requirements = f.read().splitlines()
+with open("requirements.txt") as requirements_file:
+    requirements = requirements_file.read().split("\n")
+
+with open("requirements-tests.txt") as requirements_file:
+    test_requirements = requirements_file.read().split("\n")
 
 setup(
-    name="py-graph-EM",
-    version="0.0.1",
-    author="Regev Yehezkel Imra",
-    author_email="regevel2006@gmail.com",
+    name="py-graph-em",
+    version="0.0.3",
+    author="Pradeep Bashyal",
+    author_email="pbashyal@nmdp.org",
     python_requires=">=3.8",
     classifiers=[
         "Development Status :: 2 - Pre-Alpha",
@@ -41,18 +56,15 @@
         "Programming Language :: Python :: 3.10",
     ],
     description="Graph Based EM",
-    long_description=readme + "\n\n" + history,
-    long_description_content_type="text/markdown",
-    install_requires=requirements,
-    license="LGPL 3.0",
+    install_requires=requirements,
+    license="LGPL 3.0",
+    long_description=readme + "\n\n" + history,
+    long_description_content_type="text/markdown",
     include_package_data=True,
     keywords="Graph, EM",
-    # Include the main package and its subpackages.
-    packages=find_packages(include=["EM", "EM.*"]),
+    packages=find_packages(include=["EM"]),
-    # Install test_em.py as an importable module.
-    py_modules=["test_em"],
     test_suite="tests",
     tests_require=test_requirements,
-    url="https://github.com/Regev32/py-graph-em",
+    url="https://github.com/nmdp-bioinformatics/py-graph-em",
     zip_safe=False,
 )

From 646bed8ed8fd89c75abe1e986b1243b66abfb89a Mon Sep 17 00:00:00 2001
From: yehezkr7
Date: Tue, 18 Feb 2025 14:05:16 +0200
Subject: [PATCH 10/12] deleted "produce_hpf"

---
 EM/run_em.py | 37 -------------------------------------
 1 file changed, 37 deletions(-)

diff --git a/EM/run_em.py b/EM/run_em.py
index 1ad90c4..39e2f7f 100644
--- a/EM/run_em.py
+++ b/EM/run_em.py
@@ -15,43 +15,6 @@
 # import shutil
 import os
 
-def produce_hpf(conf_file):
-    project_dir = ""
-
-    # Read configuration file and load properties
-    with open(conf_file) as f:
-        conf = json.load(f)
-
-    pops = conf.get("populations")
-    freq_data_dir = project_dir + conf.get("freq_data_dir")
-    output_dir = project_dir + conf.get("graph_files_path")
-    pop_ratio_dir = project_dir + conf.get("pops_count_file")
-
-    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
-
-    list_pop_count = []
-
-    for pop in pops:
-        in_freq_file = freq_data_dir + "/" + pop + ".freqs.gz"
-        with gzip.open(in_freq_file, "rb") as zf:
-            count_pop = 0
-            lines = [x.decode("utf8").strip() for x in zf.readlines()]
-            for hap_line in lines:
-                haplotype, count, freq = hap_line.split(",")
-                if haplotype == "Haplo":
-                    continue
-                if float(freq) == 0.0:
-                    continue
-                count_pop += float(count)
-        list_pop_count.append(count_pop)
-
-    sum_pops = sum(list_pop_count)
-
-    with open(pop_ratio_dir, "w") as pop_ratio_file:
-        for pop, ratio in zip(pops, list_pop_count):
-            pop_ratio_file.write("{},{},{}\n".format(pop, ratio, ratio / sum_pops))
-    print(f"Writing pop_counts_file to: {pop_ratio_dir}")
-
 
 def run_em_def(
     conf_file,

From c1aa7b903893dd7bd94e4c4186e0918601a93a4c Mon Sep 17 00:00:00 2001
From: yehezkr7
Date: Tue, 18 Feb 2025 14:10:36 +0200
Subject: [PATCH 11/12] preparing to pull

---
 HISTORY.rst                        |  2 +-
 conf/minimal-em-configuration.json | 26 +++++---------
 filter.py                          | 58 ------------------------------
 index.py                           | 42 ----------------------
 4 files changed, 10 insertions(+), 118 deletions(-)
 delete mode 100644 filter.py
 delete mode 100644 index.py

diff --git a/HISTORY.rst b/HISTORY.rst
index 5fc601f..93c23ae 100644
--- a/HISTORY.rst
+++ b/HISTORY.rst
@@ -2,7 +2,7 @@
 History
 =======
 
-0.0.1 (2025-02-16)
+0.0.1 (2021-08-25)
 ------------------
 
 * First release on PyPI.
diff --git a/conf/minimal-em-configuration.json b/conf/minimal-em-configuration.json
index 2895865..dfdff2d 100644
--- a/conf/minimal-em-configuration.json
+++ b/conf/minimal-em-configuration.json
@@ -1,8 +1,7 @@
 {
   "populations": [
-    "CAU",
-    "AAFA"
-
+    "AAFA",
+    "CAU"
   ],
 
   "FULL_LOCI": "ABCQR",
@@ -14,13 +13,12 @@
     "DQB1": 4,
     "DRB1": 5
   },
-  "graph_files_path": "data/" ,
-  "node_csv_file": "nodes.csv",
+  "graph_files_path": "output/" ,
+  "node_csv_file": "nodes.csv",
   "edges_csv_file": "edges.csv",
   "info_node_csv_file": "info_node.csv",
   "top_links_csv_file": "top_links.csv",
-  "imputation_in_file": "data/donors/donor.txt",
-  "ct_files": "data",
+  "imputation_in_file": "data/ct_mr_don_10.txt",
   "Plan_B_Matrix": [
     [[1, 2, 3, 4, 5]],
     [[1, 2, 3], [4, 5]],
@@ -32,9 +30,9 @@
 
   "imputation_out_hap_freq_filename": "don.hap.freqs",
 
   "freq_file": "output/hpf.csv",
 
-  "priority": {
+  "priority": {
     "alpha": 0.4999999,
     "eta": 0,
     "beta": 1e-7,
@@ -51,16 +49,10 @@
 
   "init_cutoff": 100,
   "max_iterations": 50,
-  "logLikelihood_file": "data/log_likelihood.txt",
+  "logLikelihood_file": "output/log_likelihood.txt",
   "memory_min_size": 100000000,
   "memory_max_size": 600000000,
   "num_threads": 1,
-  "run_just_SR_EM": false,
-  "freq_data_dir": "data/freqs",
-  "pops_count_file": "output/pop_counts_file.txt",
-  "imputation_out_umug_freq_filename": "umug_freq",
-  "imputation_out_umug_pops_filename": "umug_pops",
-  "imputation_out_hap_pops_filename": "hap_pops",
-  "output_dir": "data/",
-  "output_ct": "output_ct"
+  "run_just_SR_EM": false
+
 }
diff --git a/filter.py b/filter.py
deleted file mode 100644
index 214a180..0000000
--- a/filter.py
+++ /dev/null
@@ -1,58 +0,0 @@
-import csv
-import os
-import shutil
-
-
-def remove(input_csv_path, strings_to_remove):
-    # Determine directory and backup file path.
-    directory = os.path.dirname(input_csv_path)
-    backup_csv_path = os.path.join(directory, "donor_becup.csv")
-
-    # Create a backup copy of donor.csv
-    shutil.copy(input_csv_path, backup_csv_path)
-    print(f"Backup created: {backup_csv_path}")
-
-    # Dictionary to count removals for each substring.
-    removal_counts = {substr: 0 for substr in strings_to_remove}
-
-    updated_rows = []
-
-    # Read the original donor.csv
-    with open(input_csv_path, newline='', encoding='utf-8') as infile:
-        reader = csv.reader(infile)
-        for row in reader:
-            # Process only if the row has at least 2 columns.
-            if len(row) >= 2:
-                # Split the second column by '^'
-                elements = row[1].split('^')
-                new_elements = []
-                for element in elements:
-                    remove_flag = False
-                    # Check if element contains any substring from strings_to_remove.
-                    for substr in strings_to_remove:
-                        if substr in element:
-                            removal_counts[substr] += 1
-                            remove_flag = True
-                    if not remove_flag:
-                        new_elements.append(element)
-                # Reassemble the second column.
-                row[1] = "^".join(new_elements)
-            # Add (modified or not) the row to our updated_rows list.
-            updated_rows.append(row)
-
-    # Overwrite donor.csv with the updated rows.
-    with open(input_csv_path, "w", newline='', encoding='utf-8') as outfile:
-        writer = csv.writer(outfile)
-        writer.writerows(updated_rows)
-
-    # Print summary of removals for each substring.
- print("\nRemoval summary:") - for substr, count in removal_counts.items(): - print(f"{substr}: removed {count}") - - -# Example usage: -if __name__ == "__main__": - donor_csv_path = "data/subjects/donor.csv" - strings_to_remove = ['DRBX', 'DRB3','DRB4','DRB5','DQA1','DPA1','DPB1'] - remove(donor_csv_path, strings_to_remove) diff --git a/index.py b/index.py deleted file mode 100644 index 104fbbf..0000000 --- a/index.py +++ /dev/null @@ -1,42 +0,0 @@ -import csv - - -def update_first_column_with_numbering(input_csv_path): - """ - Reads the CSV file from input_csv_path. - For each row, replaces the first column with a sequential number starting at 1. - Overwrites the original CSV with the updated rows. - - For example, if the original rows are: - hjsdh,... - kuayj,... - kjaha,... - husd,... - They will become: - 1,... - 2,... - 3,... - 4,... - """ - updated_rows = [] - - # Read all rows from the original CSV. - with open(input_csv_path, newline='', encoding='utf-8') as infile: - reader = csv.reader(infile) - for index, row in enumerate(reader, start=1): - if row: # Only process non-empty rows - row[0] = str(index) - updated_rows.append(row) - - # Overwrite the CSV with the updated rows. - with open(input_csv_path, "w", newline='', encoding='utf-8') as outfile: - writer = csv.writer(outfile) - writer.writerows(updated_rows) - - print(f"Updated the first column with numbering in {input_csv_path}.") - - -# Example usage: -if __name__ == "__main__": - donor_csv_path = "data/subjects/donor.csv" - update_first_column_with_numbering(donor_csv_path) From 321a9897d88a24f934092446ebea761a37b82b5a Mon Sep 17 00:00:00 2001 From: yehezkr7 Date: Tue, 18 Feb 2025 14:15:01 +0200 Subject: [PATCH 12/12] preparing to pull --- data/{donors/donor.txt => ct_mr_don_10.txt} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename data/{donors/donor.txt => ct_mr_don_10.txt} (100%) diff --git a/data/donors/donor.txt b/data/ct_mr_don_10.txt similarity index 100% rename from data/donors/donor.txt rename to data/ct_mr_don_10.txt