From 43ca52bccee44def9d203246a8223b29a4ec47cc Mon Sep 17 00:00:00 2001
From: yehezkr7
Date: Sun, 2 Feb 2025 15:05:40 +0200
Subject: [PATCH 01/12] Added the option to store results in the 'data' folder.

---
 EM/run_em.py                       | 43 +++++++++++++++++++++++++++++-
 EM/runfile_em_mt.py                |  3 +--
 conf/minimal-em-configuration.json | 11 +++++---
 test_em.py                         | 10 ++++++-
 4 files changed, 59 insertions(+), 8 deletions(-)

diff --git a/EM/run_em.py b/EM/run_em.py
index b13929f..d34e4ee 100644
--- a/EM/run_em.py
+++ b/EM/run_em.py
@@ -8,23 +8,63 @@
 from . import runfile_em_mt
 import numpy as np
 import pathlib
+import gzip
+import json
 
 # import shutil
 
+
+def produce_hpf(conf_file):
+    project_dir = ""
+
+    # Read configuration file and load properties
+    with open(conf_file) as f:
+        conf = json.load(f)
+
+    pops = conf.get("populations")
+    freq_data_dir = project_dir + conf.get("freq_data_dir")
+    output_dir = project_dir + conf.get("graph_files_path")
+    pop_ratio_dir = project_dir + conf.get("pops_count_file")
+
+    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
+
+    list_pop_count = []
+
+    for pop in pops:
+        in_freq_file = freq_data_dir + "/" + pop + ".freqs.gz"
+        with gzip.open(in_freq_file, "rb") as zf:
+            count_pop = 0
+            lines = [x.decode("utf8").strip() for x in zf.readlines()]
+            for hap_line in lines:
+                haplotype, count, freq = hap_line.split(",")
+                if haplotype == "Haplo":
+                    continue
+                if float(freq) == 0.0:
+                    continue
+                count_pop += float(count)
+        list_pop_count.append(count_pop)
+
+    sum_pops = sum(list_pop_count)
+
+    with open(pop_ratio_dir, "w") as pop_ratio_file:
+        for pop, ratio in zip(pops, list_pop_count):
+            pop_ratio_file.write("{},{},{}\n".format(pop, ratio, ratio / sum_pops))
+    print(f"Writing pop_counts_file to: {pop_ratio_dir}")
+
+
 def run_em_def(
     conf_file,
     sr_pop_name="all",
 ):
     project_dir = "../"
-    output_dir = "output/"
     # photos_dir = 'photos/'
     graph_generation_dir = "../imputation/graph_generation/"
 
     # Read configuration file and load properties
     with open(conf_file) as f:
         json_conf = json.load(f)
     graph_files_path = json_conf.get("graph_files_path")
+    output_dir = json_conf.get("output_dir", "output")
     config = {
         "imputation_input_file": json_conf.get("imputation_in_file"),
         "freq_file": json_conf.get("freq_file"),
@@ -145,3 +185,4 @@ def run_em_def(
     os.remove(config["info_nodes"])
 
     file_lo.write("loglikelihood " + str(logL) + "\n")
+    produce_hpf(conf_file)
diff --git a/EM/runfile_em_mt.py b/EM/runfile_em_mt.py
index 7a72327..288c19a 100644
--- a/EM/runfile_em_mt.py
+++ b/EM/runfile_em_mt.py
@@ -36,12 +36,11 @@ def run(
     project_dir = ""
 
     # project_dir = ""# "../"
-    output_dir = "output/"
 
     # Read configuration file and load properties
     with open(config_file) as f:
         json_conf = json.load(f)
-
+    output_dir = json_conf.get("output_dir", "output")
     graph_files_path = json_conf.get("graph_files_path")
     if graph_files_path[-1] != "/":
         graph_files_path += "/"
diff --git a/conf/minimal-em-configuration.json b/conf/minimal-em-configuration.json
index dfdff2d..f9d338d 100644
--- a/conf/minimal-em-configuration.json
+++ b/conf/minimal-em-configuration.json
@@ -13,7 +13,7 @@
     "DQB1": 4,
     "DRB1": 5
   },
-  "graph_files_path": "output/" ,
+  "graph_files_path": "data/" ,
   "node_csv_file": "nodes.csv",
   "edges_csv_file": "edges.csv",
   "info_node_csv_file": "info_node.csv",
@@ -30,7 +30,7 @@
 
   "imputation_out_hap_freq_filename": "don.hap.freqs",
 
-  "freq_file": "output/hpf.csv",
+  "freq_file": "data/hpf.csv",
 
   "priority": {
     "alpha": 0.4999999,
@@ -49,10 +49,13 @@
 
   "init_cutoff": 100,
   "max_iterations": 50,
 
-  "logLikelihood_file": "output/log_likelihood.txt",
"logLikelihood_file": "output/log_likelihood.txt", + "logLikelihood_file": "data/log_likelihood.txt", "memory_min_size": 100000000, "memory_max_size": 600000000, "num_threads": 1, - "run_just_SR_EM": false + "run_just_SR_EM": false, + "freq_data_dir": "data/freqs", + "pops_count_file": "data/pop_counts_file.txt", + "output_dir": "data/" } diff --git a/test_em.py b/test_em.py index f788841..a7ba253 100644 --- a/test_em.py +++ b/test_em.py @@ -22,10 +22,18 @@ # > http://www.opensource.org/licenses/lgpl-license.php # +import numpy as np +import json +# Workaround for NumPy 2.0 removing np.float_ +np.float_ = np.float64 + import os from EM.run_em import run_em_def if __name__ == "__main__": - os.makedirs(f"output", exist_ok=True) conf_file = "conf/minimal-em-configuration.json" + with open(conf_file) as f: + json_conf = json.load(f) + output_dir = json_conf.get("output_dir", "output") + os.makedirs(output_dir, exist_ok=True) run_em_def(conf_file) From d9a1aad9e90ca5b8539945c330024ba38e5de6e4 Mon Sep 17 00:00:00 2001 From: yehezkr7 Date: Sun, 16 Feb 2025 15:57:39 +0200 Subject: [PATCH 02/12] Backup before release as a package --- filter.py | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ index.py | 42 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+) create mode 100644 filter.py create mode 100644 index.py diff --git a/filter.py b/filter.py new file mode 100644 index 0000000..214a180 --- /dev/null +++ b/filter.py @@ -0,0 +1,58 @@ +import csv +import os +import shutil + + +def remove(input_csv_path, strings_to_remove): + # Determine directory and backup file path. + directory = os.path.dirname(input_csv_path) + backup_csv_path = os.path.join(directory, "donor_becup.csv") + + # Create a backup copy of donor.csv + shutil.copy(input_csv_path, backup_csv_path) + print(f"Backup created: {backup_csv_path}") + + # Dictionary to count removals for each substring. + removal_counts = {substr: 0 for substr in strings_to_remove} + + updated_rows = [] + + # Read the original donor.csv + with open(input_csv_path, newline='', encoding='utf-8') as infile: + reader = csv.reader(infile) + for row in reader: + # Process only if the row has at least 2 columns. + if len(row) >= 2: + # Split the second column by '^' + elements = row[1].split('^') + new_elements = [] + for element in elements: + remove_flag = False + # Check if element contains any substring from strings_to_remove. + for substr in strings_to_remove: + if substr in element: + removal_counts[substr] += 1 + remove_flag = True + if not remove_flag: + new_elements.append(element) + # Reassemble the second column. + row[1] = "^".join(new_elements) + # Add (modified or not) the row to our updated_rows list. + updated_rows.append(row) + + # Overwrite donor.csv with the updated rows. + with open(input_csv_path, "w", newline='', encoding='utf-8') as outfile: + writer = csv.writer(outfile) + writer.writerows(updated_rows) + + # Print summary of removals for each substring. 
+    print("\nRemoval summary:")
+    for substr, count in removal_counts.items():
+        print(f"{substr}: removed {count}")
+
+
+# Example usage:
+if __name__ == "__main__":
+    donor_csv_path = "data/subjects/donor.csv"
+    strings_to_remove = ['DRBX', 'DRB3','DRB4','DRB5','DQA1','DPA1','DPB1']
+    remove(donor_csv_path, strings_to_remove)
diff --git a/index.py b/index.py
new file mode 100644
index 0000000..104fbbf
--- /dev/null
+++ b/index.py
@@ -0,0 +1,42 @@
+import csv
+
+
+def update_first_column_with_numbering(input_csv_path):
+    """
+    Reads the CSV file from input_csv_path.
+    For each row, replaces the first column with a sequential number starting at 1.
+    Overwrites the original CSV with the updated rows.
+
+    For example, if the original rows are:
+        hjsdh,...
+        kuayj,...
+        kjaha,...
+        husd,...
+    They will become:
+        1,...
+        2,...
+        3,...
+        4,...
+    """
+    updated_rows = []
+
+    # Read all rows from the original CSV.
+    with open(input_csv_path, newline='', encoding='utf-8') as infile:
+        reader = csv.reader(infile)
+        for index, row in enumerate(reader, start=1):
+            if row:  # Only process non-empty rows
+                row[0] = str(index)
+            updated_rows.append(row)
+
+    # Overwrite the CSV with the updated rows.
+    with open(input_csv_path, "w", newline='', encoding='utf-8') as outfile:
+        writer = csv.writer(outfile)
+        writer.writerows(updated_rows)
+
+    print(f"Updated the first column with numbering in {input_csv_path}.")
+
+
+# Example usage:
+if __name__ == "__main__":
+    donor_csv_path = "data/subjects/donor.csv"
+    update_first_column_with_numbering(donor_csv_path)

From 35e70a6c19aa3767b72aa29dd5f48565e556508f Mon Sep 17 00:00:00 2001
From: yehezkr7
Date: Sun, 16 Feb 2025 15:59:46 +0200
Subject: [PATCH 03/12] Backup before release as a package

---
 EM/run_em.py                                |  23 +-
 EM/runfile_em_mt.py                         | 227 +++++++-------------
 conf/minimal-em-configuration.json          |  25 ++-
 data/{ct_mr_don_10.txt => donors/donor.txt} |   0
 4 files changed, 115 insertions(+), 160 deletions(-)
 rename data/{ct_mr_don_10.txt => donors/donor.txt} (100%)

diff --git a/EM/run_em.py b/EM/run_em.py
index d34e4ee..1ad90c4 100644
--- a/EM/run_em.py
+++ b/EM/run_em.py
@@ -10,9 +10,10 @@
 import pathlib
 import gzip
 import json
+np.float_ = np.float64
 
 # import shutil
-
+import os
 
 def produce_hpf(conf_file):
     project_dir = ""
@@ -64,7 +65,7 @@ def run_em_def(
     with open(conf_file) as f:
         json_conf = json.load(f)
     graph_files_path = json_conf.get("graph_files_path")
-    output_dir = json_conf.get("output_dir", "output")
+    output_dir = json_conf.get("output_dir", "data")
     config = {
         "imputation_input_file": json_conf.get("imputation_in_file"),
         "freq_file": json_conf.get("freq_file"),
@@ -179,10 +180,26 @@ def run_em_def(
         not_converge = False
     print("Log Likelihood: " + str(logL))
 
+    pop_counts_file_path = json_conf.get("pops_count_file")  # Use json_conf, not conf_file
+
+    # Ensure the directory exists
+    pop_counts_dir = os.path.dirname(pop_counts_file_path)
+    os.makedirs(pop_counts_dir, exist_ok=True)
+
+    # Open the file for writing (this will overwrite any existing file).
+    with open(pop_counts_file_path, "w") as f:
+        # Iterate over the populations and the corresponding count values.
+        # The second column is left blank as per your requirement.
+        for pop, norm in zip(json_conf.get("populations"), count_by_prob):
+            raw_count = norm * num_saples
+            f.write("{},{},{}\n".format(pop, raw_count, norm))
+
+    print("Pop counts file generated at: {}".format(pop_counts_file_path))
+
     os.remove(config["node_file"])
     os.remove(config["top_links_file"])
     os.remove(config["edges_file"])
     os.remove(config["info_nodes"])
 
     file_lo.write("loglikelihood " + str(logL) + "\n")
-    produce_hpf(conf_file)
+    #
diff --git a/EM/runfile_em_mt.py b/EM/runfile_em_mt.py
index 288c19a..7a5ea81 100644
--- a/EM/runfile_em_mt.py
+++ b/EM/runfile_em_mt.py
@@ -1,27 +1,19 @@
 import argparse
-
-# import cProfile
 import json
 import pathlib
-
+import glob
 import sys
 import os
-import numpy as np
-
-# import threading
-from multiprocessing import Process
-import string, math
-import copy
+from os.path import exists
 
+import numpy as np
+np.float_ = np.float64
 import multiprocessing
-
-# The default MP method is spawn, use "fork"
 multiprocessing.set_start_method("fork")
-
 from grim import grim
+import math
+import string, copy
 
-
-# Profiler start
 def run(
     plan_b,
     config_file,
@@ -32,22 +24,13 @@ def run(
     num_subjects=None,
     project_dir_graph="",
 ):
     project_dir = ""
-
-    # project_dir = ""# "../"
-
-    # Read configuration file and load properties
     with open(config_file) as f:
         json_conf = json.load(f)
-    output_dir = json_conf.get("output_dir", "output")
+    output_dir = json_conf.get("output_dir", "data")
     graph_files_path = json_conf.get("graph_files_path")
     if graph_files_path[-1] != "/":
         graph_files_path += "/"
-    # output_dir = json_conf.get("imuptation_out_path", "output")
-    # output_dir = "output/"
-    # Read configuration file and load properties
-
     config = {
         "planb": json_conf.get("planb", True),
         "pops": json_conf.get("populations"),
@@ -57,56 +40,31 @@ def run(
         "number_of_pop_results": json_conf.get("number_of_pop_results", 100),
         "output_MUUG": json_conf.get("output_MUUG", False),
         "output_haplotypes": json_conf.get("output_haplotypes", True),
-        "node_file": project_dir_graph
-        + graph_files_path
-        + json_conf.get("node_csv_file"),
-        "top_links_file": project_dir_graph
-        + graph_files_path
-        + json_conf.get("top_links_csv_file"),
-        "edges_file": project_dir_graph
-        + graph_files_path
-        + json_conf.get("edges_csv_file"),
+        "node_file": project_dir_graph + graph_files_path + json_conf.get("node_csv_file"),
+        "top_links_file": project_dir_graph + graph_files_path + json_conf.get("top_links_csv_file"),
+        "edges_file": project_dir_graph + graph_files_path + json_conf.get("edges_csv_file"),
         "imputation_input_file": json_conf.get("imputation_in_file"),
-        "imputation_out_umug_freq_file": output_dir
-        + json_conf.get("imputation_out_umug_freq_filename", "None"),
-        "imputation_out_umug_pops_file": output_dir
-        + json_conf.get("imputation_out_umug_pops_filename", "None"),
-        "imputation_out_hap_freq_file": output_dir
-        + json_conf.get("imputation_out_hap_freq_filename"),
-        "imputation_out_hap_pops_file": output_dir
-        + json_conf.get("imputation_out_hap_pops_filename", "None"),
-        "imputation_out_miss_file": output_dir
-        + json_conf.get("imputation_out_miss_filename", "miss.txt"),
-        "imputation_out_problem_file": output_dir
-        + json_conf.get("imputation_out_problem_filename", "problem.txt"),
+        "imputation_out_umug_freq_file": output_dir + json_conf.get("imputation_out_umug_freq_filename", "None"),
+        "imputation_out_umug_pops_file": output_dir + json_conf.get("imputation_out_umug_pops_filename", "None"),
+        "imputation_out_hap_freq_file": output_dir + json_conf.get("imputation_out_hap_freq_filename"),
json_conf.get("imputation_out_hap_freq_filename"), + "imputation_out_hap_pops_file": output_dir + json_conf.get("imputation_out_hap_pops_filename", "None"), + "imputation_out_miss_file": output_dir + json_conf.get("imputation_out_miss_filename", "miss.txt"), + "imputation_out_problem_file": output_dir + json_conf.get("imputation_out_problem_filename", "problem.txt"), "factor_missing_data": json_conf.get("factor_missing_data", 0.01), - "loci_map": json_conf.get( - "loci_map", {"A": "A", "B": "B", "C": "C", "DQB1": "Q", "DRB1": "R"} - ), - "loci_map": json_conf.get( - "loci_map", {"A": 1, "B": 3, "C": 2, "DQB1": 4, "DRB1": 5} - ), - "matrix_planb": json_conf.get( - "Plan_B_Matrix", - [ + "loci_map": json_conf.get("loci_map", {"A": 1, "B": 3, "C": 2, "DQB1": 4, "DRB1": 5}), + "matrix_planb": json_conf.get("Plan_B_Matrix", [ [[1, 2, 3, 4, 5]], [[1, 2, 3], [4, 5]], [[1], [2, 3], [4, 5]], [[1, 2, 3], [4], [5]], [[1], [2, 3], [4], [5]], [[1], [2], [3], [4], [5]], - ], - ), + ]), "pops_count_file": project_dir + json_conf.get("pops_count_file", ""), "use_pops_count_file": json_conf.get("pops_count_file", False), - "number_of_options_threshold": json_conf.get( - "number_of_options_threshold", 100000 - ), - "max_haplotypes_number_in_phase": json_conf.get( - "max_haplotypes_number_in_phase", 100 - ), - "bin_imputation_input_file": project_dir - + json_conf.get("bin_imputation_in_file", "None"), + "number_of_options_threshold": json_conf.get("number_of_options_threshold", 100000), + "max_haplotypes_number_in_phase": json_conf.get("max_haplotypes_number_in_phase", 100), + "bin_imputation_input_file": project_dir + json_conf.get("bin_imputation_in_file", "None"), "num_thread": json_conf.get("num_threads", 1), "nodes_for_plan_A": json_conf.get("Plan_A_Matrix", []), "save_mode": json_conf.get("save_space_mode", False), @@ -116,17 +74,12 @@ def run( for _, val in config["loci_map"].items(): all_loci_set.add(str(val)) config["full_loci"] = "".join(sorted(all_loci_set)) - config["imputation_out_hap_pops_file"] = ( - config["imputation_out_hap_pops_file"] + str(iteration) + ".txt" - ) + config["imputation_out_hap_pops_file"] = config["imputation_out_hap_pops_file"] + str(iteration) + ".txt" if pop: config["pops"] = pop - # Display the configurations we are using - print( - "****************************************************************************************************" - ) + print("****************************************************************************************************") print("Performing imputation based on:") print("\tPopulation: {}".format(config["pops"])) print("\tPriority: {}".format(config["priority"])) @@ -138,23 +91,11 @@ def run( print("\tTop Links File: {}".format(config["edges_file"])) print("\tInput File: {}".format(config["imputation_input_file"])) print("\tOutput UMUG Format: {}".format(config["output_MUUG"])) - print( - "\tOutput UMUG Freq Filename: {}".format( - config["imputation_out_umug_freq_file"] - ) - ) - print( - "\tOutput UMUG Pops Filename: {}".format( - config["imputation_out_umug_pops_file"] - ) - ) + print("\tOutput UMUG Freq Filename: {}".format(config["imputation_out_umug_freq_file"])) + print("\tOutput UMUG Pops Filename: {}".format(config["imputation_out_umug_pops_file"])) print("\tOutput Haplotype Format: {}".format(config["output_haplotypes"])) - print( - "\tOutput HAP Freq Filename: {}".format(config["imputation_out_hap_freq_file"]) - ) - print( - "\tOutput HAP Pops Filename: {}".format(config["imputation_out_hap_pops_file"]) - ) + print("\tOutput HAP Freq 
+    print("\tOutput HAP Pops Filename: {}".format(config["imputation_out_hap_pops_file"]))
     print("\tOutput Miss Filename: {}".format(config["imputation_out_miss_file"]))
     print("\tOutput Problem Filename: {}".format(config["imputation_out_problem_file"]))
     print("\tFactor Missing Data: {}".format(config["factor_missing_data"]))
     print("\tLoci Map: {}".format(config["loci_map"]))
     print("\tPlan B Matrix: {}".format(config["matrix_planb"]))
     print("\tPops Count File: {}".format(config["pops_count_file"]))
     print("\tUse Pops Count File: {}".format(config["use_pops_count_file"]))
-    print(
-        "\tNumber of Options Threshold: {}".format(
-            config["number_of_options_threshold"]
-        )
-    )
-    print(
-        "\tMax Number of haplotypes in phase: {}".format(
-            config["max_haplotypes_number_in_phase"]
-        )
-    )
+    print("\tNumber of Options Threshold: {}".format(config["number_of_options_threshold"]))
+    print("\tMax Number of haplotypes in phase: {}".format(config["max_haplotypes_number_in_phase"]))
     if config["nodes_for_plan_A"]:
         print("\tNodes in plan A: {}".format(config["nodes_for_plan_A"]))
         print("\tSave space mode: {}".format(config["save_mode"]))
-    print(
-        "****************************************************************************************************"
-    )
+    print("****************************************************************************************************")
 
-    # Perform imputation
+    # Create graph instance
     graph = grim.graph_instance(config)
 
     imputation_list = []
-
-    # Create output directory if it doesn't exist
     pathlib.Path(output_dir).mkdir(parents=False, exist_ok=True)
 
     input_file = config["imputation_input_file"]
     in_dir = os.path.dirname(input_file)
     if in_dir == "":
         in_dir = "."
-    # if project_dir_in_file != "":
-    #     in_dir = "splited_data_for_em"
-    #     pathlib.Path(in_dir).mkdir(parents=False, exist_ok=True)
 
     in_file_basename = os.path.basename(input_file)
     if not num_subjects:
@@ -199,66 +129,72 @@ def run(
     num_subjects = int(num_subjects.strip().split(" ")[0]) + 1
     print(num_subjects)
 
-    output_dir = config.get("graph_files_path", "output")
+    # Create output directory for split files
+    output_ct = "output_ct"
+    os.makedirs(output_ct, exist_ok=True)
+    # Use output_ct directory directly rather than combining with in_dir
+    split_prefix = os.path.join(output_ct, in_file_basename[0:2])
     split_cmd = (
         "split -l "
         + str(int(math.ceil(num_subjects / config["num_thread"])))
         + " "
        + input_file
         + " "
-        + in_dir
-        + "/"
-        + f"../{output_dir}/{in_file_basename[0:2]}"
+        + split_prefix
     )
     print(f"Split Command: {split_cmd}")
     os.system(split_cmd)
 
     alpha = string.ascii_lowercase
-    therads_list = list()
+    threads_list = []
     config_list = []
     for i in range(config["num_thread"]):
         imputation = grim.impute_instance(config, graph, count_by_prob=count_by_prob)
         imputation_list.append(imputation)
-        in_file = (
-            in_dir
-            + "/"
-            + f"../{output_dir}/{in_file_basename[0:2]}"
-            + alpha[int(i / 26)]
-            + alpha[int(i % 26)]
-        )
+        # Construct the split file name using the split_prefix
+        in_file = split_prefix + alpha[int(i / 26)] + alpha[int(i % 26)]
         print(in_file)
-        output_file = output_dir + "/" + os.path.basename(in_file) + "_out"
+        output_file = os.path.join(output_ct, os.path.basename(in_file) + "_out")
         config_list.append(copy.deepcopy(config))
         config_list[i]["imputation_input_file"] = in_file
         config_list[i]["imputation_out_hap_freq_file"] = output_file
 
     for i in range(config["num_thread"]):
-        t = Process(
+        t = multiprocessing.Process(
             target=imputation_list[i].impute_file,
-            args=(
-                config_list[i],
-                plan_b,
-                em_mr,
-                True,
-            ),
+            args=(config_list[i], plan_b, em_mr, True),
         )
-        therads_list.append(t)
-    for i in range(config["num_thread"]):
-        therads_list[i].start()
-    for i in range(config["num_thread"]):
-        therads_list[i].join()
-    for i in range(config["num_thread"]):
-        therads_list[i].terminate()
-
-    f_out = open(config["imputation_out_hap_freq_file"], "w")
-    for i in range(config["num_thread"]):
-
-        with open(config_list[i]["imputation_out_hap_freq_file"]) as f_t_out:
-            for line in f_t_out:
-                f_out.write(line)
-            f_t_out.close()
-        os.remove(config_list[i]["imputation_out_hap_freq_file"])
-
-    # Profiler end
-    # pr.disable()
-    # pr.print_stats(sort="time")
+        threads_list.append(t)
+    for t in threads_list:
+        t.start()
+    for t in threads_list:
+        t.join()
+    for t in threads_list:
+        t.terminate()
+
+    with open(config["imputation_out_hap_freq_file"], "w") as f_out:
+        for i in range(config["num_thread"]):
+            with open(config_list[i]["imputation_out_hap_freq_file"]) as f_t_out:
+                for line in f_t_out:
+                    f_out.write(line)
+                f_t_out.close()
+            os.remove(config_list[i]["imputation_out_hap_freq_file"])
+
+    print("Starting cleanup of temporary split files.")
+    temp_file_prefix = in_file_basename[0:2]
+    temp_files_pattern = os.path.join(output_ct, temp_file_prefix + "*")
+    temp_files = glob.glob(temp_files_pattern)
+    if temp_files:
+        for temp_file in temp_files:
+            print(f"Removing temporary file: {temp_file}")
+            try:
+                os.remove(temp_file)
+            except Exception as e:
+                print(f"Error removing file {temp_file}: {e}")
+        print("All temporary files removed.")
+    else:
+        print("No temporary files found with pattern:", temp_files_pattern)
+    try:
+        os.rmdir(output_ct)
+        print(f"Temporary directory '{output_ct}' removed.")
+    except Exception as e:
+        print(f"Error removing directory '{output_ct}': {e}")
diff --git a/conf/minimal-em-configuration.json b/conf/minimal-em-configuration.json
index f9d338d..2895865 100644
--- a/conf/minimal-em-configuration.json
+++ b/conf/minimal-em-configuration.json
@@ -1,7 +1,8 @@
 {
   "populations": [
-    "AAFA",
-    "CAU"
+    "CAU",
+    "AAFA"
+
   ],
 
   "FULL_LOCI": "ABCQR",
@@ -13,12 +14,13 @@
     "DQB1": 4,
     "DRB1": 5
   },
-  "graph_files_path": "data/" ,
-  "node_csv_file": "nodes.csv",
+  "graph_files_path": "data/" ,
+  "node_csv_file": "nodes.csv",
   "edges_csv_file": "edges.csv",
   "info_node_csv_file": "info_node.csv",
   "top_links_csv_file": "top_links.csv",
-  "imputation_in_file": "data/ct_mr_don_10.txt",
+  "imputation_in_file": "data/donors/donor.txt",
+  "ct_files": "data",
   "Plan_B_Matrix": [
     [[1, 2, 3, 4, 5]],
     [[1, 2, 3], [4, 5]],
@@ -30,9 +32,9 @@
 
   "imputation_out_hap_freq_filename": "don.hap.freqs",
 
-  "freq_file": "data/hpf.csv",
+  "freq_file": "output/hpf.csv",
 
-  "priority": {
+  "priority": {
     "alpha": 0.4999999,
     "eta": 0,
     "beta": 1e-7,
@@ -49,10 +51,16 @@
 
   "init_cutoff": 100,
   "max_iterations": 50,
-  "logLikelihood_file": "data/log_likelihood.txt",
+  "logLikelihood_file": "data/log_likelihood.txt",
   "memory_min_size": 100000000,
   "memory_max_size": 600000000,
   "num_threads": 1,
-  "run_just_SR_EM": false,
-  "freq_data_dir": "data/freqs",
-  "pops_count_file": "data/pop_counts_file.txt",
-  "output_dir": "data/"
+  "run_just_SR_EM": false,
+  "freq_data_dir": "data/freqs",
+  "pops_count_file": "output/pop_counts_file.txt",
+  "imputation_out_umug_freq_filename": "umug_freq",
+  "imputation_out_umug_pops_filename": "umug_pops",
+  "imputation_out_hap_pops_filename": "hap_pops",
+  "output_dir": "data/",
+  "output_ct": "output_ct"
 }
diff --git a/data/ct_mr_don_10.txt b/data/donors/donor.txt
similarity index 100%
rename from data/ct_mr_don_10.txt
rename to data/donors/donor.txt

From 8a1c32da61daee45fb909bcc2458d37578fe485b Mon Sep 17 00:00:00 2001
From: yehezkr7
Date: Sun, 16 Feb 2025 17:01:53 +0200
Subject: [PATCH 04/12] updated requirements

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 4112647..4050952 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,2 @@
 toml==0.10.2
-py-graph-imputation==0.0.12
+py-graph-imputation>=0.0.12

From 89b7529fdbb397e13afab411f05d2f963acb74cd Mon Sep 17 00:00:00 2001
From: yehezkr7
Date: Sun, 16 Feb 2025 19:28:23 +0200
Subject: [PATCH 05/12] hopefully now we could import and use test_em when this repo is pip installed in other repos

---
 MANIFEST.in | 17 +++++++++++++
 setup.py    | 72 ++++++++++++++++++++++-------------------------------
 2 files changed, 47 insertions(+), 42 deletions(-)
 create mode 100644 MANIFEST.in

diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..dbf0cda
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,17 @@
+include LICENSE
+include README.md
+include HISTORY.rst
+include requirements.txt
+include requirements-tests.txt
+
+# Include test_em.py explicitly
+include test_em.py
+
+# Include all files from the EM package and its subdirectories
+recursive-include EM *
+
+# Ensure all non-Python files (e.g., data files) inside `EM` are included
+recursive-include EM *.csv *.txt *.json *.yaml *.yml *.cfg
+
+# Include additional configuration files if applicable
+include setup.cfg
diff --git a/setup.py b/setup.py
index fb2202a..496bde9 100644
--- a/setup.py
+++ b/setup.py
@@ -1,50 +1,35 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-#
-# py-graph-em Graph EM
-# Copyright (c) 2021 Be The Match operated by National Marrow Donor Program. All Rights Reserved.
-#
-# This library is free software; you can redistribute it and/or modify it
-# under the terms of the GNU Lesser General Public License as published
-# by the Free Software Foundation; either version 3 of the License, or (at
-# your option) any later version.
-#
-# This library is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; with out even the implied warranty of MERCHANTABILITY or
-# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
-# License for more details.
-#
-# You should have received a copy of the GNU Lesser General Public License
-# along with this library; if not, write to the Free Software Foundation,
-# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
-#
-# > http://www.fsf.org/licensing/licenses/lgpl.html
-# > http://www.opensource.org/licenses/lgpl-license.php
-#
-
-
-"""The setup script."""
-
+import os
 from setuptools import setup, find_packages
 
-with open("README.md") as readme_file:
-    readme = readme_file.read()
+# Get the absolute path to this directory.
+here = os.path.abspath(os.path.dirname(__file__))
 
-with open("HISTORY.rst") as history_file:
-    history = history_file.read()
+# Read the contents of README.md for the long description.
+with open(os.path.join(here, "README.md"), encoding="utf-8") as f:
+    readme = f.read()
 
-with open("requirements.txt") as requirements_file:
-    requirements = requirements_file.read().split("\n")
+# Try to read HISTORY.rst; if it doesn't exist, use an empty string.
+try:
+    with open(os.path.join(here, "HISTORY.rst"), encoding="utf-8") as f:
+        history = f.read()
+except FileNotFoundError:
+    history = ""
 
-with open("requirements-tests.txt") as requirements_file:
-    test_requirements = requirements_file.read().split("\n")
+# Read the requirements and test requirements files.
+with open(os.path.join(here, "requirements.txt"), encoding="utf-8") as f:
+    requirements = f.read().splitlines()
+
+with open(os.path.join(here, "requirements-tests.txt"), encoding="utf-8") as f:
+    test_requirements = f.read().splitlines()
 
 setup(
-    name="py-graph-em",
-    version="0.0.3",
-    author="Pradeep Bashyal",
-    author_email="pbashyal@nmdp.org",
+    name="py-graph-EM",
+    version="0.0.1",
+    author="Regev Yehezkel Imra",
+    author_email="regevel2006@gmail.com",
     python_requires=">=3.8",
     classifiers=[
         "Development Status :: 2 - Pre-Alpha",
@@ -56,15 +41,18 @@
         "Programming Language :: Python :: 3.10",
     ],
     description="Graph Based EM",
-    install_requires=requirements,
-    license="LGPL 3.0",
     long_description=readme + "\n\n" + history,
     long_description_content_type="text/markdown",
-    include_package_data=True,
+    install_requires=requirements,
+    license="LGPL 3.0",
     keywords="Graph, EM",
-    packages=find_packages(include=["EM"]),
+    # Explicitly include the 'EM' package and any subpackages.
+    packages=find_packages(include=["EM", "EM.*"]),
+    include_package_data=True,
+    # Include test_em.py as a script so it gets installed in the environment’s bin/ folder.
+    scripts=["test_em.py"],
     test_suite="tests",
     tests_require=test_requirements,
-    url="https://github.com/nmdp-bioinformatics/py-graph-em",
+    url="https://github.com/Regev32/py-graph-em",
     zip_safe=False,
 )

From 2afefc342c4f774fec937c82f05199e148e1eee9 Mon Sep 17 00:00:00 2001
From: yehezkr7
Date: Sun, 16 Feb 2025 19:42:43 +0200
Subject: [PATCH 06/12] hopefully now we could import and use test_em when this repo is pip installed in other repos

---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index 496bde9..e3316c6 100644
--- a/setup.py
+++ b/setup.py
@@ -48,6 +48,7 @@
     keywords="Graph, EM",
     # Explicitly include the 'EM' package and any subpackages.
     packages=find_packages(include=["EM", "EM.*"]),
+    py_modules=["test_em"],
     include_package_data=True,
     # Include test_em.py as a script so it gets installed in the environment’s bin/ folder.
     scripts=["test_em.py"],

From 25a0ca4745fe5b1b9ee11c9a3428aae9f0851fa3 Mon Sep 17 00:00:00 2001
From: yehezkr7
Date: Sun, 16 Feb 2025 19:53:14 +0200
Subject: [PATCH 07/12] Fix setup.py to include test_em.py as a module

---
 setup.py | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/setup.py b/setup.py
index e3316c6..13ba1a0 100644
--- a/setup.py
+++ b/setup.py
@@ -4,21 +4,17 @@
 import os
 from setuptools import setup, find_packages
 
-# Get the absolute path to this directory.
 here = os.path.abspath(os.path.dirname(__file__))
 
-# Read the contents of README.md for the long description.
 with open(os.path.join(here, "README.md"), encoding="utf-8") as f:
     readme = f.read()
 
-# Try to read HISTORY.rst; if it doesn't exist, use an empty string.
 try:
     with open(os.path.join(here, "HISTORY.rst"), encoding="utf-8") as f:
         history = f.read()
 except FileNotFoundError:
     history = ""
 
-# Read the requirements and test requirements files.
 with open(os.path.join(here, "requirements.txt"), encoding="utf-8") as f:
     requirements = f.read().splitlines()
 
@@ -46,12 +42,11 @@
     install_requires=requirements,
     license="LGPL 3.0",
     keywords="Graph, EM",
-    # Explicitly include the 'EM' package and any subpackages.
+    # Explicitly include the 'EM' package and its subpackages.
     packages=find_packages(include=["EM", "EM.*"]),
-    py_modules=["test_em"],
     include_package_data=True,
-    # Include test_em.py as a script so it gets installed in the environment’s bin/ folder.
-    scripts=["test_em.py"],
+    # Remove the scripts argument and include test_em.py as a module:
+    py_modules=["test_em"],
     test_suite="tests",
     tests_require=test_requirements,
     url="https://github.com/Regev32/py-graph-em",

From fc614eba3cbd4acd397c2127eb04ce8fb12bacc8 Mon Sep 17 00:00:00 2001
From: yehezkr7
Date: Mon, 17 Feb 2025 12:11:29 +0200
Subject: [PATCH 08/12] Fix setup.py to include test_em.py as a module

---
 HISTORY.rst |  2 +-
 MANIFEST.in |  8 ++++----
 __init__.py |  0
 setup.py    |  5 ++---
 4 files changed, 7 insertions(+), 8 deletions(-)
 create mode 100644 __init__.py

diff --git a/HISTORY.rst b/HISTORY.rst
index 93c23ae..5fc601f 100644
--- a/HISTORY.rst
+++ b/HISTORY.rst
@@ -2,7 +2,7 @@
 History
 =======
 
-0.0.1 (2021-08-25)
+0.0.1 (2025-02-16)
 ------------------
 
 * First release on PyPI.
diff --git a/MANIFEST.in b/MANIFEST.in
index dbf0cda..429228c 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -4,14 +4,14 @@
 include HISTORY.rst
 include requirements.txt
 include requirements-tests.txt
 
-# Include test_em.py explicitly
+# Include test_em.py so that it's part of the source distribution.
 include test_em.py
 
-# Include all files from the EM package and its subdirectories
+# Include all files from the EM package and its subdirectories.
 recursive-include EM *
 
-# Ensure all non-Python files (e.g., data files) inside `EM` are included
+# Optionally, include specific non-Python file types from EM:
 recursive-include EM *.csv *.txt *.json *.yaml *.yml *.cfg
 
-# Include additional configuration files if applicable
+# Include additional configuration files if needed.
 include setup.cfg
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/setup.py b/setup.py
index 13ba1a0..2bd158f 100644
--- a/setup.py
+++ b/setup.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
 import os
@@ -42,10 +41,10 @@
     install_requires=requirements,
     license="LGPL 3.0",
     keywords="Graph, EM",
-    # Explicitly include the 'EM' package and its subpackages.
+    # Include the main package and its subpackages.
     packages=find_packages(include=["EM", "EM.*"]),
     include_package_data=True,
-    # Remove the scripts argument and include test_em.py as a module:
+    # Install test_em.py as an importable module.
     py_modules=["test_em"],
     test_suite="tests",
     tests_require=test_requirements,

From 5c2f693e9a378cfa391dcf803a382cec12e1a575 Mon Sep 17 00:00:00 2001
From: yehezkr7
Date: Tue, 18 Feb 2025 13:58:14 +0200
Subject: [PATCH 09/12] restored everything in order to pull

---
 MANIFEST.in | 17 -------------
 setup.py    | 69 +++++++++++++++++++++++++++++++++--------------------
 2 files changed, 43 insertions(+), 43 deletions(-)
 delete mode 100644 MANIFEST.in

diff --git a/MANIFEST.in b/MANIFEST.in
deleted file mode 100644
index 429228c..0000000
--- a/MANIFEST.in
+++ /dev/null
@@ -1,17 +0,0 @@
-include LICENSE
-include README.md
-include HISTORY.rst
-include requirements.txt
-include requirements-tests.txt
-
-# Include test_em.py so that it's part of the source distribution.
-include test_em.py
-
-# Include all files from the EM package and its subdirectories.
-recursive-include EM *
-
-# Optionally, include specific non-Python file types from EM:
-recursive-include EM *.csv *.txt *.json *.yaml *.yml *.cfg
-
-# Include additional configuration files if needed.
-include setup.cfg
diff --git a/setup.py b/setup.py
index 2bd158f..fb2202a 100644
--- a/setup.py
+++ b/setup.py
@@ -1,30 +1,50 @@
+#!/usr/bin/env python
 # -*- coding: utf-8 -*-
+#
+# py-graph-em Graph EM
+# Copyright (c) 2021 Be The Match operated by National Marrow Donor Program. All Rights Reserved.
+#
+# This library is free software; you can redistribute it and/or modify it
+# under the terms of the GNU Lesser General Public License as published
+# by the Free Software Foundation; either version 3 of the License, or (at
+# your option) any later version.
+#
+# This library is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; with out even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this library; if not, write to the Free Software Foundation,
+# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+#
+# > http://www.fsf.org/licensing/licenses/lgpl.html
+# > http://www.opensource.org/licenses/lgpl-license.php
+#
 
-import os
-from setuptools import setup, find_packages
 
-here = os.path.abspath(os.path.dirname(__file__))
+"""The setup script."""
 
-with open(os.path.join(here, "README.md"), encoding="utf-8") as f:
-    readme = f.read()
+from setuptools import setup, find_packages
 
-try:
-    with open(os.path.join(here, "HISTORY.rst"), encoding="utf-8") as f:
-        history = f.read()
-except FileNotFoundError:
-    history = ""
+with open("README.md") as readme_file:
+    readme = readme_file.read()
 
-with open(os.path.join(here, "requirements.txt"), encoding="utf-8") as f:
-    requirements = f.read().splitlines()
+with open("HISTORY.rst") as history_file:
+    history = history_file.read()
 
-with open(os.path.join(here, "requirements-tests.txt"), encoding="utf-8") as f:
-    test_requirements = f.read().splitlines()
+with open("requirements.txt") as requirements_file:
+    requirements = requirements_file.read().split("\n")
+
+with open("requirements-tests.txt") as requirements_file:
+    test_requirements = requirements_file.read().split("\n")
 
 setup(
-    name="py-graph-EM",
-    version="0.0.1",
-    author="Regev Yehezkel Imra",
-    author_email="regevel2006@gmail.com",
+    name="py-graph-em",
+    version="0.0.3",
+    author="Pradeep Bashyal",
+    author_email="pbashyal@nmdp.org",
     python_requires=">=3.8",
     classifiers=[
         "Development Status :: 2 - Pre-Alpha",
@@ -41,18 +56,15 @@
         "Programming Language :: Python :: 3.10",
     ],
     description="Graph Based EM",
-    long_description=readme + "\n\n" + history,
-    long_description_content_type="text/markdown",
-    install_requires=requirements,
-    license="LGPL 3.0",
+    install_requires=requirements,
+    license="LGPL 3.0",
+    long_description=readme + "\n\n" + history,
+    long_description_content_type="text/markdown",
     include_package_data=True,
     keywords="Graph, EM",
-    # Include the main package and its subpackages.
-    packages=find_packages(include=["EM", "EM.*"]),
+    packages=find_packages(include=["EM"]),
-    # Install test_em.py as an importable module.
-    py_modules=["test_em"],
     test_suite="tests",
     tests_require=test_requirements,
-    url="https://github.com/Regev32/py-graph-em",
+    url="https://github.com/nmdp-bioinformatics/py-graph-em",
     zip_safe=False,
 )

From 646bed8ed8fd89c75abe1e986b1243b66abfb89a Mon Sep 17 00:00:00 2001
From: yehezkr7
Date: Tue, 18 Feb 2025 14:05:16 +0200
Subject: [PATCH 10/12] deleted "produce_hpf"

---
 EM/run_em.py | 37 -------------------------------------
 1 file changed, 37 deletions(-)

diff --git a/EM/run_em.py b/EM/run_em.py
index 1ad90c4..39e2f7f 100644
--- a/EM/run_em.py
+++ b/EM/run_em.py
@@ -15,43 +15,6 @@
 # import shutil
 import os
 
-def produce_hpf(conf_file):
-    project_dir = ""
-
-    # Read configuration file and load properties
-    with open(conf_file) as f:
-        conf = json.load(f)
-
-    pops = conf.get("populations")
-    freq_data_dir = project_dir + conf.get("freq_data_dir")
-    output_dir = project_dir + conf.get("graph_files_path")
-    pop_ratio_dir = project_dir + conf.get("pops_count_file")
-
-    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
-
-    list_pop_count = []
-
-    for pop in pops:
-        in_freq_file = freq_data_dir + "/" + pop + ".freqs.gz"
-        with gzip.open(in_freq_file, "rb") as zf:
-            count_pop = 0
-            lines = [x.decode("utf8").strip() for x in zf.readlines()]
-            for hap_line in lines:
-                haplotype, count, freq = hap_line.split(",")
-                if haplotype == "Haplo":
-                    continue
-                if float(freq) == 0.0:
-                    continue
-                count_pop += float(count)
-        list_pop_count.append(count_pop)
-
-    sum_pops = sum(list_pop_count)
-
-    with open(pop_ratio_dir, "w") as pop_ratio_file:
-        for pop, ratio in zip(pops, list_pop_count):
-            pop_ratio_file.write("{},{},{}\n".format(pop, ratio, ratio / sum_pops))
-    print(f"Writing pop_counts_file to: {pop_ratio_dir}")
-
 
 def run_em_def(
     conf_file,

From c1aa7b903893dd7bd94e4c4186e0918601a93a4c Mon Sep 17 00:00:00 2001
From: yehezkr7
Date: Tue, 18 Feb 2025 14:10:36 +0200
Subject: [PATCH 11/12] preparing to pull

---
 HISTORY.rst                        |  2 +-
 conf/minimal-em-configuration.json | 26 +++++---------
 filter.py                          | 58 ------------------------------
 index.py                           | 42 ----------------------
 4 files changed, 10 insertions(+), 118 deletions(-)
 delete mode 100644 filter.py
 delete mode 100644 index.py

diff --git a/HISTORY.rst b/HISTORY.rst
index 5fc601f..93c23ae 100644
--- a/HISTORY.rst
+++ b/HISTORY.rst
@@ -2,7 +2,7 @@
 History
 =======
 
-0.0.1 (2025-02-16)
+0.0.1 (2021-08-25)
 ------------------
 
 * First release on PyPI.
diff --git a/conf/minimal-em-configuration.json b/conf/minimal-em-configuration.json
index 2895865..dfdff2d 100644
--- a/conf/minimal-em-configuration.json
+++ b/conf/minimal-em-configuration.json
@@ -1,8 +1,7 @@
 {
   "populations": [
-    "CAU",
-    "AAFA"
-
+    "AAFA",
+    "CAU"
   ],
 
   "FULL_LOCI": "ABCQR",
@@ -14,13 +13,12 @@
     "DQB1": 4,
     "DRB1": 5
   },
-  "graph_files_path": "data/" ,
-  "node_csv_file": "nodes.csv",
+  "graph_files_path": "output/" ,
+  "node_csv_file": "nodes.csv",
   "edges_csv_file": "edges.csv",
   "info_node_csv_file": "info_node.csv",
   "top_links_csv_file": "top_links.csv",
-  "imputation_in_file": "data/donors/donor.txt",
-  "ct_files": "data",
+  "imputation_in_file": "data/ct_mr_don_10.txt",
   "Plan_B_Matrix": [
     [[1, 2, 3, 4, 5]],
     [[1, 2, 3], [4, 5]],
@@ -32,9 +30,9 @@
 
   "imputation_out_hap_freq_filename": "don.hap.freqs",
 
   "freq_file": "output/hpf.csv",
 
-  "priority": {
+  "priority": {
     "alpha": 0.4999999,
     "eta": 0,
     "beta": 1e-7,
@@ -51,16 +49,10 @@
 
   "init_cutoff": 100,
   "max_iterations": 50,
-  "logLikelihood_file": "data/log_likelihood.txt",
+  "logLikelihood_file": "output/log_likelihood.txt",
   "memory_min_size": 100000000,
   "memory_max_size": 600000000,
   "num_threads": 1,
-  "run_just_SR_EM": false,
-  "freq_data_dir": "data/freqs",
-  "pops_count_file": "output/pop_counts_file.txt",
-  "imputation_out_umug_freq_filename": "umug_freq",
-  "imputation_out_umug_pops_filename": "umug_pops",
-  "imputation_out_hap_pops_filename": "hap_pops",
-  "output_dir": "data/",
-  "output_ct": "output_ct"
+  "run_just_SR_EM": false
+
 }
diff --git a/filter.py b/filter.py
deleted file mode 100644
index 214a180..0000000
--- a/filter.py
+++ /dev/null
@@ -1,58 +0,0 @@
-import csv
-import os
-import shutil
-
-
-def remove(input_csv_path, strings_to_remove):
-    # Determine directory and backup file path.
-    directory = os.path.dirname(input_csv_path)
-    backup_csv_path = os.path.join(directory, "donor_becup.csv")
-
-    # Create a backup copy of donor.csv
-    shutil.copy(input_csv_path, backup_csv_path)
-    print(f"Backup created: {backup_csv_path}")
-
-    # Dictionary to count removals for each substring.
-    removal_counts = {substr: 0 for substr in strings_to_remove}
-
-    updated_rows = []
-
-    # Read the original donor.csv
-    with open(input_csv_path, newline='', encoding='utf-8') as infile:
-        reader = csv.reader(infile)
-        for row in reader:
-            # Process only if the row has at least 2 columns.
-            if len(row) >= 2:
-                # Split the second column by '^'
-                elements = row[1].split('^')
-                new_elements = []
-                for element in elements:
-                    remove_flag = False
-                    # Check if element contains any substring from strings_to_remove.
-                    for substr in strings_to_remove:
-                        if substr in element:
-                            removal_counts[substr] += 1
-                            remove_flag = True
-                    if not remove_flag:
-                        new_elements.append(element)
-                # Reassemble the second column.
-                row[1] = "^".join(new_elements)
-            # Add (modified or not) the row to our updated_rows list.
-            updated_rows.append(row)
-
-    # Overwrite donor.csv with the updated rows.
-    with open(input_csv_path, "w", newline='', encoding='utf-8') as outfile:
-        writer = csv.writer(outfile)
-        writer.writerows(updated_rows)
-
-    # Print summary of removals for each substring.
- print("\nRemoval summary:") - for substr, count in removal_counts.items(): - print(f"{substr}: removed {count}") - - -# Example usage: -if __name__ == "__main__": - donor_csv_path = "data/subjects/donor.csv" - strings_to_remove = ['DRBX', 'DRB3','DRB4','DRB5','DQA1','DPA1','DPB1'] - remove(donor_csv_path, strings_to_remove) diff --git a/index.py b/index.py deleted file mode 100644 index 104fbbf..0000000 --- a/index.py +++ /dev/null @@ -1,42 +0,0 @@ -import csv - - -def update_first_column_with_numbering(input_csv_path): - """ - Reads the CSV file from input_csv_path. - For each row, replaces the first column with a sequential number starting at 1. - Overwrites the original CSV with the updated rows. - - For example, if the original rows are: - hjsdh,... - kuayj,... - kjaha,... - husd,... - They will become: - 1,... - 2,... - 3,... - 4,... - """ - updated_rows = [] - - # Read all rows from the original CSV. - with open(input_csv_path, newline='', encoding='utf-8') as infile: - reader = csv.reader(infile) - for index, row in enumerate(reader, start=1): - if row: # Only process non-empty rows - row[0] = str(index) - updated_rows.append(row) - - # Overwrite the CSV with the updated rows. - with open(input_csv_path, "w", newline='', encoding='utf-8') as outfile: - writer = csv.writer(outfile) - writer.writerows(updated_rows) - - print(f"Updated the first column with numbering in {input_csv_path}.") - - -# Example usage: -if __name__ == "__main__": - donor_csv_path = "data/subjects/donor.csv" - update_first_column_with_numbering(donor_csv_path) From 321a9897d88a24f934092446ebea761a37b82b5a Mon Sep 17 00:00:00 2001 From: yehezkr7 Date: Tue, 18 Feb 2025 14:15:01 +0200 Subject: [PATCH 12/12] preparing to pull --- data/{donors/donor.txt => ct_mr_don_10.txt} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename data/{donors/donor.txt => ct_mr_don_10.txt} (100%) diff --git a/data/donors/donor.txt b/data/ct_mr_don_10.txt similarity index 100% rename from data/donors/donor.txt rename to data/ct_mr_don_10.txt