From 3c342d63066402f15fafa95544ac234ecf5c7ae4 Mon Sep 17 00:00:00 2001 From: Shiyi Yin Date: Wed, 21 May 2025 11:24:18 -0700 Subject: [PATCH 01/11] add conda environment file --- environment.yaml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 environment.yaml diff --git a/environment.yaml b/environment.yaml new file mode 100644 index 0000000..4e3f73e --- /dev/null +++ b/environment.yaml @@ -0,0 +1,14 @@ +name: monopogen +channels: + - bioconda + - conda-forge +dependencies: + - python=3.10 + - pysam + - samtools + - bcftools + - tabix + - vcftools + - pandas + - numpy + - gzip From 47a611a1339589345b53101ee3090d7ff8d75461 Mon Sep 17 00:00:00 2001 From: Shiyi Yin Date: Wed, 21 May 2025 11:25:04 -0700 Subject: [PATCH 02/11] improve test script portability --- test/runPreprocess.sh | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/test/runPreprocess.sh b/test/runPreprocess.sh index ec4e408..9bbcf30 100644 --- a/test/runPreprocess.sh +++ b/test/runPreprocess.sh @@ -1,9 +1,4 @@ -path="/rsrch3/scratch/bcb/jdou1/scAncestry/Monopogen" -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${path}/apps +path="$(cd "$(dirname "${BASH_SOURCE[0]}")/../" && pwd)" +# export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${path}/apps - -python ${path}/src/Monopogen.py preProcess -b bam.lst -o out -a ${path}/apps -t 8 - - - - +python ${path}/src/Monopogen.py preProcess -b ${path}/test/bam.lst -o out -a ${path}/apps -t 8 From c92a2e8ecbfe1231e2e3c661da24c3f32ae4dec3 Mon Sep 17 00:00:00 2001 From: Shiyi Yin Date: Wed, 21 May 2025 11:25:27 -0700 Subject: [PATCH 03/11] remove hard coded path --- test/bam.lst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/bam.lst b/test/bam.lst index f63da1c..2bf3ace 100644 --- a/test/bam.lst +++ b/test/bam.lst @@ -1,2 +1,2 @@ -ID1,/rsrch3/scratch/bcb/jdou1/scAncestry/Monopogen/example/A.bam -ID2,/rsrch3/scratch/bcb/jdou1/scAncestry/Monopogen/example/B.bam +ID1,example/A.bam +ID2,example/B.bam From 1bbafffd50844ecc9066aa701c3e2006ea6fe2d7 Mon Sep 17 00:00:00 2001 From: Shiyi Yin Date: Thu, 22 May 2025 14:43:11 -0700 Subject: [PATCH 04/11] add error reporting, remove binary link --- src/Monopogen.py | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/src/Monopogen.py b/src/Monopogen.py index 3ad8e7b..085896f 100644 --- a/src/Monopogen.py +++ b/src/Monopogen.py @@ -24,25 +24,15 @@ from multiprocessing import Pool -LIB_PATH = os.path.abspath( - os.path.join(os.path.dirname(os.path.realpath(__file__)), "pipelines/lib")) - -if LIB_PATH not in sys.path: - sys.path.insert(0, LIB_PATH) - PIPELINE_BASEDIR = os.path.dirname(os.path.realpath(sys.argv[0])) CFG_DIR = os.path.join(PIPELINE_BASEDIR, "cfg") -#import pipelines -#from pipelines import get_cluster_cfgfile -#from pipelines import PipelineHandler - # global logger logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) handler = logging.StreamHandler() handler.setFormatter(logging.Formatter( - '[{asctime}] {levelname:8s} {filename} {message}', style='{')) + '[{asctime}] {levelname:8s} {filename} {message}', style='{')) logger.addHandler(handler) @@ -124,12 +114,18 @@ def germline(args): if not args.norun == "TRUE": with Pool(processes=args.nthreads) as pool: - print(joblst) + logger.info("Starting germline jobs...") result = pool.map(runCMD, joblst) - #error_check(all = region_lst, output = result, step = "germline module") - - + failed_jobs = [job for job, status in zip(joblst, result) if status != 0] + if failed_jobs: + logger.error("The following jobs failed:") + for job in failed_jobs: + logger.error(job) + logger.error("Pipeline failed due to job errors. See logs for details.") + exit(1) + else: + logger.info("All germline jobs completed successfully.") def somatic(args): @@ -323,11 +319,11 @@ def main(): global out, samtools, bcftools, bgzip, java, beagle out = os.path.abspath(args.out) - samtools = os.path.abspath(args.app_path) + "/samtools" - bcftools = os.path.abspath(args.app_path) + "/bcftools" - bgzip = os.path.abspath(args.app_path) + "/bgzip" + samtools = "samtools" + bcftools = "bcftools" + bgzip = "bgzip" java = "java" - beagle = os.path.abspath(args.app_path) + "/beagle.27Jul16.86a.jar" + beagle = "beagle" args.func(args) From 9fc3bbda8713f3f2c7c4b31a6933b122f15c06b3 Mon Sep 17 00:00:00 2001 From: Shiyi Yin Date: Fri, 23 May 2025 11:46:22 -0700 Subject: [PATCH 05/11] fix dependency check --- src/germline.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/germline.py b/src/germline.py index 847fc0a..2212f28 100644 --- a/src/germline.py +++ b/src/germline.py @@ -108,11 +108,11 @@ def validate_user_setting_germline(args): def check_dependencies(args): - programs_to_check = ("vcftools", "bgzip", "bcftools", "beagle.27Jul16.86a.jar","samtools","picard.jar", "java") + programs_to_check = ("vcftools", "bgzip", "bcftools", "samtools", "java", "beagle", "picard") - for prog in programs_to_check: - out = os.popen("command -v {}".format(args.app_path + "/" + prog)).read() - assert out != "", "Program {} cannot be found!".format(prog) + for prog in programs_to_check: + out = os.popen(f"command -v {prog}").read().strip() + assert out != "", f"Program {prog} cannot be found! Ensure it is installed and available in the PATH." # python_pkgs_to_check = ("drmaa",) From 6ec3766d32c2f7501ad0977c57d334b02b2ab418 Mon Sep 17 00:00:00 2001 From: Shiyi Yin Date: Fri, 23 May 2025 11:46:45 -0700 Subject: [PATCH 06/11] fix Germline test file --- test/runGermline.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/runGermline.sh b/test/runGermline.sh index 20c4b08..3b6819c 100644 --- a/test/runGermline.sh +++ b/test/runGermline.sh @@ -1,14 +1,14 @@ -path="/rsrch3/scratch/bcb/jdou1/scAncestry/Monopogen" -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${path}/apps +path="$(cd "$(dirname "${BASH_SOURCE[0]}")/../" && pwd)" +# export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${path}/apps -python ${path}/src/Monopogen.py preProcess -b bam.lst -o out -a ${path}/apps -t 8 +# python ${path}/src/Monopogen.py preProcess -b bam.lst -o out -a ${path}/apps -t 8 python ${path}/src/Monopogen.py germline \ - -a ${path}/apps -t 8 -r region.lst \ - -p ../example/ \ - -g ../example/chr20_2Mb.hg38.fa -m 3 -s all -o out + -a ${path}/apps -t 8 -r ${path}/test/region.lst \ + -p ${path}/example/ \ + -g ${path}/example/chr20_2Mb.hg38.fa -m 3 -s all -o out From 962e2a9d3eee55ecb43c507248d57e7961f7a3a9 Mon Sep 17 00:00:00 2001 From: Shiyi Yin Date: Fri, 23 May 2025 11:46:56 -0700 Subject: [PATCH 07/11] update conda yaml --- environment.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/environment.yaml b/environment.yaml index 4e3f73e..cb2d659 100644 --- a/environment.yaml +++ b/environment.yaml @@ -3,9 +3,11 @@ channels: - bioconda - conda-forge dependencies: - - python=3.10 - - pysam + - python>=3.10 + - picard + - beagle=4.1 - samtools + - pysam - bcftools - tabix - vcftools From 78786949e30334c0e5cd9c05c284c5fc3bbcb8cc Mon Sep 17 00:00:00 2001 From: Shiyi Yin Date: Fri, 23 May 2025 13:39:43 -0700 Subject: [PATCH 08/11] update cmd to bcftools update error reporting function --- src/Monopogen.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/Monopogen.py b/src/Monopogen.py index 085896f..64eaf9a 100644 --- a/src/Monopogen.py +++ b/src/Monopogen.py @@ -84,13 +84,13 @@ def germline(args): imputation_vcf = args.imputation_panel + "CCDG_14151_B01_GRM_WGS_2020-08-05_" + record[0] + ".filtered.shapeit2-duohmm-phased.vcf.gz" - cmd1 = samtools + " mpileup -b" + bam_filter + " -f " + args.reference + " -r " + jobid + " -q 20 -Q 20 --incl-flags 0 --excl-flags 0 -t DP -d 10000000 -v " - cmd1 = cmd1 + " | " + bcftools + " view " + " | " + bcftools + ' filter -e \'REF !~ "^[ATGC]$"\' | ' + bcftools + " norm -m-both -f " + args.reference - cmd1 = cmd1 + " | grep -v \"\" | " + bgzip + " -c > " + args.out + "/germline/" + jobid + ".gl.vcf.gz" + cmd1 = f"{bcftools} mpileup -b {bam_filter} -f {args.reference} -r {jobid} -q 20 -Q 20 --annotate FORMAT/DP -d 10000000 | {bcftools} call -mv -Oz -o {args.out}/germline/{jobid}.gl.vcf.gz" + #cmd1 = cmd1 + " | " + bcftools + " view " + " | " + bcftools + ' filter -e \'REF !~ "^[ATGC]$"\' | ' + bcftools + " norm -m-both -f " + args.reference + #cmd1 = cmd1 + " | grep -v \"\" | " + bgzip + " -c > " + args.out + "/germline/" + jobid + ".gl.vcf.gz" #cmd2 = bcftools + " view " + out + "/germline/" + jobid + ".gl.vcf.gz" + " -i 'FORMAT/DP>1' | " + bcftools + " call -cv | " + bgzip + " -c > " + args.out + "/SCvarCall/" + jobid + ".gt.vcf.gz" - cmd3 = java + " -Xmx20g -jar " + beagle + " gl=" + out + "/germline/" + jobid + ".gl.vcf.gz" + " ref=" + imputation_vcf + " chrom=" + record[0] + " out=" + out + "/germline/" + jobid + ".gp " + "impute=false modelscale=2 nthreads=24 gprobs=true niterations=0" + cmd3 = beagle + " gl=" + out + "/germline/" + jobid + ".gl.vcf.gz" + " ref=" + imputation_vcf + " chrom=" + record[0] + " out=" + out + "/germline/" + jobid + ".gp " + "impute=false modelscale=2 nthreads=24 gprobs=true niterations=0" - cmd5 = java + " -Xmx20g -jar " + beagle + " gt=" + out + "/germline/" + jobid + ".germline.vcf" + " ref=" + imputation_vcf + " chrom=" + record[0] + " out=" + out + "/germline/" + jobid+ ".phased " + "impute=false modelscale=2 nthreads=24 gprobs=true niterations=0" + cmd5 = beagle + " gt=" + out + "/germline/" + jobid + ".germline.vcf" + " ref=" + imputation_vcf + " chrom=" + record[0] + " out=" + out + "/germline/" + jobid+ ".phased " + "impute=false modelscale=2 nthreads=24 gprobs=true niterations=0" cmd5 = cmd5 + "\n" + "rm " + out + "/germline/" + jobid + ".germline.vcf" f_out = open(out + "/Script/runGermline_" + jobid + ".sh","w") if args.step == "varScan" or args.step == "all": @@ -116,12 +116,12 @@ def germline(args): with Pool(processes=args.nthreads) as pool: logger.info("Starting germline jobs...") result = pool.map(runCMD, joblst) - + # Check for errors in the results failed_jobs = [job for job, status in zip(joblst, result) if status != 0] if failed_jobs: logger.error("The following jobs failed:") for job in failed_jobs: - logger.error(job) + logger.error(job, status) logger.error("Pipeline failed due to job errors. See logs for details.") exit(1) else: @@ -225,6 +225,10 @@ def preProcess(args): bamlist.close() +def runCMD(cmd): + result = subprocess.run(cmd, shell=True) + return result.returncode + def main(): parser = argparse.ArgumentParser( description="""Monopogen: SNV calling from single cell sequencing From ca42fdab32bd86a40cfc845a0a8436392195e3a2 Mon Sep 17 00:00:00 2001 From: Shiyi Yin Date: Fri, 23 May 2025 15:58:33 -0700 Subject: [PATCH 09/11] remove link to custom binary --- src/bamProcess.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/bamProcess.py b/src/bamProcess.py index d6c3e9f..9855dd7 100644 --- a/src/bamProcess.py +++ b/src/bamProcess.py @@ -16,11 +16,11 @@ import pandas as pd from pysam import VariantFile -LIB_PATH = os.path.abspath( - os.path.join(os.path.dirname(os.path.realpath(__file__)), "pipelines/lib")) +# LIB_PATH = os.path.abspath( +# os.path.join(os.path.dirname(os.path.realpath(__file__)), "pipelines/lib")) -if LIB_PATH not in sys.path: - sys.path.insert(0, LIB_PATH) +# if LIB_PATH not in sys.path: +# sys.path.insert(0, LIB_PATH) PIPELINE_BASEDIR = os.path.dirname(os.path.realpath(sys.argv[0])) CFG_DIR = os.path.join(PIPELINE_BASEDIR, "cfg") From b190b3dc0b209ca27de958654e4ee007353f74d7 Mon Sep 17 00:00:00 2001 From: Shiyi Yin Date: Fri, 23 May 2025 16:05:04 -0700 Subject: [PATCH 10/11] add back the preprocess cmd --- test/runGermline.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/runGermline.sh b/test/runGermline.sh index 3b6819c..ce3774e 100644 --- a/test/runGermline.sh +++ b/test/runGermline.sh @@ -2,7 +2,7 @@ path="$(cd "$(dirname "${BASH_SOURCE[0]}")/../" && pwd)" # export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${path}/apps -# python ${path}/src/Monopogen.py preProcess -b bam.lst -o out -a ${path}/apps -t 8 +python ${path}/src/Monopogen.py preProcess -b bam.lst -o out -a ${path}/apps -t 8 python ${path}/src/Monopogen.py germline \ From 795307315427effc4694ee825ebada62698d4e4a Mon Sep 17 00:00:00 2001 From: Shiyi Yin Date: Fri, 23 May 2025 16:05:42 -0700 Subject: [PATCH 11/11] improve protablity of test script --- test/runSomatic.sh | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/test/runSomatic.sh b/test/runSomatic.sh index f276c26..90759f8 100644 --- a/test/runSomatic.sh +++ b/test/runSomatic.sh @@ -12,37 +12,32 @@ -path="/rsrch3/scratch/bcb/jdou1/scAncestry/Monopogen_v1.5" -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${path}/apps +path="$(cd "$(dirname "${BASH_SOURCE[0]}")/../" && pwd)" - python ${path}/src/Monopogen.py preProcess -b bam.somatic.lst -o somatic -a ${path}/apps -t 8 - -python ${path}/src/Monopogen.py germline \ - -a ${path}/apps -t 8 -r region.lst \ - -p ../example/ \ - -g ../example/chr20_2Mb.hg38.fa -m 3 -s all -o somatic +# python ${path}/src/Monopogen.py preProcess -b bam.somatic.lst -o somatic -a ${path}/apps -t 8 +# python ${path}/src/Monopogen.py germline \ +# -a ${path}/apps -t 8 -r ${path}/test/region.lst \ +# -p ${path}/example/ \ +# -g ${path}/example/chr20_2Mb.hg38.fa -m 3 -s all -o out python ${path}/src/Monopogen.py somatic \ - -a ${path}/apps -r region.lst -t 22 \ - -i somatic -l test.csv -s featureInfo \ - -g ../example/chr20_2Mb.hg38.fa - - + -a ${path}/apps -r ${path}/test/region.lst -t 22 \ + -i out -l ${path}/test.csv -s featureInfo \ + -g ${path}/example/chr20_2Mb.hg38.fa python ${path}/src/Monopogen.py somatic \ - -a ${path}/apps -r region.lst -t 22 \ - -i somatic -l test.csv -s cellScan \ - -g ../example/chr20_2Mb.hg38.fa + -a ${path}/apps -r ${path}/test/region.lst -t 22 \ + -i out -l ${path}/test.csv -s cellScan \ + -g ${path}/example/chr20_2Mb.hg38.fa ####### Not work due to fewer marker size module load R python ${path}/src/Monopogen.py somatic \ - -a ${path}/apps -r region.lst -t 22 \ - -i somatic -l test.csv -s LDrefinement \ - -g ../example/chr20_2Mb.hg38.fa - + -a ${path}/apps -r ${path}/test/region.lst -t 22 \ + -i out -l ${path}/test.csv -s LDrefinement \ + -g ${path}/example/chr20_2Mb.hg38.fa