diff --git a/environment.yaml b/environment.yaml new file mode 100644 index 0000000..cb2d659 --- /dev/null +++ b/environment.yaml @@ -0,0 +1,16 @@ +name: monopogen +channels: + - bioconda + - conda-forge +dependencies: + - python>=3.10 + - picard + - beagle=4.1 + - samtools + - pysam + - bcftools + - tabix + - vcftools + - pandas + - numpy + - gzip diff --git a/src/Monopogen.py b/src/Monopogen.py index 3ad8e7b..64eaf9a 100644 --- a/src/Monopogen.py +++ b/src/Monopogen.py @@ -24,25 +24,15 @@ from multiprocessing import Pool -LIB_PATH = os.path.abspath( - os.path.join(os.path.dirname(os.path.realpath(__file__)), "pipelines/lib")) - -if LIB_PATH not in sys.path: - sys.path.insert(0, LIB_PATH) - PIPELINE_BASEDIR = os.path.dirname(os.path.realpath(sys.argv[0])) CFG_DIR = os.path.join(PIPELINE_BASEDIR, "cfg") -#import pipelines -#from pipelines import get_cluster_cfgfile -#from pipelines import PipelineHandler - # global logger logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) handler = logging.StreamHandler() handler.setFormatter(logging.Formatter( - '[{asctime}] {levelname:8s} {filename} {message}', style='{')) + '[{asctime}] {levelname:8s} {filename} {message}', style='{')) logger.addHandler(handler) @@ -94,13 +84,13 @@ def germline(args): imputation_vcf = args.imputation_panel + "CCDG_14151_B01_GRM_WGS_2020-08-05_" + record[0] + ".filtered.shapeit2-duohmm-phased.vcf.gz" - cmd1 = samtools + " mpileup -b" + bam_filter + " -f " + args.reference + " -r " + jobid + " -q 20 -Q 20 --incl-flags 0 --excl-flags 0 -t DP -d 10000000 -v " - cmd1 = cmd1 + " | " + bcftools + " view " + " | " + bcftools + ' filter -e \'REF !~ "^[ATGC]$"\' | ' + bcftools + " norm -m-both -f " + args.reference - cmd1 = cmd1 + " | grep -v \"\" | " + bgzip + " -c > " + args.out + "/germline/" + jobid + ".gl.vcf.gz" + cmd1 = f"{bcftools} mpileup -b {bam_filter} -f {args.reference} -r {jobid} -q 20 -Q 20 --annotate FORMAT/DP -d 10000000 | {bcftools} call -mv -Oz -o {args.out}/germline/{jobid}.gl.vcf.gz" + #cmd1 = cmd1 + " | " + bcftools + " view " + " | " + bcftools + ' filter -e \'REF !~ "^[ATGC]$"\' | ' + bcftools + " norm -m-both -f " + args.reference + #cmd1 = cmd1 + " | grep -v \"\" | " + bgzip + " -c > " + args.out + "/germline/" + jobid + ".gl.vcf.gz" #cmd2 = bcftools + " view " + out + "/germline/" + jobid + ".gl.vcf.gz" + " -i 'FORMAT/DP>1' | " + bcftools + " call -cv | " + bgzip + " -c > " + args.out + "/SCvarCall/" + jobid + ".gt.vcf.gz" - cmd3 = java + " -Xmx20g -jar " + beagle + " gl=" + out + "/germline/" + jobid + ".gl.vcf.gz" + " ref=" + imputation_vcf + " chrom=" + record[0] + " out=" + out + "/germline/" + jobid + ".gp " + "impute=false modelscale=2 nthreads=24 gprobs=true niterations=0" + cmd3 = beagle + " gl=" + out + "/germline/" + jobid + ".gl.vcf.gz" + " ref=" + imputation_vcf + " chrom=" + record[0] + " out=" + out + "/germline/" + jobid + ".gp " + "impute=false modelscale=2 nthreads=24 gprobs=true niterations=0" - cmd5 = java + " -Xmx20g -jar " + beagle + " gt=" + out + "/germline/" + jobid + ".germline.vcf" + " ref=" + imputation_vcf + " chrom=" + record[0] + " out=" + out + "/germline/" + jobid+ ".phased " + "impute=false modelscale=2 nthreads=24 gprobs=true niterations=0" + cmd5 = beagle + " gt=" + out + "/germline/" + jobid + ".germline.vcf" + " ref=" + imputation_vcf + " chrom=" + record[0] + " out=" + out + "/germline/" + jobid+ ".phased " + "impute=false modelscale=2 nthreads=24 gprobs=true niterations=0" cmd5 = cmd5 + "\n" + "rm " + out + "/germline/" + jobid + ".germline.vcf" f_out = open(out + "/Script/runGermline_" + jobid + ".sh","w") if args.step == "varScan" or args.step == "all": @@ -124,12 +114,18 @@ def germline(args): if not args.norun == "TRUE": with Pool(processes=args.nthreads) as pool: - print(joblst) + logger.info("Starting germline jobs...") result = pool.map(runCMD, joblst) - #error_check(all = region_lst, output = result, step = "germline module") - - - + # Check for errors in the results + failed_jobs = [job for job, status in zip(joblst, result) if status != 0] + if failed_jobs: + logger.error("The following jobs failed:") + for job in failed_jobs: + logger.error(job, status) + logger.error("Pipeline failed due to job errors. See logs for details.") + exit(1) + else: + logger.info("All germline jobs completed successfully.") def somatic(args): @@ -229,6 +225,10 @@ def preProcess(args): bamlist.close() +def runCMD(cmd): + result = subprocess.run(cmd, shell=True) + return result.returncode + def main(): parser = argparse.ArgumentParser( description="""Monopogen: SNV calling from single cell sequencing @@ -323,11 +323,11 @@ def main(): global out, samtools, bcftools, bgzip, java, beagle out = os.path.abspath(args.out) - samtools = os.path.abspath(args.app_path) + "/samtools" - bcftools = os.path.abspath(args.app_path) + "/bcftools" - bgzip = os.path.abspath(args.app_path) + "/bgzip" + samtools = "samtools" + bcftools = "bcftools" + bgzip = "bgzip" java = "java" - beagle = os.path.abspath(args.app_path) + "/beagle.27Jul16.86a.jar" + beagle = "beagle" args.func(args) diff --git a/src/bamProcess.py b/src/bamProcess.py index d6c3e9f..9855dd7 100644 --- a/src/bamProcess.py +++ b/src/bamProcess.py @@ -16,11 +16,11 @@ import pandas as pd from pysam import VariantFile -LIB_PATH = os.path.abspath( - os.path.join(os.path.dirname(os.path.realpath(__file__)), "pipelines/lib")) +# LIB_PATH = os.path.abspath( +# os.path.join(os.path.dirname(os.path.realpath(__file__)), "pipelines/lib")) -if LIB_PATH not in sys.path: - sys.path.insert(0, LIB_PATH) +# if LIB_PATH not in sys.path: +# sys.path.insert(0, LIB_PATH) PIPELINE_BASEDIR = os.path.dirname(os.path.realpath(sys.argv[0])) CFG_DIR = os.path.join(PIPELINE_BASEDIR, "cfg") diff --git a/src/germline.py b/src/germline.py index 847fc0a..2212f28 100644 --- a/src/germline.py +++ b/src/germline.py @@ -108,11 +108,11 @@ def validate_user_setting_germline(args): def check_dependencies(args): - programs_to_check = ("vcftools", "bgzip", "bcftools", "beagle.27Jul16.86a.jar","samtools","picard.jar", "java") + programs_to_check = ("vcftools", "bgzip", "bcftools", "samtools", "java", "beagle", "picard") - for prog in programs_to_check: - out = os.popen("command -v {}".format(args.app_path + "/" + prog)).read() - assert out != "", "Program {} cannot be found!".format(prog) + for prog in programs_to_check: + out = os.popen(f"command -v {prog}").read().strip() + assert out != "", f"Program {prog} cannot be found! Ensure it is installed and available in the PATH." # python_pkgs_to_check = ("drmaa",) diff --git a/test/bam.lst b/test/bam.lst index f63da1c..2bf3ace 100644 --- a/test/bam.lst +++ b/test/bam.lst @@ -1,2 +1,2 @@ -ID1,/rsrch3/scratch/bcb/jdou1/scAncestry/Monopogen/example/A.bam -ID2,/rsrch3/scratch/bcb/jdou1/scAncestry/Monopogen/example/B.bam +ID1,example/A.bam +ID2,example/B.bam diff --git a/test/runGermline.sh b/test/runGermline.sh index 20c4b08..ce3774e 100644 --- a/test/runGermline.sh +++ b/test/runGermline.sh @@ -1,14 +1,14 @@ -path="/rsrch3/scratch/bcb/jdou1/scAncestry/Monopogen" -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${path}/apps +path="$(cd "$(dirname "${BASH_SOURCE[0]}")/../" && pwd)" +# export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${path}/apps python ${path}/src/Monopogen.py preProcess -b bam.lst -o out -a ${path}/apps -t 8 python ${path}/src/Monopogen.py germline \ - -a ${path}/apps -t 8 -r region.lst \ - -p ../example/ \ - -g ../example/chr20_2Mb.hg38.fa -m 3 -s all -o out + -a ${path}/apps -t 8 -r ${path}/test/region.lst \ + -p ${path}/example/ \ + -g ${path}/example/chr20_2Mb.hg38.fa -m 3 -s all -o out diff --git a/test/runPreprocess.sh b/test/runPreprocess.sh index ec4e408..9bbcf30 100644 --- a/test/runPreprocess.sh +++ b/test/runPreprocess.sh @@ -1,9 +1,4 @@ -path="/rsrch3/scratch/bcb/jdou1/scAncestry/Monopogen" -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${path}/apps +path="$(cd "$(dirname "${BASH_SOURCE[0]}")/../" && pwd)" +# export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${path}/apps - -python ${path}/src/Monopogen.py preProcess -b bam.lst -o out -a ${path}/apps -t 8 - - - - +python ${path}/src/Monopogen.py preProcess -b ${path}/test/bam.lst -o out -a ${path}/apps -t 8 diff --git a/test/runSomatic.sh b/test/runSomatic.sh index f276c26..90759f8 100644 --- a/test/runSomatic.sh +++ b/test/runSomatic.sh @@ -12,37 +12,32 @@ -path="/rsrch3/scratch/bcb/jdou1/scAncestry/Monopogen_v1.5" -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${path}/apps +path="$(cd "$(dirname "${BASH_SOURCE[0]}")/../" && pwd)" - python ${path}/src/Monopogen.py preProcess -b bam.somatic.lst -o somatic -a ${path}/apps -t 8 - -python ${path}/src/Monopogen.py germline \ - -a ${path}/apps -t 8 -r region.lst \ - -p ../example/ \ - -g ../example/chr20_2Mb.hg38.fa -m 3 -s all -o somatic +# python ${path}/src/Monopogen.py preProcess -b bam.somatic.lst -o somatic -a ${path}/apps -t 8 +# python ${path}/src/Monopogen.py germline \ +# -a ${path}/apps -t 8 -r ${path}/test/region.lst \ +# -p ${path}/example/ \ +# -g ${path}/example/chr20_2Mb.hg38.fa -m 3 -s all -o out python ${path}/src/Monopogen.py somatic \ - -a ${path}/apps -r region.lst -t 22 \ - -i somatic -l test.csv -s featureInfo \ - -g ../example/chr20_2Mb.hg38.fa - - + -a ${path}/apps -r ${path}/test/region.lst -t 22 \ + -i out -l ${path}/test.csv -s featureInfo \ + -g ${path}/example/chr20_2Mb.hg38.fa python ${path}/src/Monopogen.py somatic \ - -a ${path}/apps -r region.lst -t 22 \ - -i somatic -l test.csv -s cellScan \ - -g ../example/chr20_2Mb.hg38.fa + -a ${path}/apps -r ${path}/test/region.lst -t 22 \ + -i out -l ${path}/test.csv -s cellScan \ + -g ${path}/example/chr20_2Mb.hg38.fa ####### Not work due to fewer marker size module load R python ${path}/src/Monopogen.py somatic \ - -a ${path}/apps -r region.lst -t 22 \ - -i somatic -l test.csv -s LDrefinement \ - -g ../example/chr20_2Mb.hg38.fa - + -a ${path}/apps -r ${path}/test/region.lst -t 22 \ + -i out -l ${path}/test.csv -s LDrefinement \ + -g ${path}/example/chr20_2Mb.hg38.fa