Merge remote-tracking branch 'upstream/dev' into Dump

maxulysse · maxulysse · commit 5898973216d7 · 2019-04-01T16:07:55.000+02:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,24 +7,37 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
 ## [Unreleased]
 
+### `Changed`
+
+-   [#744](https://github.com/SciLifeLab/Sarek/pull/744) - Refactor `germlineVC.nf`
+-   [#776](https://github.com/SciLifeLab/Sarek/pull/776) - Helper script now downloads annotations for VEP CADD plugin
+
 ### `Added`
 
 -   [#753](https://github.com/SciLifeLab/Sarek/pull/753) - Update `binac`, `cfc` configuration
 -   [#766](https://github.com/SciLifeLab/Sarek/pull/766) - Added `ps` in `r-base` and `runallelecount` containers
+-   [#774](https://github.com/SciLifeLab/Sarek/pull/774) - Autogenerates memory requirements from MarkDuplicates when less than 8G is available. cf [nf-core/rnaseq#179](https://github.com/nf-core/rnaseq/pull/179)
+-   [#775](https://github.com/SciLifeLab/Sarek/pull/775) - Update paths for munin configuration
+-   [#777](https://github.com/SciLifeLab/Sarek/pull/777) - Add GeneSplicer `1.0` to container
+-   [#777](https://github.com/SciLifeLab/Sarek/pull/777) - Add possibility to use VEP GeneSplicer plugin
+-   [#777](https://github.com/SciLifeLab/Sarek/pull/777) - Add `removeVCF()` function to remove `.ann`, `.gz` and `.vcf` from a VCF filename
 
 ### `Changed`
 
 -   [#741](https://github.com/SciLifeLab/Sarek/pull/741) - Use [dump](https://www.nextflow.io/docs/latest/operator.html#dump) operator
 -   [#744](https://github.com/SciLifeLab/Sarek/pull/744) - Refactor `germlineVC.nf`
+-   [#776](https://github.com/SciLifeLab/Sarek/pull/776) - Helper script now downloads annotations for VEP CADD plugin
 
 ### `Fixed`
 
+-   [#747](https://github.com/SciLifeLab/Sarek/pull/747) - Exclude Manta `*candidateSV.vcf` from annotation
 -   [#749](https://github.com/SciLifeLab/Sarek/pull/749) - Fix config problematic use of queue `core` for uppmax-slurm
 -   [#760](https://github.com/SciLifeLab/Sarek/pull/760) - Fix undefined `task.mem`
 -   [#751](https://github.com/SciLifeLab/Sarek/pull/751), [#756](https://github.com/SciLifeLab/Sarek/pull/756) - Typos in `igenomes.config`
 -   [#757](https://github.com/SciLifeLab/Sarek/pull/757) - Typos in `binac`, `cfc` configuration
 -   [#758](https://github.com/SciLifeLab/Sarek/pull/758) - Typos in `ASCAT` documentation
 -   [#765](https://github.com/SciLifeLab/Sarek/pull/765) - Check only for references that are needed to fix [#754](https://github.com/SciLifeLab/Sarek/issues/754)
+-   [#777](https://github.com/SciLifeLab/Sarek/pull/777) - Fix name collision in `annotate.nf`
 
 ### `Deprecated`
 
diff --git a/annotate.nf b/annotate.nf
@@ -76,7 +76,7 @@ if (annotateVCF == []) {
   Channel.empty().mix(
     Channel.fromPath("${params.outDir}/VariantCalling/*/HaplotypeCaller/*.vcf.gz")
       .flatten().map{vcf -> ['haplotypecaller', vcf.minus(vcf.fileName)[-2].toString(), vcf]},
-    Channel.fromPath("${params.outDir}/VariantCalling/*/Manta/*SV.vcf.gz")
+    Channel.fromPath("${params.outDir}/VariantCalling/*/Manta/*SV.vcf.gz").filter{vcf -> !vcf.fileName.toString().endsWith('candidateSV.vcf.gz')} // glob [!candidate] is a negated character class (it would also drop diploidSV/somaticSV), so exclude candidateSV explicitly
       .flatten().map{vcf -> ['manta', vcf.minus(vcf.fileName)[-2].toString(), vcf]},
     Channel.fromPath("${params.outDir}/VariantCalling/*/MuTect2/*.vcf.gz")
       .flatten().map{vcf -> ['mutect2', vcf.minus(vcf.fileName)[-2].toString(), vcf]},
@@ -104,7 +104,7 @@ vcfForVep = vcfForVep.map {
 }
 
 process RunBcftoolsStats {
-  tag {"${idPatient} - ${vcf}"}
+  tag {"${idPatient} - ${variantCaller} - ${vcf}"}
 
   publishDir "${params.outDir}/Reports/BCFToolsStats", mode: params.publishDirMode
 
@@ -130,11 +130,13 @@ process RunVcftools {
     set variantCaller, idPatient, file(vcf) from vcfForVCFtools
 
   output:
-    file ("${vcf.simpleName}.*") into vcfReport
+    file ("${reducedVCF}.*") into vcfReport
 
   when: !params.noReports
 
-  script: QC.vcftools(vcf)
+  script:
+    reducedVCF = SarekUtils.reduceVCF(vcf)
+    QC.vcftools(vcf)
 }
 
 vcfReport = vcfReport.dump(tag:'VCFTools')
@@ -143,7 +145,7 @@ process RunSnpeff {
   tag {"${idPatient} - ${variantCaller} - ${vcf}"}
 
   publishDir params.outDir, mode: params.publishDirMode, saveAs: {
-    if (it == "${vcf.simpleName}_snpEff.ann.vcf") null
+    if (it == "${reducedVCF}_snpEff.ann.vcf") null
     else "Annotation/${idPatient}/snpEff/${it}"
   }
 
@@ -153,25 +155,26 @@ process RunSnpeff {
     val snpeffDb from Channel.value(params.genomes[params.genome].snpeffDb)
 
   output:
-    set file("${vcf.simpleName}_snpEff.genes.txt"), file("${vcf.simpleName}_snpEff.csv"), file("${vcf.simpleName}_snpEff.summary.html") into snpeffOutput
-    set val("snpEff"), variantCaller, idPatient, file("${vcf.simpleName}_snpEff.ann.vcf") into snpeffVCF
+    set file("${reducedVCF}_snpEff.genes.txt"), file("${reducedVCF}_snpEff.csv"), file("${reducedVCF}_snpEff.summary.html") into snpeffOutput
+    set val("snpEff"), variantCaller, idPatient, file("${reducedVCF}_snpEff.ann.vcf") into snpeffVCF
 
   when: 'snpeff' in tools || 'merge' in tools
 
   script:
+  reducedVCF = SarekUtils.reduceVCF(vcf)
   cache = (params.snpEff_cache && params.annotation_cache) ? "-dataDir \${PWD}/${dataDir}" : ""
   """
   snpEff -Xmx${task.memory.toGiga()}g \
   ${snpeffDb} \
-  -csvStats ${vcf.simpleName}_snpEff.csv \
+  -csvStats ${reducedVCF}_snpEff.csv \
   -nodownload \
   ${cache} \
   -canon \
   -v \
   ${vcf} \
-  > ${vcf.simpleName}_snpEff.ann.vcf
+  > ${reducedVCF}_snpEff.ann.vcf
 
-  mv snpEff_summary.html ${vcf.simpleName}_snpEff.summary.html
+  mv snpEff_summary.html ${reducedVCF}_snpEff.summary.html
   """
 }
 
@@ -194,7 +197,7 @@ process RunVEP {
   tag {"${idPatient} - ${variantCaller} - ${vcf}"}
 
   publishDir params.outDir, mode: params.publishDirMode, saveAs: {
-    if (it == "${vcf.simpleName}_VEP.summary.html") "Annotation/${idPatient}/VEP/${it}"
+    if (it == "${reducedVCF}_VEP.summary.html") "Annotation/${idPatient}/VEP/${it}"
     else null
   }
 
@@ -210,34 +213,40 @@ process RunVEP {
     ])
 
   output:
-    set finalAnnotator, variantCaller, idPatient, file("${vcf.simpleName}_VEP.ann.vcf") into vepVCF
-    file("${vcf.simpleName}_VEP.summary.html") into vepReport
+    set finalAnnotator, variantCaller, idPatient, file("${reducedVCF}_VEP.ann.vcf") into vepVCF
+    file("${reducedVCF}_VEP.summary.html") into vepReport
 
   when: 'vep' in tools || 'merge' in tools
 
   script:
+  reducedVCF = SarekUtils.reduceVCF(vcf)
   finalAnnotator = annotator == "snpEff" ? 'merge' : 'VEP'
   genome = params.genome == 'smallGRCh37' ? 'GRCh37' : params.genome
   dir_cache = (params.vep_cache && params.annotation_cache) ? " \${PWD}/${dataDir}" : "/.vep"
   cadd = (params.cadd_cache && params.cadd_WG_SNVs && params.cadd_InDels) ? "--plugin CADD,whole_genome_SNVs.tsv.gz,InDels.tsv.gz" : ""
+  genesplicer = params.genesplicer ? "--plugin GeneSplicer,/opt/conda/envs/sarek-2.3/bin/genesplicer,/opt/conda/envs/sarek-2.3/share/genesplicer-1.0-1/human,context=200,tmpdir=\$PWD/${reducedVCF}" : "--offline"
   """
+  mkdir ${reducedVCF}
+
   vep \
   -i ${vcf} \
-  -o ${vcf.simpleName}_VEP.ann.vcf \
+  -o ${reducedVCF}_VEP.ann.vcf \
   --assembly ${genome} \
   ${cadd} \
+  ${genesplicer} \
   --cache \
   --cache_version ${cache_version} \
   --dir_cache ${dir_cache} \
   --everything \
   --filter_common \
   --fork ${task.cpus} \
   --format vcf \
-  --offline \
   --per_gene \
-  --stats_file ${vcf.simpleName}_VEP.summary.html \
+  --stats_file ${reducedVCF}_VEP.summary.html \
   --total_length \
   --vcf
+
+  rm -rf ${reducedVCF}
   """
 }
 
@@ -257,6 +266,7 @@ process CompressVCF {
     set annotator, variantCaller, idPatient, file("*.vcf.gz"), file("*.vcf.gz.tbi") into (vcfCompressed, vcfCompressedoutput)
 
   script:
+  reducedVCF = SarekUtils.reduceVCF(vcf)
   finalAnnotator = annotator == "merge" ? "VEP" : annotator
   """
   bgzip < ${vcf} > ${vcf}.gz
diff --git a/build.nf b/build.nf
@@ -332,9 +332,10 @@ process BuildCache_VEP {
 }
 
 caddFileToDownload = (params.cadd_version) && (params.genome == "GRCh37" || params.genome == "GRCh38") ?
-  Channel.from("https://krishna.gs.washington.edu/download/CADD/${params.cadd_version}/${params.genome}/InDels.tsv.gz",
-    "https://krishna.gs.washington.edu/download/CADD/${params.cadd_version}/${params.genome}/whole_genome_SNVs.tsv.gz")
-  : Channel.empty()
+  Channel.from(
+    "https://krishna.gs.washington.edu/download/CADD/${params.cadd_version}/${params.genome}/InDels_inclAnno.tsv.gz",
+    "https://krishna.gs.washington.edu/download/CADD/${params.cadd_version}/${params.genome}/whole_genome_SNVs_inclAnno.tsv.gz"
+  ) : Channel.empty()
 
 process DownloadCADD {
   tag {caddFile}
@@ -352,7 +353,7 @@ process DownloadCADD {
   script:
   """
   wget --quiet ${caddFile}
-  tabix *.tsv.gz
+  wget --quiet ${caddFile}.tbi
   """
 }
 
diff --git a/conf/munin.config b/conf/munin.config
@@ -11,16 +11,16 @@ env {
 }
 
 params {
-  cadd_InDels      = "/data0/btb/cache/CADD/${params.genome}/InDels.tsv.gz"
-  cadd_InDels_tbi  = "/data0/btb/cache/CADD/${params.genome}/InDels.tsv.gz.tbi"
-  cadd_WG_SNVs     = "/data0/btb/cache/CADD/${params.genome}/whole_genome_SNVs.tsv.gz"
-  cadd_WG_SNVs_tbi = "/data0/btb/cache/CADD/${params.genome}/whole_genome_SNVs.tsv.gz.tbi"
-  containerPath    = '/data0/btb/containers/'
-  genome_base      = params.genome == 'GRCh37' ? '/data0/btb/references/igenomes/Homo_sapiens/GATK/GRCh37/' : params.genome == 'GRCh38' ? '/data0/btb/references/igenomes/Homo_sapiens/GATK/GRCh38/' : 'References/smallGRCh37'
+  cadd_InDels      = "/data1/cache/CADD/${params.genome}/InDels.tsv.gz"
+  cadd_InDels_tbi  = "/data1/cache/CADD/${params.genome}/InDels.tsv.gz.tbi"
+  cadd_WG_SNVs     = "/data1/cache/CADD/${params.genome}/whole_genome_SNVs.tsv.gz"
+  cadd_WG_SNVs_tbi = "/data1/cache/CADD/${params.genome}/whole_genome_SNVs.tsv.gz.tbi"
+  containerPath    = '/data1/containers/'
+  genome_base      = params.genome == 'smallGRCh37' ? 'References/smallGRCh37' : "/data1/references/igenomes/Homo_sapiens/GATK/${params.genome}/" // double quotes required: single-quoted Groovy strings do not interpolate ${params.genome}
   singleCPUMem     = 15.GB
-  snpEff_cache     = '/data0/btb/cache/snpEff'
+  snpEff_cache     = '/data1/cache/snpEff'
   totalMemory      = 754.GB
-  vep_cache        = '/data0/btb/cache/VEP'
+  vep_cache        = '/data1/cache/VEP'
 }
 
 executor.$local.cpus = 48
diff --git a/containers/vepgrch37/environment.yml b/containers/vepgrch37/environment.yml
@@ -8,3 +8,4 @@ channels:
 
 dependencies:
   - ensembl-vep=95.2
+  - genesplicer=1.0
diff --git a/containers/vepgrch38/environment.yml b/containers/vepgrch38/environment.yml
@@ -8,3 +8,4 @@ channels:
 
 dependencies:
   - ensembl-vep=95.2
+  - genesplicer=1.0
diff --git a/docs/ANNOTATION.md b/docs/ANNOTATION.md
@@ -53,3 +53,13 @@ Such files are meant to be share between multiple users, so this script is mainl
 ```
 nextflow run build.nf --cadd_cache /Path/To/CADDcache --genome <GENOME>
 ```
+
+## Using VEP GeneSplicer plugin
+
+To enable the use of the VEP GeneSplicer plugin:
+ - use the `--genesplicer` flag
+
+Example:
+```
+nextflow run annotate.nf --tools VEP --annotateVCF file.vcf.gz --genome GRCh38 --genesplicer
+```
diff --git a/docs/CONTAINERS.md b/docs/CONTAINERS.md
@@ -35,6 +35,7 @@ Additional containers need to be downloaded for somatic variant calling with ASC
 - Contain **[FastQC][fastqc-link]** 0.11.8
 - Contain **[FreeBayes][freebayes-link]** 1.2.0
 - Contain **[GATK4][gatk4-link]** 4.0.9.0
+- Contain **[GeneSplicer][genesplicer-link]** 1.0
 - Contain **[HTSlib][htslib-link]** 1.9
 - Contain **[IGVtools][igvtools-link]** 2.3.93
 - Contain **[Manta][manta-link]** 1.4.0
@@ -62,12 +63,14 @@ Additional containers need to be downloaded for somatic variant calling with ASC
 ### vepgrch37 [![vepgrch37-docker status][vepgrch37-docker-badge]][vepgrch37-docker-link]
 
 - Based on `nfcore/base:latest`
+- Contain **[GeneSplicer][genesplicer-link]** 1.0
 - Contain **[VEP][vep-link]** 95.1
 - Contain cache for GRCh37 version 95
 
 ### vepgrch38 [![vepgrch38-docker status][vepgrch38-docker-badge]][vepgrch38-docker-link]
 
 - Based on `nfcore/base:latest`
+- Contain **[GeneSplicer][genesplicer-link]** 1.0
 - Contain **[VEP][vep-link]** 95.1
 - Contain cache for GRCh38 version 95
 
@@ -131,6 +134,7 @@ You'll just need to specify the correct repository either in command line or in
 [fastqc-link]: http://www.bioinformatics.babraham.ac.uk/projects/fastqc/
 [freebayes-link]: https://github.com/ekg/freebayes
 [gatk4-link]: https://github.com/broadinstitute/gatk
+[genesplicer-link]: https://ccb.jhu.edu/software/genesplicer/
 [htslib-link]: https://github.com/samtools/htslib
 [igvtools-link]: http://software.broadinstitute.org/software/igv/
 [manta-link]: https://github.com/Illumina/manta
diff --git a/docs/INSTALL.md b/docs/INSTALL.md
@@ -16,9 +16,15 @@ This workflow itself needs little installation.
 
 Nextflow will automatically fetch Sarek from GitHub when launched if `SciLifeLab/Sarek` is specified as the workflow name.
 
+You can also tell Nextflow to pull Sarek explicitly using:
+```bash
+nextflow pull SciLifeLab/Sarek
+```
+
 Sarek uses Singularity containers to package all the different tools.
 
-If you plan to use the automatic pull of Singularity images, you can use the [`singularity.config`](https://github.com/SciLifeLab/Sarek/blob/master/conf/singularity.config) configuration file. You can also set up the Nextflow environnement variable `NXF_SINGULARITY_CACHEDIR` to choose where to store them.
+If you plan to use the automatic pull of Singularity images, you can use the [`singularity.config`](https://github.com/SciLifeLab/Sarek/blob/master/conf/singularity.config) configuration file.
+You can also set up the Nextflow environment variable `NXF_SINGULARITY_CACHEDIR` to choose where to store them.
 
 For example
 ```bash
diff --git a/environment.yml b/environment.yml
@@ -15,6 +15,7 @@ dependencies:
   - fontconfig=2.12.6 #for FastQC
   - freebayes=1.2.0
   - gatk4=4.0.9.0
+  - genesplicer=1.0
   - htslib=1.9
   - igvtools=2.3.93
   - manta=1.4.0
diff --git a/germlineVC.nf b/germlineVC.nf
@@ -370,15 +370,15 @@ vcfForQC = Channel.empty().mix(
 (vcfForBCFtools, vcfForVCFtools) = vcfForQC.into(2)
 
 process RunBcftoolsStats {
-  tag {vcf}
+  tag {"${variantCaller} - ${vcf}"}
 
   publishDir "${params.outDir}/Reports/BCFToolsStats", mode: params.publishDirMode
 
   input:
     set variantCaller, file(vcf) from vcfForBCFtools
 
   output:
-    file ("${vcf.simpleName}.bcf.tools.stats.out") into bcfReport
+    file ("*.bcf.tools.stats.out") into bcfReport
 
   when: !params.noReports
 
@@ -388,19 +388,21 @@ process RunBcftoolsStats {
 bcfReport.dump(tag:'BCFTools')
 
 process RunVcftools {
-  tag {vcf}
+  tag {"${variantCaller} - ${vcf}"}
 
   publishDir "${params.outDir}/Reports/VCFTools", mode: params.publishDirMode
 
   input:
     set variantCaller, file(vcf) from vcfForVCFtools
 
   output:
-    file ("${vcf.simpleName}.*") into vcfReport
+    file ("${reducedVCF}.*") into vcfReport
 
   when: !params.noReports
 
-  script: QC.vcftools(vcf)
+  script:
+    reducedVCF = SarekUtils.reduceVCF(vcf)
+    QC.vcftools(vcf)
 }
 
 vcfReport.dump(tag:'VCFTools')
diff --git a/lib/QC.groovy b/lib/QC.groovy
@@ -2,7 +2,7 @@ class QC {
 // Run bcftools on vcf file
   static def bcftools(vcf) {
     """
-    bcftools stats ${vcf} > ${vcf.simpleName}.bcf.tools.stats.out
+    bcftools stats ${vcf} > ${SarekUtils.reduceVCF(vcf)}.bcf.tools.stats.out
     """
   }
 
@@ -19,22 +19,22 @@ class QC {
     vcftools \
     --gzvcf ${vcf} \
     --relatedness2 \
-    --out ${vcf.simpleName}
+    --out ${SarekUtils.reduceVCF(vcf)}
 
     vcftools \
     --gzvcf ${vcf} \
     --TsTv-by-count \
-    --out ${vcf.simpleName}
+    --out ${SarekUtils.reduceVCF(vcf)}
 
     vcftools \
     --gzvcf ${vcf} \
     --TsTv-by-qual \
-    --out ${vcf.simpleName}
+    --out ${SarekUtils.reduceVCF(vcf)}
 
     vcftools \
     --gzvcf ${vcf} \
     --FILTER-summary \
-    --out ${vcf.simpleName}
+    --out ${SarekUtils.reduceVCF(vcf)}
     """
   }
 }
diff --git a/lib/SarekUtils.groovy b/lib/SarekUtils.groovy
diff --git a/main.nf b/main.nf
diff --git a/somaticVC.nf b/somaticVC.nf

Original file line number	Diff line number	Diff line change
`@@ -8,3 +8,4 @@ channels:`
`8`	`8`
`9`	`9`	`dependencies:`
`10`	`10`	`- ensembl-vep=95.2`
	`11`	`+ - genesplicer=1.0`