From 25299147202690183cd14f47e298210b3421c82d Mon Sep 17 00:00:00 2001 From: "Michael G. Campana" Date: Fri, 13 Sep 2019 13:01:22 -0400 Subject: [PATCH] Gzip handling --- CHANGELOG.md | 3 +++ README.md | 3 ++- vcf2aln.rb | 16 +++++++++++++++- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4162d6a..1d1e118 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,9 @@ Michael G. Campana & Jacob A. West-Roberts, 2017-2019 Smithsonian Conservation Biology Institute Contact: campanam@si.edu +### Version 0.11.0 +Read/write gzipped files + ### Version 0.10.0 Probabilistic pseudohaplotype calling Minimum samples called as a percentage option diff --git a/README.md b/README.md index ec615fe..d64618a 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ In the terminal: Optionally, vcf2aln.rb can be placed within the user’s $PATH so that it can be executed from any location. Depending on your operating system, you may need to change the shebang line in the script (first line starting with #!) to specify the path of your Ruby executable. ## Input -vcf2aln requires an all-sites VCF (e.g. such as one produced using EMIT_ALL_SITES in the [Genome Analysis Toolkit](https://software.broadinstitute.org/gatk/)). +vcf2aln requires an all-sites VCF (e.g. such as one produced using EMIT_ALL_SITES in the [Genome Analysis Toolkit](https://software.broadinstitute.org/gatk/)). Files with the final extension ".gz" are assumed to be gzip-compressed. ## Execution Execute the script using `ruby vcf2aln.rb` (or `vcf2aln.rb` if the script is in your $PATH). This will display the help screen. Basic usage is as follows: @@ -33,6 +33,7 @@ vcf2aln can also be used in a pipe. For example, it can directly convert the out `-i, --input [FILE]`: Input VCF file. `--pipe`: Read data from an uncompressed VCF stream rather than a file. `-o, --outprefix [VALUE]`: Output FASTA alignment prefix. +`-z, --gzip`: Gzip output alignments. `-c, --concatenate`: Concatenate markers into single alignment (e.g. concatenate multiple separate chromosomes/contigs). `-s, --skip`: Skip missing sites in VCF. `-O, --onehap`: Print only one haplotype for diploid data. If phasing information is missing, it will generate a pseudohaplotype by randomly assigning one of the alleles. Conflicts with -a. diff --git a/vcf2aln.rb b/vcf2aln.rb index e2bd480..691c65d 100644 --- a/vcf2aln.rb +++ b/vcf2aln.rb @@ -2,7 +2,7 @@ #----------------------------------------------------------------------------------------------- # vcf2aln -VCF2ALNVER = "0.10.0" +VCF2ALNVER = "0.11.0" # Michael G. Campana, Jacob A. West-Roberts, 2017-2019 # Smithsonian Conservation Biology Institute #----------------------------------------------------------------------------------------------- @@ -95,12 +95,16 @@ def print_locus end if $options.alts system("cat *#{$options.outprefix + @name}*.tmp.fa > #{$options.outprefix + @name + '.fa'}") + system("gzip #{$options.outprefix + @name + '.fa'}") if $options.gzip else if $options.onehap system("cat *#{$options.outprefix + @name}.tmp.fa > #{$options.outprefix + @name + '.fa'}") + system("gzip #{$options.outprefix + @name + '.fa'}") if $options.gzip else system("cat *#{$options.outprefix + @name}.hap1.tmp.fa > #{$options.outprefix + @name + '.hap1.fa'}") system("cat *#{$options.outprefix + @name}.hap2.tmp.fa > #{$options.outprefix + @name + '.hap2.fa'}") + system("gzip #{$options.outprefix + @name + '.hap1.fa'}") if $options.gzip + system("gzip #{$options.outprefix + @name + '.hap2.fa'}") if $options.gzip end end end @@ -143,6 +147,7 @@ def self.parse(options) args.adepth = nil #Don't filter allele depth by default args.split_regions = 0 #Don't activate region-split subroutine by default args.write_cycle = 1000000 # Number of cycles before force of write-out + args.gzip = false # Gzip output opt_parser = OptionParser.new do |opts| opts.banner = "Command-line usage: ruby vcf2aln.rb [options]" opts.separator "" @@ -156,6 +161,9 @@ def self.parse(options) opts.on("-o", "--outprefix [VALUE]", String, "Output alignment prefix") do |pref| args.outprefix = pref + "_" if pref != nil end + opts.on("-z", "--gzip", "Gzip output alignments") do + args.gzip = true + end opts.on("-c", "--concatenate", "Concatenate markers into single alignment") do args.concat = true end @@ -683,6 +691,12 @@ def read_input regionval = 0 if $options.pipe ARGF.each_line { |line| index, previous_index, previous_endex, previous_name, current_locus, prev_pos, write_cycle, region, regionval = vcf_to_alignment(line, index, previous_index, previous_endex, previous_name, current_locus, prev_pos, write_cycle, region, regionval) } + elsif $options.infile[-3..-1] == ".gz" + Zlib::GzipReader.open($options.infile) do |vcf2aln| + while line = vcf2aln.gets + index, previous_index, previous_endex, previous_name, current_locus, prev_pos, write_cycle, region, regionval = vcf_to_alignment(line, index, previous_index, previous_endex, previous_name, current_locus, prev_pos, write_cycle, region, regionval) + end + end else File.open($options.infile, 'r') do |vcf2aln| while line = vcf2aln.gets