galaxyproject · richard-burhans · Feb 18, 2025 · Feb 18, 2025 · Feb 18, 2025 · Feb 19, 2025
diff --git a/tools/rdeval/.shed.yml b/tools/rdeval/.shed.yml
@@ -0,0 +1,13 @@
+categories:
+- Statistics
+description: rdeval is a general-purpose, multithreaded read analysis and manipulation tool.
+homepage_url: https://github.com/vgl-hub/rdeval
+long_description: |
+    rdeval is a single, fast and exhaustive tool for summary statistics
+    and simultaneous manipulation of sequence read files in fa*[.gz],
+    bam, and cram formats. rdeval also allows seamless file conversion
+    between formats.
+name: rdeval
+owner: iuc
+remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/main/tools/rdeval
+type: unrestricted
diff --git a/tools/rdeval/macros.xml b/tools/rdeval/macros.xml
@@ -0,0 +1,15 @@
+<macros>
+    <!-- Tokens substituted into the tool wrapper: upstream version, wrapper revision, Galaxy profile. -->
+    <token name="@TOOL_VERSION@">0.0.5</token>
+    <token name="@VERSION_SUFFIX@">0</token>
+    <token name="@PROFILE@">23.02</token>
+    <!-- Package requirement, pinned through the tool version token above. -->
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="@TOOL_VERSION@">rdeval</requirement>
+        </requirements>
+    </xml>
+    <!-- Citation block expanded at the end of the tool wrapper. -->
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1101/2025.02.01.636073</citation>
+        </citations>
+    </xml>
+</macros>
diff --git a/tools/rdeval/rdeval.xml b/tools/rdeval/rdeval.xml
@@ -0,0 +1,205 @@
+<tool id="rdeval" name="rdeval" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+    <description>Multithreaded read analysis and manipulation tool</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements"/>
+    <command detect_errors="exit_code"><![CDATA[
+    ## Symlink every input as "<sanitized element identifier>.<datatype extension>"
+    ## and collect the link names for the rdeval command line.
+    #import re
+    #set $mangled_inputs = []
+    #for $input in $input_reads
+        #set $mangled_base = re.sub(r"[^\w\-\s]", "_", str($input.element_identifier))
+        #set $mangled_input = $mangled_base + "." + str($input.ext)
+        #silent $mangled_inputs.append($mangled_input)
+        ln -s '$input' '$mangled_input' &&
+    #end for
+    ## Parameters declared inside a section or conditional are addressed by their full path below.
+    #if $output_options.output_type.type_selector == "combined_reads"
+        ## rdeval is told to write output.<format>; point that name at the Galaxy output dataset.
+        ln -s '$reads_outfile' 'output.${output_options.output_type.format_selector}' &&
+    #end if
+    rdeval --input-reads #echo " ".join([f"'{input}'" for $input in $mangled_inputs])
+    #if $expected_gsize
+        $expected_gsize
+    #end if
+    #if $input_filter.file_filter.filter_selector == "exclude_file"
+        --exclude-list '$input_filter.file_filter.exclude_list'
+    #else if $input_filter.file_filter.filter_selector == "include_file"
+        --include-list '$input_filter.file_filter.include_list'
+    #end if
+    #if $input_filter.filter
+        --filter '$input_filter.filter'
+    #end if
+        --sample '$input_subsample.sample'
+    ## Compare against the empty string so that an explicit value of 0 is still passed on.
+    #if str($input_subsample.random_seed) != ""
+        --random-seed '$input_subsample.random_seed'
+    #end if
+    #if str($input_subsample.homopolymer_compress) != ""
+        --homopolymer-compress '$input_subsample.homopolymer_compress'
+    #end if
+    #if $output_options.stats_flavor.flavor_selector == "stats"
+        $output_options.stats_flavor.sequence_report
+    #else if $output_options.stats_flavor.flavor_selector == "quality"
+        --quality '$output_options.stats_flavor.quality'
+    #else if $output_options.stats_flavor.flavor_selector == "size"
+        --out-size '$output_options.stats_flavor.out_size'
+    #end if
+    #if $output_options.output_type.type_selector == "rd_file"
+        $output_options.output_type.md5
+        -o output.rd
+    #else if $output_options.output_type.type_selector == "combined_reads"
+        -o 'output.${output_options.output_type.format_selector}'
+    #end if
+        $output_options.verbose
+        --tabular
+        --threads \${GALAXY_SLOTS:-2}
+        > '$stats_outfile'
+    ]]></command>
+    <inputs>
+        <param argument="--input-reads" type="data" format="bam,cram,fasta,fasta.gz,fastq,fastq.gz" multiple="true" label="Input dataset" help="FASTA/FASTQ, BAM, or CRAM files."/>
+        <param name="expected_gsize" type="integer" min="0" optional="true" label="Expected Genome Size" help="Integer (e.g., 3000000000 for human)."/>
+        <section name="input_filter" title="Filter input reads" expanded="false">
+            <conditional name="file_filter">
+                <param name="filter_selector" type="select" label="Use an exclude or include file">
+                    <option value="no_file" selected="true">no</option>
+                    <option value="exclude_file">Use an exclude file</option>
+                    <option value="include_file">Use an include file</option>
+                </param>
+                <when value="no_file"/>
+                <!-- The header file is required once its branch is chosen; the command always emits it. -->
+                <when value="exclude_file">
+                    <param argument="--exclude-list" type="data" format="txt" label="File containing headers to exclude"/>
+                </when>
+                <when value="include_file">
+                    <param argument="--include-list" type="data" format="txt" label="File containing headers to include"/>
+                </when>
+            </conditional>
+            <param argument="--filter" type="text" optional="true" label="filter" help="e.g. l&gt;1000 &amp; q&gt;20">
+                <!-- Galaxy's default text sanitizer rewrites comparison and logical operators, so they are
+                     allowed explicitly here. Quote characters stay excluded because the command wraps the
+                     value in single quotes. NOTE(review): operator set taken from the help example; confirm
+                     against the rdeval filter syntax. -->
+                <sanitizer invalid_char="">
+                    <valid initial="string.letters,string.digits">
+                        <add value=" "/>
+                        <add value="."/>
+                        <add value="="/>
+                        <add value="&lt;"/>
+                        <add value="&gt;"/>
+                        <add value="&amp;"/>
+                        <add value="|"/>
+                    </valid>
+                </sanitizer>
+            </param>
+        </section>
+        <section name="input_subsample" title="Subsample input reads" expanded="false">
+            <param argument="--sample" type="float" min="0" max="1" value="1" label="fraction of reads to subsample"/>
+            <param argument="--random-seed" type="integer" min="0" optional="true" label="random seed to make subsampling reproducible"/>
+            <param argument="--homopolymer-compress" type="integer" min="0" optional="true" label="Compress homopolymers longer than n in the input"/>
+        </section>
+        <section name="output_options" title="Output options">
+            <!-- The selects below are not optional: the command passes their value whenever the branch is active. -->
+            <conditional name="stats_flavor">
+                <param name="flavor_selector" type="select" label="Stats output">
+                    <option value="stats" selected="true">Stats</option>
+                    <option value="quality">Quality</option>
+                    <option value="size">Size</option>
+                </param>
+                <when value="stats">
+                    <param argument="--sequence-report" type="boolean" checked="false" truevalue="--sequence-report" falsevalue="" label="Per read sequence report"/>
+                </when>
+                <when value="quality">
+                    <param argument="--quality" type="select" label="quality type">
+                        <option value="q" selected="true">Average quality for each read</option>
+                        <option value="a">Both length and quality for each read</option>
+                    </param>
+                </when>
+                <when value="size">
+                    <param argument="--out-size" type="select" label="size list type">
+                        <option value="u" selected="true">unsorted</option>
+                        <option value="s">sorted</option>
+                        <option value="h">histogram</option>
+                        <option value="c">inverse cumulative table</option>
+                    </param>
+                </when>
+            </conditional>
+            <conditional name="output_type">
+                <param name="type_selector" type="select" label="output type">
+                    <option value="rd_file" selected="true">RD file</option>
+                    <option value="combined_reads">Combined reads</option>
+                </param>
+                <when value="combined_reads">
+                    <param name="format_selector" type="select" label="Output format">
+                        <option value="fasta.gz" selected="true">fasta</option>
+                        <option value="fastq.gz">fastq</option>
+                        <option value="bam">bam</option>
+                        <option value="cram">cram</option>
+                    </param>
+                </when>
+                <when value="rd_file">
+                    <param argument="--md5" type="boolean" checked="false" truevalue="--md5" falsevalue="" label="Print md5 of .rd files"/>
+                </when>
+            </conditional>
+            <param argument="--verbose" type="boolean" checked="false" truevalue="--verbose" falsevalue="" label="Verbose output"/>
+        </section>
+    </inputs>
+    <outputs>
+        <!-- Summary statistics; the command redirects rdeval's standard output into this dataset. -->
+        <data name="stats_outfile" format="tabular" label="Rdeval summary"/>
+        <!-- Collected from the working directory only when the RD file output type is selected. -->
+        <data name="rd_outfile" from_work_dir="output.rd" format="binary" label="RD File">
+            <filter>output_options["output_type"]["type_selector"] == "rd_file"</filter>
+        </data>
+        <data name="reads_outfile" format="binary" label="Output reads">
+            <filter>output_options["output_type"]["type_selector"] == "combined_reads"</filter>
+            <!-- format_selector is nested in a section and a conditional, so it is referenced
+                 by its full dotted path; a bare name does not resolve and leaves the format as binary. -->
+            <change_format>
+                <when input="output_options.output_type.format_selector" value="fasta.gz" format="fasta.gz"/>
+                <when input="output_options.output_type.format_selector" value="fastq.gz" format="fastq.gz"/>
+                <when input="output_options.output_type.format_selector" value="bam" format="bam"/>
+                <when input="output_options.output_type.format_selector" value="cram" format="cram"/>
+            </change_format>
+        </data>
+    </outputs>
+    <tests>
+        <!-- Test 1: gzipped FASTA input with default options (summary plus RD file). -->
+        <test expect_num_outputs="2">
+            <param name="input_reads" value="test1.fasta.gz" ftype="fasta.gz"/>
+            <output name="stats_outfile" file="output1.tabular" ftype="tabular"/>
+            <output name="rd_outfile" ftype="binary">
+                <assert_contents>
+                    <has_size size="109" delta="1"/>
+                </assert_contents>
+            </output>
+        </test>
+        <!-- Test 2: gzipped FASTQ input; reuses the FASTA expectation and tolerates up to two differing lines. -->
+        <test expect_num_outputs="2">
+            <param name="input_reads" value="test1.fastq.gz" ftype="fastq.gz"/>
+            <output name="stats_outfile" file="output1.tabular" ftype="tabular" lines_diff="2"/>
+            <output name="rd_outfile" ftype="binary">
+                <assert_contents>
+                    <has_size size="128" delta="1"/>
+                </assert_contents>
+            </output>
+        </test>
+        <!-- Test 3: BAM input written out as combined gzipped FASTQ reads. -->
+        <test expect_num_outputs="2">
+            <param name="input_reads" value="test2.bam" ftype="bam"/>
+            <section name="output_options">
+                <conditional name="output_type">
+                    <param name="type_selector" value="combined_reads"/>
+                    <param name="format_selector" value="fastq.gz"/>
+                </conditional>
+            </section>
+            <output name="stats_outfile" file="output2.tabular" ftype="tabular"/>
+            <!-- Compare decompressed content so gzip header differences cannot fail the test. -->
+            <output name="reads_outfile" file="output2.fastq.gz" ftype="fastq.gz" decompress="true"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+
+**rdeval** is a general-purpose, multithreaded tool for analyzing and manipulating reads (FASTA/FASTQ/BAM/CRAM/RD). Usage::
+
+    rdeval input.fa*[.gz]|bam|cram|rd [expected genome size]
+
+::
+
+    Dataset report example:
+
+    +++Read summary+++:
+    # reads: 10000
+    Total read length: 134014104
+    Average read length: 13401.41
+    Read N50: 14270
+    Smallest read length: 1142
+    Largest read length: 40910
+    Coverage: inf
+    GC content %: 43.78
+    Base composition (A:C:T:G): 37693226:29331833:37655925:29333120
+    Average per base quality: 26.47
+
+::
+
+    Per sequence/read report (--sequence-report) example:
+
+    Header  Comment Length  A       C       G       T       N       GC      Average Quality
+    m54306U_210528_154706/69206614/ccs              22812   6170    5146    4802    6694    0       0.44    89.9705
+    m54306U_210528_154706/25888573/ccs              32200   9162    7270    7112    8656    0       0.45    56.8306
+    m54306U_210528_154706/40634168/ccs              8487    2443    1858    1876    2310    0       0.44    90.3828
+    m54306U_210528_154706/103745617/ccs             16496   4546    3752    3760    4438    0       0.46    88.3554
+
+**Attribution**
+
+This tool relies on the gfastar suite and the gfalibs toolkit `vgl-hub/gfalibs <https://github.com/vgl-hub/gfalibs>`_, developed by Giulio Formenti at the Rockefeller University.
+    ]]></help>
+    <expand macro="citations"/>
+</tool>
diff --git a/tools/rdeval/test-data/output1.tabular b/tools/rdeval/test-data/output1.tabular
@@ -0,0 +1,10 @@
+# reads	5
+Total read length	50
+Average read length	10.00
+Read N50	15
+Smallest read length	5
+Largest read length	15
+Coverage	inf
+GC content %	50.00
+Base composition (A:C:T:G)	9:14:11:6
+Average per base quality	0.00
diff --git a/tools/rdeval/test-data/output2.fastq.gz b/tools/rdeval/test-data/output2.fastq.gz
diff --git a/tools/rdeval/test-data/output2.tabular b/tools/rdeval/test-data/output2.tabular
@@ -0,0 +1,10 @@
+# reads	11
+Total read length	264855
+Average read length	24077.73
+Read N50	24322
+Smallest read length	17465
+Largest read length	36274
+Coverage	inf
+GC content %	40.81
+Base composition (A:C:T:G)	79479:54455:77277:53644
+Average per base quality	23.58
diff --git a/tools/rdeval/test-data/test1.fasta.gz b/tools/rdeval/test-data/test1.fasta.gz
diff --git a/tools/rdeval/test-data/test1.fastq.gz b/tools/rdeval/test-data/test1.fastq.gz
diff --git a/tools/rdeval/test-data/test2.bam b/tools/rdeval/test-data/test2.bam