Skip to content

Commit

Permalink
feat: Add percent coverage option to stat (#83)
Browse files Browse the repository at this point in the history
* initial commit

* fix combine and parsing thresholds

* off by one

* remove minus one

* fix rm rf command in test-driver

* add perc_cov tests

* update readme

* update cli yml

* write value as scientific notation if small. update test expected output
  • Loading branch information
cademirch authored Sep 24, 2024
1 parent abe76cb commit adbc4c0
Show file tree
Hide file tree
Showing 10 changed files with 160 additions and 5 deletions.
15 changes: 14 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ FLAGS:
OPTIONS:
-r, --region <bed_file_path> A bed file that describes the region we want to run the stat
-s, --stat <stat_type> The type of statistics we want to perform, by default average. You can specify
statistic methods: mean, median, hist, percentile=X% (If this is not speficied
statistic methods: perc_cov, mean, median, hist, percentile=X% (If this is not speficied
d4tools will use mean by default)
-t, --threads <num_of_threads> Number of threads
Expand Down Expand Up @@ -204,6 +204,19 @@ $ d4tools stat -s percentile=95 -r region.bed hg002.d4
2 0 150000000 38
```

- Percent of bases at or above coverage levels (perc_cov)
```text
$ d4tools stat -H -s perc_cov=1,2 -r data/input_10nt.multiple_ranges.bed data/input_10nt.d4
#Chr Start End 1x 2x
chr 0 2 0.000 0.000
chr 0 8 0.625 0.375
chr 0 10 0.600 0.300
chr 1 6 0.600 0.400
chr 3 9 1.000 0.500
chr 4 5 1.000 1.000
chr 5 10 0.800 0.400
```

### Reading D4 File Served by static HTTP Server

D4 now supports showing and run statistics for D4 files that is served on a HTTP server without downloading the file to local.
Expand Down
2 changes: 2 additions & 0 deletions d4/src/task/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ mod mean;
mod sum;
mod value_range;
mod vector;
mod perc_cov;

use std::io::Result;

Expand All @@ -15,6 +16,7 @@ pub use mean::Mean;
pub use sum::Sum;
pub use value_range::ValueRange;
pub use vector::VectorStat;
pub use perc_cov::PercentCov;

use crate::d4file::{MultiTrackPartitionReader, MultiTrackReader};

Expand Down
77 changes: 77 additions & 0 deletions d4/src/task/perc_cov.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
use super::{Task, TaskPartition};
use std::iter::Once;

#[derive(Clone)]
pub struct PercentCovPart {
//thresholds should be sorted
thresholds: Vec<u32>,
counts: Vec<u32>,
}

impl TaskPartition<Once<i32>> for PercentCovPart {
type ParentType = PercentCov;
type ResultType = Vec<u32>;
fn new(left: u32, right: u32, parent: &Self::ParentType) -> Self {

Check warning on line 14 in d4/src/task/perc_cov.rs

View workflow job for this annotation

GitHub Actions / build (ubuntu-latest)

unused variable: `right`

Check warning on line 14 in d4/src/task/perc_cov.rs

View workflow job for this annotation

GitHub Actions / build (ubuntu-latest)

unused variable: `right`

Check warning on line 14 in d4/src/task/perc_cov.rs

View workflow job for this annotation

GitHub Actions / build (macos-latest)

unused variable: `right`

Check warning on line 14 in d4/src/task/perc_cov.rs

View workflow job for this annotation

GitHub Actions / build (macos-latest)

unused variable: `right`
Self {
thresholds: parent.3.clone(),
counts: vec![0; parent.3.len()],
}
}
// #[inline(always)]
fn feed(&mut self, _: u32, value: &mut Once<i32>) -> bool {
let value = value.next().unwrap();
for (i, thresh) in self.thresholds.iter().enumerate() {
if value as u32 >= *thresh {
self.counts[i] += 1
}
}
true
}
// #[inline(always)]
fn feed_range(&mut self, left: u32, right: u32, value: &mut Once<i32>) -> bool {
let value = value.next().unwrap();
for (i, thresh) in self.thresholds.iter().enumerate() {
if value as u32 >= *thresh {
self.counts[i] += right - left as u32
}
}
true
}

fn result(&mut self) -> Self::ResultType {
self.counts.clone()
}
}

pub struct PercentCov(String, u32, u32, Vec<u32>);

impl PercentCov {
pub fn new(chrom: &str, begin: u32, end: u32, thresholds: Vec<u32>) -> Self {
PercentCov(chrom.to_string(), begin, end, thresholds)
}
}

impl Task<std::iter::Once<i32>> for PercentCov {
type Partition = PercentCovPart;
type Output = Vec<f32>;
fn region(&self) -> (&str, u32, u32) {
(self.0.as_str(), self.1, self.2)
}
fn combine(&self, parts: &[Vec<u32>]) -> Self::Output {
let divisor = (self.2 - self.1) as f32;

let mut sums: Vec<u32> = vec![0; self.3.len()];

// Sum the vectors index-wise

for part in parts {
for (i, &value) in part.iter().enumerate() {
sums[i] += value;
}
}

let result: Vec<f32> = sums.into_iter().map(|x| x as f32 / divisor).collect();

result
}
}
2 changes: 1 addition & 1 deletion d4tools/src/stat/cli.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ args:
short: s
long: stat
value_name: stat_type
help: "The type of statistics we want to perform, by default average. You can specify statistic methods: mean, median, hist, percentile=X%, sum, count (If this is not specified d4utils will use mean by default)"
help: "The type of statistics we want to perform, by default average. You can specify statistic methods: mean, median, hist, percentile=X%, perc_cov, sum, count (If this is not specified d4utils will use mean by default)"
- region:
short: r
long: region
Expand Down
56 changes: 54 additions & 2 deletions d4tools/src/stat/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use d4::{
find_tracks,
index::{D4IndexCollection, Sum},
ssio::http::HttpReader,
task::{Histogram, Mean, SimpleTask, Task, TaskOutput},
task::{Histogram, Mean, PercentCov, SimpleTask, Task, TaskOutput},
Chrom, D4TrackReader,
};

Expand Down Expand Up @@ -227,6 +227,55 @@ fn hist_stat(matches: ArgMatches) -> Result<(), Box<dyn std::error::Error>> {

Ok(())
}
fn parse_stat_thresholds(stat: &str) -> Result<Vec<u32>, Box<dyn std::error::Error>> {
let thresholds_str = &stat["perc_cov=".len()..];

let mut thresholds: Vec<u32> = thresholds_str
.split(',')
.map(|s| s.trim().parse())
.collect::<Result<Vec<u32>, _>>()?;
thresholds.sort_unstable();
Ok(thresholds)
}

fn perc_cov_stat(
matches: ArgMatches,
mut print_header: bool,
) -> Result<(), Box<dyn std::error::Error>> {
let stat = matches.value_of("stat").unwrap_or("perc_cov=10,20,30");

let thresholds = parse_stat_thresholds(stat)?;
let mut unused = Vec::new();
let results = open_file_parse_region_and_then(matches, &mut unused, |mut input, regions| {
let tasks: Vec<_> = regions
.into_iter()
.map(|(chr, begin, end)| PercentCov::new(&chr, begin, end, thresholds.clone()))
.collect();

Ok(PercentCov::create_task(&mut input[0], tasks)?.run())
})?;

if print_header {
print!("#Chr\tStart\tEnd");
for t in thresholds.iter() {
print!("\t{}x", t);
}
println!();
print_header = false;
}
for r in results.into_iter() {
print!("{}\t{}\t{}", r.chrom, r.begin, r.end);
for value in r.output.iter() {
if value.abs() >= 0.001 {
print!("\t{:.3}", value);
} else {
print!("\t{:.3e}", value);
}
}
println!();
}
Ok(())
}

fn mean_stat_index<R: Read + Seek>(
mut reader: R,
Expand Down Expand Up @@ -450,12 +499,15 @@ pub fn entry_point(args: Vec<String>) -> Result<(), Box<dyn std::error::Error>>
Some("hist") => {
hist_stat(matches)?;
}
Some(whatever) if whatever.starts_with("perc_cov") => {
perc_cov_stat(matches, !header_printed)?;
}
Some(whatever) if whatever.starts_with("percentile=") => {
let prefix_len = "percentile=".len();
let percentile: f64 = whatever[prefix_len..].parse()?;
percentile_stat(matches, percentile / 100.0, !header_printed)?;
}
_ => panic!("Unsupported stat type"),
_ => panic!("Unsupported stat type: {:?}", matches.value_of("stat")),
}
Ok(())
}
1 change: 1 addition & 0 deletions d4tools/test/stat/perc_cov-multiple-ranges/cmdline
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
${D4TOOLS} stat -s perc_cov=1,2 -r ${DATADIR}/input_10nt.multiple_ranges.bed ${DATADIR}/input_10nt.d4
7 changes: 7 additions & 0 deletions d4tools/test/stat/perc_cov-multiple-ranges/output.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
chr 0 2 0.000e0 0.000e0
chr 0 8 0.625 0.375
chr 0 10 0.600 0.300
chr 1 6 0.600 0.400
chr 3 9 1.000 0.500
chr 4 5 1.000 1.000
chr 5 10 0.800 0.400
1 change: 1 addition & 0 deletions d4tools/test/stat/perc_cov-overlapping-range/cmdline
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
${D4TOOLS} stat -s perc_cov=1,2 -r ${DATADIR}/input_10nt.overlapping_ranges.bed ${DATADIR}/input_10nt.d4
2 changes: 2 additions & 0 deletions d4tools/test/stat/perc_cov-overlapping-range/output.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
chr 0 10 0.600 0.300
chr 4 5 1.000 1.000
2 changes: 1 addition & 1 deletion d4tools/test/test-driver
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ EOF

popd >& /dev/null

rm ${OUTDIR} -rf
rm -rf ${OUTDIR}
done

echo
Expand Down

0 comments on commit adbc4c0

Please sign in to comment.