2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "polars_bio"
version = "0.7.4"
version = "0.8.0"
edition = "2021"

[lib]
4 changes: 4 additions & 0 deletions README.md
@@ -31,13 +31,17 @@ It provides a DataFrame API for genomics data and is designed to be blazing fast

![count-overlaps-single.png](docs/assets/count-overlaps-single.png)

![coverage-single.png](docs/assets/coverage-single.png)

## Parallel performance 🏃‍🏃‍
![overlap-parallel.png](docs/assets/overlap-parallel.png)

![nearest-parallel.png](docs/assets/nearest-parallel.png)

![count-overlaps-parallel.png](docs/assets/count-overlaps-parallel.png)

![coverage-parallel.png](docs/assets/coverage-parallel.png)



Read the [documentation](https://biodatageeks.github.io/polars-bio/)
Binary file added docs/assets/coverage-parallel.png
Binary file added docs/assets/coverage-single.png
2 changes: 1 addition & 1 deletion docs/features.md
@@ -10,7 +10,7 @@
| cluster | :white_check_mark: | | :white_check_mark: | :white_check_mark: | | |
| [merge](api.md#polars_bio.merge) | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | :white_check_mark: |
| complement | :white_check_mark: | :construction: | | :white_check_mark: | :white_check_mark: | |
| coverage | :white_check_mark: | | :white_check_mark: | :white_check_mark: | | :white_check_mark: |
| [coverage](api.md#polars_bio.coverage) | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | :white_check_mark: |
| [expand](api.md#polars_bio.LazyFrame.expand) | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | :white_check_mark: |
| [sort](api.md#polars_bio.LazyFrame.sort_bedframe) | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | :white_check_mark: |
| [read_table](api.md#polars_bio.read_table) | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | :white_check_mark: |
36 changes: 14 additions & 22 deletions docs/notebooks/cookbook.ipynb
@@ -21,24 +21,16 @@
"id": "62a7b57c30bf54e2",
"metadata": {
"ExecuteTime": {
"end_time": "2025-03-05T16:41:54.268168Z",
"start_time": "2025-03-05T16:41:53.664194Z"
"end_time": "2025-03-07T10:02:23.527490Z",
"start_time": "2025-03-07T10:02:23.525921Z"
}
},
"source": [
"import polars_bio as pb\n",
"import polars as pl"
],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:polars_bio:Creating BioSessionContext\n"
]
}
],
"execution_count": 2
"outputs": [],
"execution_count": 8
},
{
"cell_type": "code",
@@ -646,34 +638,34 @@
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-02-28T11:52:12.169029Z",
"start_time": "2025-02-28T11:52:12.167384Z"
"end_time": "2025-03-07T10:02:16.509828Z",
"start_time": "2025-03-07T10:02:16.507744Z"
}
},
"cell_type": "code",
"source": "gcs_vcf_path = \"gs://genomics-public-data/platinum-genomes/vcf/NA12878_S1.genome.vcf\"",
"id": "31f0f3d0974245bd",
"outputs": [],
"execution_count": 16
"execution_count": 5
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-02-28T11:52:13.441345Z",
"start_time": "2025-02-28T11:52:13.439461Z"
"end_time": "2025-03-07T10:02:19.374881Z",
"start_time": "2025-03-07T10:02:19.372753Z"
}
},
"cell_type": "code",
"source": "info_fields=[\"AC\", \"AF\"]",
"id": "816c419b3b45ee44",
"outputs": [],
"execution_count": 17
"execution_count": 6
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-02-28T11:52:17.666747Z",
"start_time": "2025-02-28T11:52:16.365292Z"
"end_time": "2025-03-07T10:02:52.904221Z",
"start_time": "2025-03-07T10:02:41.364349Z"
}
},
"cell_type": "code",
@@ -713,12 +705,12 @@
"<small>shape: (3, 10)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>chrom</th><th>start</th><th>end</th><th>id</th><th>ref</th><th>alt</th><th>qual</th><th>filter</th><th>ac</th><th>af</th></tr><tr><td>str</td><td>u32</td><td>u32</td><td>str</td><td>str</td><td>str</td><td>f64</td><td>str</td><td>list[i32]</td><td>list[f32]</td></tr></thead><tbody><tr><td>&quot;chrM&quot;</td><td>1</td><td>1</td><td>&quot;&quot;</td><td>&quot;G&quot;</td><td>&quot;&quot;</td><td>0.0</td><td>&quot;PASS&quot;</td><td>null</td><td>null</td></tr><tr><td>&quot;chrM&quot;</td><td>2</td><td>72</td><td>&quot;&quot;</td><td>&quot;A&quot;</td><td>&quot;&quot;</td><td>0.0</td><td>&quot;PASS&quot;</td><td>null</td><td>null</td></tr><tr><td>&quot;chrM&quot;</td><td>73</td><td>73</td><td>&quot;&quot;</td><td>&quot;G&quot;</td><td>&quot;A&quot;</td><td>8752.780273</td><td>&quot;TruthSensitivityTranche99.90to…</td><td>[2]</td><td>[1.0]</td></tr></tbody></table></div>"
]
},
"execution_count": 18,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 18
"execution_count": 9
},
{
"metadata": {},
4 changes: 4 additions & 0 deletions docs/performance.md
@@ -7,12 +7,16 @@

![count-overlaps-single.png](assets/count-overlaps-single.png)

![coverage-single.png](assets/coverage-single.png)

## Parallel performance 🏃‍🏃‍
![overlap-parallel.png](assets/overlap-parallel.png)

![nearest-parallel.png](assets/nearest-parallel.png)

![count-overlaps-parallel.png](assets/count-overlaps-parallel.png)

![coverage-parallel.png](assets/coverage-parallel.png)
## Benchmarks 🧪
### Detailed results shortcuts 👨‍🔬
- [Binary operations](#binary-operations)
3 changes: 2 additions & 1 deletion polars_bio/__init__.py
@@ -14,7 +14,7 @@
sql,
)
from .polars_ext import PolarsRangesOperations as LazyFrame
from .range_op import FilterOp, count_overlaps, merge, nearest, overlap
from .range_op import FilterOp, count_overlaps, coverage, merge, nearest, overlap
from .range_viz import visualize_intervals

POLARS_BIO_MAX_THREADS = "datafusion.execution.target_partitions"
@@ -26,6 +26,7 @@
"nearest",
"merge",
"count_overlaps",
"coverage",
"ctx",
"FilterOp",
"visualize_intervals",
15 changes: 15 additions & 0 deletions polars_bio/polars_ext.py
@@ -222,3 +222,18 @@ def expand(
if midsk in schema:
df = df.drop(midsk)
return df

def coverage(
self,
other_df: pl.LazyFrame,
cols1=["chrom", "start", "end"],
cols2=["chrom", "start", "end"],
suffixes: tuple[str, str] = ("_1", "_2"),
) -> pl.LazyFrame:
"""
!!! note
Alias for [coverage](api.md#polars_bio.coverage)
"""
return pb.coverage(
self._ldf, other_df, cols1=cols1, cols2=cols2, suffixes=suffixes
)
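
The alias simply forwards to the top-level operation. A minimal sketch of calling it, assuming `PolarsRangesOperations` is registered as a Polars LazyFrame namespace elsewhere in `polars_ext.py` (the `pb` accessor name below is an assumption and is not shown in this diff):

```python
import polars as pl
import polars_bio as pb

a = pl.DataFrame({"chrom": ["chr1"], "start": [100], "end": [200]}).lazy()
b = pl.DataFrame({"chrom": ["chr1"], "start": [150], "end": [250]}).lazy()

# Assumed namespace accessor; per the diff, this forwards to pb.coverage(a, b).
a.pb.coverage(b).collect()
```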
61 changes: 60 additions & 1 deletion polars_bio/range_op.py
@@ -127,7 +127,7 @@ def nearest(
read_options: Union[ReadOptions, None] = None,
) -> Union[pl.LazyFrame, pl.DataFrame, pd.DataFrame]:
"""
Find pairs of overlapping genomic intervals.
Find pairs of closest genomic intervals.
Bioframe inspired API.

Parameters:
@@ -173,6 +173,65 @@
return range_operation(df1, df2, range_options, output_type, ctx, read_options)


def coverage(
df1: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
df2: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
overlap_filter: FilterOp = FilterOp.Strict,
suffixes: tuple[str, str] = ("_1", "_2"),
on_cols: Union[list[str], None] = None,
cols1: Union[list[str], None] = ["chrom", "start", "end"],
cols2: Union[list[str], None] = ["chrom", "start", "end"],
output_type: str = "polars.LazyFrame",
streaming: bool = False,
read_options: Union[ReadOptions, None] = None,
) -> Union[pl.LazyFrame, pl.DataFrame, pd.DataFrame]:
"""
Calculate intervals coverage.
Bioframe inspired API.

Parameters:
df1: Can be a path to a file, a polars DataFrame, a pandas DataFrame, or a registered table (see [register_vcf](api.md#polars_bio.register_vcf)). CSV with a header, BED and Parquet are supported.
df2: Can be a path to a file, a polars DataFrame, a pandas DataFrame, or a registered table. CSV with a header, BED and Parquet are supported.
overlap_filter: FilterOp, optional. The type of overlap to consider (Weak or Strict).
cols1: The names of columns containing the chromosome, start and end of the
genomic intervals, provided separately for each set.
cols2: The names of columns containing the chromosome, start and end of the
genomic intervals, provided separately for each set.
suffixes: Suffixes for the columns of the two overlapped sets.
on_cols: List of additional column names to join on. Default is None.
output_type: Type of the output. Default is "polars.LazyFrame"; "polars.DataFrame" and "pandas.DataFrame" are also supported.
streaming: **EXPERIMENTAL** If True, use Polars [streaming](features.md#streaming-out-of-core-processing) engine.
read_options: Additional options for reading the input files.


Returns:
**polars.LazyFrame**, polars.DataFrame, or pandas.DataFrame of the intervals with their coverage.

Note:
The default output format, i.e. [LazyFrame](https://docs.pola.rs/api/python/stable/reference/lazyframe/index.html), is recommended for large datasets as it supports output streaming and lazy evaluation.
This enables efficient processing of large datasets without loading the entire output dataset into memory.

Example:

Todo:
Support for on_cols.
"""

_validate_overlap_input(cols1, cols2, on_cols, suffixes, output_type, how="inner")

cols1 = DEFAULT_INTERVAL_COLUMNS if cols1 is None else cols1
cols2 = DEFAULT_INTERVAL_COLUMNS if cols2 is None else cols2
range_options = RangeOptions(
range_op=RangeOp.Coverage,
filter_op=overlap_filter,
suffixes=suffixes,
columns_1=cols1,
columns_2=cols2,
streaming=streaming,
)
return range_operation(df2, df1, range_options, output_type, ctx, read_options)


def count_overlaps(
df1: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
df2: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
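A hedged usage sketch of the new top-level API. The toy data below is made up; the default column names ("chrom", "start", "end") and the output types follow the docstring above:

```python
import polars as pl
import polars_bio as pb

# Hypothetical interval sets using the default column names.
reads = pl.DataFrame(
    {"chrom": ["chr1", "chr1"], "start": [100, 400], "end": [250, 500]}
)
targets = pl.DataFrame(
    {"chrom": ["chr1", "chr1"], "start": [150, 420], "end": [300, 480]}
)

# Default output is a polars.LazyFrame; collect() materializes the result.
cov = pb.coverage(reads, targets)
print(cov.collect())

# "polars.DataFrame" and "pandas.DataFrame" outputs are also supported.
cov_pd = pb.coverage(reads, targets, output_type="pandas.DataFrame")
```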
4 changes: 4 additions & 0 deletions polars_bio/range_op_helpers.py
@@ -58,6 +58,10 @@ def range_operation(
merged_schema = pl.Schema(
{**_get_schema(df1, ctx, None, read_options1), **{"count": pl.Int32}}
)
elif range_options.range_op == RangeOp.Coverage:
merged_schema = pl.Schema(
{**_get_schema(df1, ctx, None, read_options1), **{"coverage": pl.Int32}}
)
else:
df_schema1 = _get_schema(df1, ctx, range_options.suffixes[0], read_options1)
df_schema2 = _get_schema(df2, ctx, range_options.suffixes[1], read_options2)
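The new schema branch implies the coverage result carries the input frame's columns plus an Int32 `coverage` column. A small sketch of inspecting that, assuming a recent Polars with `LazyFrame.collect_schema`:

```python
import polars as pl
import polars_bio as pb

a = pl.DataFrame({"chrom": ["chr1"], "start": [1], "end": [100]})
b = pl.DataFrame({"chrom": ["chr1"], "start": [50], "end": [150]})

schema = pb.coverage(a, b).collect_schema()
# Expected: the interval columns plus the added column, e.g.
# {"chrom": String, "start": Int64, "end": Int64, "coverage": Int32}
assert schema["coverage"] == pl.Int32
```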
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "maturin"

[project]
name = "polars-bio"
version = "0.7.4"
version = "0.8.0"
description = "Blazing fast genomic operations on large Python dataframes"
authors = []
requires-python = ">=3.9"
22 changes: 16 additions & 6 deletions src/operation.rs
@@ -80,11 +80,19 @@ pub(crate) fn do_range_operation(
left_table,
right_table,
)),
RangeOp::CountOverlapsNaive => rt.block_on(do_count_overlaps_naive(
RangeOp::CountOverlapsNaive => rt.block_on(do_count_overlaps_coverage_naive(
ctx,
range_options,
left_table,
right_table,
false,
)),
RangeOp::Coverage => rt.block_on(do_count_overlaps_coverage_naive(
ctx,
range_options,
left_table,
right_table,
true,
)),

_ => panic!("Unsupported operation"),
@@ -145,11 +153,12 @@ async fn do_count_overlaps(
ctx.sql(&query).await.unwrap()
}

async fn do_count_overlaps_naive(
async fn do_count_overlaps_coverage_naive(
ctx: &ExonSession,
range_opts: RangeOptions,
left_table: String,
right_table: String,
coverage: bool,
) -> datafusion::dataframe::DataFrame {
let columns_1 = range_opts.columns_1.unwrap();
let columns_2 = range_opts.columns_2.unwrap();
@@ -170,13 +179,14 @@ async fn do_count_overlaps_naive(
columns_1,
columns_2,
range_opts.filter_op.unwrap(),
false,
coverage,
);
session.deregister_table("count_overlaps").unwrap();
let table_name = "count_overlaps_coverage".to_string();
session.deregister_table(table_name.clone()).unwrap();
session
.register_table("count_overlaps", Arc::new(count_overlaps_provider))
.register_table(table_name.clone(), Arc::new(count_overlaps_provider))
.unwrap();
let query = "SELECT * FROM count_overlaps";
let query = format!("SELECT * FROM {}", table_name);
debug!("Query: {}", query);
ctx.sql(&query).await.unwrap()
}