2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "polars_bio"
version = "0.7.4"
version = "0.8.0"
edition = "2021"

[lib]
4 changes: 4 additions & 0 deletions README.md
@@ -31,13 +31,17 @@ It provides a DataFrame API for genomics data and is designed to be blazing fast

![count-overlaps-single.png](docs/assets/count-overlaps-single.png)

![coverage-single.png](docs/assets/coverage-single.png)

## Parallel performance 🏃‍🏃‍
![overlap-parallel.png](docs/assets/overlap-parallel.png)

![nearest-parallel.png](docs/assets/nearest-parallel.png)

![count-overlaps-parallel.png](docs/assets/count-overlaps-parallel.png)

![coverage-parallel.png](docs/assets/coverage-parallel.png)



Read the [documentation](https://biodatageeks.github.io/polars-bio/)
Binary file added docs/assets/coverage-parallel.png
Binary file added docs/assets/coverage-single.png
2 changes: 1 addition & 1 deletion docs/features.md
@@ -10,7 +10,7 @@
| cluster | :white_check_mark: | | :white_check_mark: | :white_check_mark: | | |
| [merge](api.md#polars_bio.merge) | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | :white_check_mark: |
| complement | :white_check_mark: | :construction: | | :white_check_mark: | :white_check_mark: | |
| coverage | :white_check_mark: | | :white_check_mark: | :white_check_mark: | | :white_check_mark: |
| [coverage](api.md#polars_bio.coverage) | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | :white_check_mark: |
| [expand](api.md#polars_bio.LazyFrame.expand) | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | :white_check_mark: |
| [sort](api.md#polars_bio.LazyFrame.sort_bedframe) | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | :white_check_mark: |
| [read_table](api.md#polars_bio.read_table) | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | :white_check_mark: |
36 changes: 14 additions & 22 deletions docs/notebooks/cookbook.ipynb
@@ -21,24 +21,16 @@
"id": "62a7b57c30bf54e2",
"metadata": {
"ExecuteTime": {
"end_time": "2025-03-05T16:41:54.268168Z",
"start_time": "2025-03-05T16:41:53.664194Z"
"end_time": "2025-03-07T10:02:23.527490Z",
"start_time": "2025-03-07T10:02:23.525921Z"
}
},
"source": [
"import polars_bio as pb\n",
"import polars as pl"
],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:polars_bio:Creating BioSessionContext\n"
]
}
],
"execution_count": 2
"outputs": [],
"execution_count": 8
},
{
"cell_type": "code",
@@ -646,34 +638,34 @@
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-02-28T11:52:12.169029Z",
"start_time": "2025-02-28T11:52:12.167384Z"
"end_time": "2025-03-07T10:02:16.509828Z",
"start_time": "2025-03-07T10:02:16.507744Z"
}
},
"cell_type": "code",
"source": "gcs_vcf_path = \"gs://genomics-public-data/platinum-genomes/vcf/NA12878_S1.genome.vcf\"",
"id": "31f0f3d0974245bd",
"outputs": [],
"execution_count": 16
"execution_count": 5
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-02-28T11:52:13.441345Z",
"start_time": "2025-02-28T11:52:13.439461Z"
"end_time": "2025-03-07T10:02:19.374881Z",
"start_time": "2025-03-07T10:02:19.372753Z"
}
},
"cell_type": "code",
"source": "info_fields=[\"AC\", \"AF\"]",
"id": "816c419b3b45ee44",
"outputs": [],
"execution_count": 17
"execution_count": 6
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-02-28T11:52:17.666747Z",
"start_time": "2025-02-28T11:52:16.365292Z"
"end_time": "2025-03-07T10:02:52.904221Z",
"start_time": "2025-03-07T10:02:41.364349Z"
}
},
"cell_type": "code",
@@ -713,12 +705,12 @@
"<small>shape: (3, 10)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>chrom</th><th>start</th><th>end</th><th>id</th><th>ref</th><th>alt</th><th>qual</th><th>filter</th><th>ac</th><th>af</th></tr><tr><td>str</td><td>u32</td><td>u32</td><td>str</td><td>str</td><td>str</td><td>f64</td><td>str</td><td>list[i32]</td><td>list[f32]</td></tr></thead><tbody><tr><td>&quot;chrM&quot;</td><td>1</td><td>1</td><td>&quot;&quot;</td><td>&quot;G&quot;</td><td>&quot;&quot;</td><td>0.0</td><td>&quot;PASS&quot;</td><td>null</td><td>null</td></tr><tr><td>&quot;chrM&quot;</td><td>2</td><td>72</td><td>&quot;&quot;</td><td>&quot;A&quot;</td><td>&quot;&quot;</td><td>0.0</td><td>&quot;PASS&quot;</td><td>null</td><td>null</td></tr><tr><td>&quot;chrM&quot;</td><td>73</td><td>73</td><td>&quot;&quot;</td><td>&quot;G&quot;</td><td>&quot;A&quot;</td><td>8752.780273</td><td>&quot;TruthSensitivityTranche99.90to…</td><td>[2]</td><td>[1.0]</td></tr></tbody></table></div>"
]
},
"execution_count": 18,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 18
"execution_count": 9
},
{
"metadata": {},
4 changes: 4 additions & 0 deletions docs/performance.md
@@ -7,12 +7,16 @@

![count-overlaps-single.png](assets/count-overlaps-single.png)

![coverage-single.png](assets/coverage-single.png)

## Parallel performance 🏃‍🏃‍
![overlap-parallel.png](assets/overlap-parallel.png)

![nearest-parallel.png](assets/nearest-parallel.png)

![count-overlaps-parallel.png](assets/count-overlaps-parallel.png)

![coverage-parallel.png](assets/coverage-parallel.png)
## Benchmarks 🧪
### Detailed results shortcuts 👨‍🔬
- [Binary operations](#binary-operations)
3 changes: 2 additions & 1 deletion polars_bio/__init__.py
@@ -14,7 +14,7 @@
sql,
)
from .polars_ext import PolarsRangesOperations as LazyFrame
from .range_op import FilterOp, count_overlaps, merge, nearest, overlap
from .range_op import FilterOp, count_overlaps, coverage, merge, nearest, overlap
from .range_viz import visualize_intervals

POLARS_BIO_MAX_THREADS = "datafusion.execution.target_partitions"
@@ -26,6 +26,7 @@
"nearest",
"merge",
"count_overlaps",
"coverage",
"ctx",
"FilterOp",
"visualize_intervals",
15 changes: 15 additions & 0 deletions polars_bio/polars_ext.py
@@ -222,3 +222,18 @@ def expand(
if midsk in schema:
df = df.drop(midsk)
return df

def coverage(
self,
other_df: pl.LazyFrame,
cols1=["chrom", "start", "end"],
cols2=["chrom", "start", "end"],
suffixes: tuple[str, str] = ("_1", "_2"),
) -> pl.LazyFrame:
"""
!!! note
Alias for [coverage](api.md#polars_bio.coverage)
"""
return pb.coverage(
self._ldf, other_df, cols1=cols1, cols2=cols2, suffixes=suffixes
)
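
The alias simply forwards to the top-level operation. A minimal sketch of calling it, assuming `PolarsRangesOperations` is registered as a Polars LazyFrame namespace elsewhere in `polars_ext.py` (the `pb` accessor name below is an assumption and is not shown in this diff):

```python
import polars as pl
import polars_bio as pb

a = pl.DataFrame({"chrom": ["chr1"], "start": [100], "end": [200]}).lazy()
b = pl.DataFrame({"chrom": ["chr1"], "start": [150], "end": [250]}).lazy()

# Assumed namespace accessor; per the diff, this forwards to pb.coverage(a, b).
a.pb.coverage(b).collect()
```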
61 changes: 60 additions & 1 deletion polars_bio/range_op.py
@@ -127,7 +127,7 @@ def nearest(
read_options: Union[ReadOptions, None] = None,
) -> Union[pl.LazyFrame, pl.DataFrame, pd.DataFrame]:
"""
Find pairs of overlapping genomic intervals.
Find pairs of closest genomic intervals.
Bioframe inspired API.

Parameters:
@@ -173,6 +173,65 @@
return range_operation(df1, df2, range_options, output_type, ctx, read_options)


def coverage(
df1: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
df2: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
overlap_filter: FilterOp = FilterOp.Strict,
suffixes: tuple[str, str] = ("_1", "_2"),
on_cols: Union[list[str], None] = None,
cols1: Union[list[str], None] = ["chrom", "start", "end"],
cols2: Union[list[str], None] = ["chrom", "start", "end"],
output_type: str = "polars.LazyFrame",
streaming: bool = False,
read_options: Union[ReadOptions, None] = None,
) -> Union[pl.LazyFrame, pl.DataFrame, pd.DataFrame]:
"""
Calculate intervals coverage.
Bioframe inspired API.

Parameters:
df1: Can be a path to a file, a polars DataFrame, a pandas DataFrame, or a registered table (see [register_vcf](api.md#polars_bio.register_vcf)). CSV with a header, BED and Parquet are supported.
df2: Can be a path to a file, a polars DataFrame, a pandas DataFrame, or a registered table. CSV with a header, BED and Parquet are supported.
overlap_filter: FilterOp, optional. The type of overlap to consider (Weak or Strict).
cols1: The names of columns containing the chromosome, start and end of the
genomic intervals, provided separately for each set.
cols2: The names of columns containing the chromosome, start and end of the
genomic intervals, provided separately for each set.
suffixes: Suffixes for the columns of the two overlapped sets.
on_cols: List of additional column names to join on. Default is None.
output_type: Type of the output. Default is "polars.LazyFrame"; "polars.DataFrame" and "pandas.DataFrame" are also supported.
streaming: **EXPERIMENTAL** If True, use Polars [streaming](features.md#streaming-out-of-core-processing) engine.
read_options: Additional options for reading the input files.


Returns:
**polars.LazyFrame**, polars.DataFrame, or pandas.DataFrame of the intervals with their coverage.

Note:
The default output format, i.e. [LazyFrame](https://docs.pola.rs/api/python/stable/reference/lazyframe/index.html), is recommended for large datasets as it supports output streaming and lazy evaluation.
This enables efficient processing of large datasets without loading the entire output dataset into memory.

Example:

Todo:
Support for on_cols.
"""

_validate_overlap_input(cols1, cols2, on_cols, suffixes, output_type, how="inner")

cols1 = DEFAULT_INTERVAL_COLUMNS if cols1 is None else cols1
cols2 = DEFAULT_INTERVAL_COLUMNS if cols2 is None else cols2
range_options = RangeOptions(
range_op=RangeOp.Coverage,
filter_op=overlap_filter,
suffixes=suffixes,
columns_1=cols1,
columns_2=cols2,
streaming=streaming,
)
return range_operation(df2, df1, range_options, output_type, ctx, read_options)


def count_overlaps(
df1: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
df2: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
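A hedged usage sketch of the new top-level API. The toy data below is made up; the default column names ("chrom", "start", "end") and the output types follow the docstring above:

```python
import polars as pl
import polars_bio as pb

# Hypothetical interval sets using the default column names.
reads = pl.DataFrame(
    {"chrom": ["chr1", "chr1"], "start": [100, 400], "end": [250, 500]}
)
targets = pl.DataFrame(
    {"chrom": ["chr1", "chr1"], "start": [150, 420], "end": [300, 480]}
)

# Default output is a polars.LazyFrame; collect() materializes the result.
cov = pb.coverage(reads, targets)
print(cov.collect())

# "polars.DataFrame" and "pandas.DataFrame" outputs are also supported.
cov_pd = pb.coverage(reads, targets, output_type="pandas.DataFrame")
```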
4 changes: 4 additions & 0 deletions polars_bio/range_op_helpers.py
@@ -58,6 +58,10 @@ def range_operation(
merged_schema = pl.Schema(
{**_get_schema(df1, ctx, None, read_options1), **{"count": pl.Int32}}
)
elif range_options.range_op == RangeOp.Coverage:
merged_schema = pl.Schema(
{**_get_schema(df1, ctx, None, read_options1), **{"coverage": pl.Int32}}
)
else:
df_schema1 = _get_schema(df1, ctx, range_options.suffixes[0], read_options1)
df_schema2 = _get_schema(df2, ctx, range_options.suffixes[1], read_options2)
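The new schema branch implies the coverage result carries the input frame's columns plus an Int32 `coverage` column. A small sketch of inspecting that, assuming a recent Polars with `LazyFrame.collect_schema`:

```python
import polars as pl
import polars_bio as pb

a = pl.DataFrame({"chrom": ["chr1"], "start": [1], "end": [100]})
b = pl.DataFrame({"chrom": ["chr1"], "start": [50], "end": [150]})

schema = pb.coverage(a, b).collect_schema()
# Expected: the interval columns plus the added column, e.g.
# {"chrom": String, "start": Int64, "end": Int64, "coverage": Int32}
assert schema["coverage"] == pl.Int32
```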
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "maturin"

[project]
name = "polars-bio"
version = "0.7.4"
version = "0.8.0"
description = "Blazing fast genomic operations on large Python dataframes"
authors = []
requires-python = ">=3.9"
22 changes: 16 additions & 6 deletions src/operation.rs
@@ -80,11 +80,19 @@ pub(crate) fn do_range_operation(
left_table,
right_table,
)),
RangeOp::CountOverlapsNaive => rt.block_on(do_count_overlaps_naive(
RangeOp::CountOverlapsNaive => rt.block_on(do_count_overlaps_coverage_naive(
ctx,
range_options,
left_table,
right_table,
false,
)),
RangeOp::Coverage => rt.block_on(do_count_overlaps_coverage_naive(
ctx,
range_options,
left_table,
right_table,
true,
)),

_ => panic!("Unsupported operation"),
@@ -145,11 +153,12 @@ async fn do_count_overlaps(
ctx.sql(&query).await.unwrap()
}

async fn do_count_overlaps_naive(
async fn do_count_overlaps_coverage_naive(
ctx: &ExonSession,
range_opts: RangeOptions,
left_table: String,
right_table: String,
coverage: bool,
) -> datafusion::dataframe::DataFrame {
let columns_1 = range_opts.columns_1.unwrap();
let columns_2 = range_opts.columns_2.unwrap();
@@ -170,13 +179,14 @@ async fn do_count_overlaps_naive(
columns_1,
columns_2,
range_opts.filter_op.unwrap(),
false,
coverage,
);
session.deregister_table("count_overlaps").unwrap();
let table_name = "count_overlaps_coverage".to_string();
session.deregister_table(table_name.clone()).unwrap();
session
.register_table("count_overlaps", Arc::new(count_overlaps_provider))
.register_table(table_name.clone(), Arc::new(count_overlaps_provider))
.unwrap();
let query = "SELECT * FROM count_overlaps";
let query = format!("SELECT * FROM {}", table_name);
debug!("Query: {}", query);
ctx.sql(&query).await.unwrap()
}