Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,4 @@ futures = "0.3.31"
coitrees = "0.4.0"
fnv = "1.0.7"
rand = "0.8.5"
rayon = "1.10.0"
2 changes: 1 addition & 1 deletion docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ shape: (1, 8)

#### 6 threads
```python
pb.register_vcf("/tmp/gnomad.v4.1.sv.sites.vcf.gz", "gnomad_site_local", thread_num=4)
pb.register_vcf("/tmp/gnomad.v4.1.sv.sites.vcf.gz", "gnomad_site_local", thread_num=6)
pb.sql("select * from gnomad_site_local").collect().count()
```

Expand Down
379 changes: 379 additions & 0 deletions docs/notebooks/raport.ipynb

Large diffs are not rendered by default.

305 changes: 39 additions & 266 deletions docs/notebooks/tutorial.ipynb

Large diffs are not rendered by default.

14 changes: 13 additions & 1 deletion polars_bio/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,16 +36,25 @@
coverage = IntervalOperations.coverage
merge = IntervalOperations.merge

from . qc_op import QCOperations

sequence_quality_score = QCOperations.sequence_quality_score


from .range_utils import Utils
from .qc_utils import QCUtils

vizualize_intervals = Utils.visualize_intervals
visualize_intervals = Utils.visualize_intervals
visualize_mean_quality = QCUtils.visualize_mean_quality
visualize_mean_quality_histogram = QCUtils.visualize_mean_quality_histogram

from .io import IOOperations as data_input
from .polars_ext import PolarsRangesOperations as LazyFrame
from .range_op import FilterOp
from .range_op import IntervalOperations as range_operations
from .range_utils import Utils as utils
from .sql import SQL as data_processing
from .qc_op import QCOperations as qc_operations

POLARS_BIO_MAX_THREADS = "datafusion.execution.target_partitions"

Expand All @@ -57,6 +66,7 @@
"InputFormat",
"data_processing",
"range_operations",
"qc_operations",
# "LazyFrame",
"data_input",
"utils",
Expand All @@ -65,3 +75,5 @@
"ObjectStorageOptions",
"set_option",
]

print("Loaded polars_bio")
2 changes: 1 addition & 1 deletion polars_bio/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import polars as pl
from bioframe import SCHEMAS
from datafusion import DataFrame, SessionContext
from jupyterlab.utils import deprecated
# from jupyterlab.utils import deprecated
from polars.io.plugins import register_io_source
from tqdm.auto import tqdm

Expand Down
110 changes: 110 additions & 0 deletions polars_bio/qc_op.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
from __future__ import annotations

import datafusion
import pandas as pd
import polars as pl
from datafusion import col, literal
from typing_extensions import TYPE_CHECKING, Union

from polars_bio.polars_bio import QCOptions, ReadOptions

from .constants import DEFAULT_INTERVAL_COLUMNS
from .context import ctx
from .interval_op_helpers import (
convert_result,
get_py_ctx,
prevent_column_collision,
read_df_to_datafusion,
)
from .logging import logger
from .qc_op_helpers import _validate_sequence_quality_score_input, qc_operation



if TYPE_CHECKING:
pass
from polars_bio.polars_bio import QCOp


class QCOperations:
    """Quality-control operations computed from per-read quality strings.

    Both public operations build the same ``QCOptions`` (quality column
    ``"quality_scores"``, output column ``"mean_q"``, ASCII offset 33) and
    hand it to ``qc_operation``; they differ only in the requested ``QCOp``
    variant.
    """

    @staticmethod
    def _run_quality_op(
        df: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
        qc_op: QCOp,
        output_type: str,
        read_options: Union[ReadOptions, None],
    ) -> Union[pl.LazyFrame, pl.DataFrame, pd.DataFrame, datafusion.DataFrame]:
        """Validate ``output_type``, build the shared ``QCOptions`` and dispatch."""
        _validate_sequence_quality_score_input(output_type)

        qc_options = QCOptions(
            qc_op=qc_op,
            quality_col="quality_scores",
            output_col="mean_q",
            # NOTE(review): 33 is presumably the Phred+33 encoding offset -- confirm.
            ascii_offset=33,
            # Streaming is not wired through yet, so it is always disabled here.
            streaming=False,
        )

        return qc_operation(df, qc_options, output_type, ctx, read_options)

    @staticmethod
    def sequence_quality_score(
        df: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
        output_type: str = "polars.LazyFrame",
        streaming: bool = False,
        read_options: Union[ReadOptions, None] = None,
    ) -> Union[pl.LazyFrame, pl.DataFrame, pd.DataFrame, datafusion.DataFrame]:
        """
        Calculate the sequence quality score as a mean value.

        Parameters:
            df: A path to a file (string), a polars DataFrame/LazyFrame or a pandas DataFrame. The quality strings are read from the `quality_scores` column. Supported file formats depend on the native reader (TODO confirm).
            output_type: Type of the output. Default is "polars.LazyFrame"; "polars.DataFrame", "pandas.DataFrame" and "datafusion.DataFrame" are also accepted by the validation step.
            streaming: **EXPERIMENTAL** Currently not forwarded: the options are always built with `streaming=False`, so this argument has no effect.
            read_options: Optional read options, passed unchanged to `qc_operation`.

        Returns:
            The result of the `QCOp.MeanQuality` operation, converted to the requested `output_type`. The requested output column name is `mean_q`.
        """
        return QCOperations._run_quality_op(
            df, QCOp.MeanQuality, output_type, read_options
        )

    @staticmethod
    def sequence_quality_score_histogram(
        df: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
        output_type: str = "polars.LazyFrame",
        streaming: bool = False,
        read_options: Union[ReadOptions, None] = None,
    ) -> Union[pl.LazyFrame, pl.DataFrame, pd.DataFrame, datafusion.DataFrame]:
        """
        Calculate the sequence quality score histogram.

        Parameters:
            df: A path to a file (string), a polars DataFrame/LazyFrame or a pandas DataFrame. The quality strings are read from the `quality_scores` column. Supported file formats depend on the native reader (TODO confirm).
            output_type: Type of the output. Default is "polars.LazyFrame"; "polars.DataFrame", "pandas.DataFrame" and "datafusion.DataFrame" are also accepted by the validation step.
            streaming: **EXPERIMENTAL** Currently not forwarded: the options are always built with `streaming=False`, so this argument has no effect.
            read_options: Optional read options, passed unchanged to `qc_operation`.

        Returns:
            The result of the `QCOp.MeanQualityHistogram` operation, converted to the requested `output_type`.
        """
        return QCOperations._run_quality_op(
            df, QCOp.MeanQualityHistogram, output_type, read_options
        )
69 changes: 69 additions & 0 deletions polars_bio/qc_op_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
from pathlib import Path
from typing import Union, Optional

import pandas as pd
import polars as pl

from polars_bio.polars_bio import (
BioSessionContext,
QCOptions,
ReadOptions,
qc_operation_scan,
QCOp,
qc_operation_frame,
)

from .constants import TMP_CATALOG_DIR
from .logging import logger
from .range_op_io import _df_to_reader, _get_schema, _rename_columns, range_lazy_scan


def _validate_sequence_quality_score_input( output_type):
assert output_type in [
"polars.LazyFrame",
"polars.DataFrame",
"pandas.DataFrame",
"datafusion.DataFrame",
], "Only polars.LazyFrame, polars.DataFrame, and pandas.DataFrame are supported"

def apply_mean_quality(
    df: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
    qc_options: QCOptions,
    output_type: str,
    ctx: BioSessionContext,
    read_options: Optional[ReadOptions] = None,
) -> Union[pl.LazyFrame, pl.DataFrame, pd.DataFrame]:
    """Run a mean-quality QC operation and convert the result.

    Parameters:
        df: A file path (string) or an in-memory frame. Strings go through
            ``qc_operation_scan``; everything else is converted with
            ``_df_to_reader`` and goes through ``qc_operation_frame``.
        qc_options: Options forwarded to the native QC operation.
        output_type: Requested output container; validated by
            ``_validate_sequence_quality_score_input``.
        ctx: Session context; ``sync_options()`` is called on it first.
        read_options: Optional read options, used only for the file-path branch.

    Returns:
        The result as a polars DataFrame, pandas DataFrame or polars
        LazyFrame; for "datafusion.DataFrame" the native result object is
        returned unconverted.

    Raises:
        ValueError: If ``output_type`` is not handled (only reachable if the
            validation step accepts a name this function does not convert).
    """
    ctx.sync_options()
    _validate_sequence_quality_score_input(output_type=output_type)

    if isinstance(df, str):
        # Files on disk are scanned by the native reader.
        result = qc_operation_scan(ctx, df, qc_options, read_options)
    else:
        # In-memory DataFrames are handed over as a reader.
        quality_col = qc_options.quality_col
        # The column name is constructed as a plain string by the callers in
        # this package; indexing a string with [0] would yield only its first
        # character. A sequence of names is still tolerated for compatibility.
        if not isinstance(quality_col, str):
            quality_col = quality_col[0]
        df_reader = _df_to_reader(df, quality_col)
        result = qc_operation_frame(ctx, df_reader, qc_options)

    if output_type == "polars.DataFrame":
        return result.to_polars()
    if output_type == "pandas.DataFrame":
        return result.to_pandas()
    if output_type == "polars.LazyFrame":
        # The result is materialized first and then wrapped lazily;
        # LazyFrame has no ``from_dataframe`` constructor.
        return result.to_polars().lazy()
    if output_type == "datafusion.DataFrame":
        # NOTE(review): assumes the native result already is the DataFusion
        # DataFrame wrapper (it exposes to_polars/to_pandas) -- confirm.
        return result
    raise ValueError("Unsupported output_type")

def qc_operation(
    df: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
    qc_options: QCOptions,
    output_type: str,
    ctx: BioSessionContext,
    read_options: Union[ReadOptions, None] = None,
) -> Union[pl.LazyFrame, pl.DataFrame, pd.DataFrame]:
    """Dispatch a QC request to the implementation matching ``qc_options.qc_op``.

    Parameters:
        df: A file path (string) or an in-memory frame, forwarded unchanged.
        qc_options: Options whose ``qc_op`` selects the operation.
        output_type: Requested output container, forwarded unchanged.
        ctx: Session context, forwarded unchanged.
        read_options: Optional read options, forwarded unchanged.

    Returns:
        Whatever ``apply_mean_quality`` returns for the mean-quality and
        mean-quality-histogram operations.

    Raises:
        NotImplementedError: For any other ``qc_op`` value.
    """
    if qc_options.qc_op in (QCOp.MeanQuality, QCOp.MeanQualityHistogram):
        return apply_mean_quality(df, qc_options, output_type, ctx, read_options)

    # Report the attribute that was actually inspected above; reading a
    # different attribute here would mask this error with an AttributeError.
    raise NotImplementedError(f"Unsupported method: {qc_options.qc_op}")

66 changes: 66 additions & 0 deletions polars_bio/qc_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
from typing import Union

import bioframe as bf
import pandas as pd
import polars as pl
from matplotlib import pyplot as plt


class QCUtils:
    """Matplotlib helpers for plotting mean-quality results."""

    @staticmethod
    def _as_pandas(df: Union[pd.DataFrame, pl.DataFrame]) -> pd.DataFrame:
        """Return ``df`` as a pandas DataFrame, converting polars input."""
        assert isinstance(
            df, (pd.DataFrame, pl.DataFrame)
        ), "df must be a Pandas or Polars DataFrame"
        return df if isinstance(df, pd.DataFrame) else df.to_pandas()

    @staticmethod
    def visualize_mean_quality(
        df: Union[pd.DataFrame, pl.DataFrame], label: str = "mean quality scores"
    ) -> None:
        """
        Visualize mean quality scores

        Parameters:
            df: Pandas DataFrame or Polars DataFrame. The DataFrame containing mean quality scores in a `mean_c` or `mean_q` column (`mean_c` is used when both are present).
            label: Text used both as the plot title and as the legend entry of the plotted line.

        Raises:
            AssertionError: If `df` is neither a pandas nor a polars DataFrame.
            ValueError: If neither score column is present.
        """
        df = QCUtils._as_pandas(df)

        # "mean_q" is the output column name requested by the QC operations
        # in this package; "mean_c" is the name this helper originally
        # required and is checked first so existing callers are unaffected.
        score_col = next(
            (name for name in ("mean_c", "mean_q") if name in df.columns), None
        )
        if score_col is None:
            raise ValueError("DataFrame must contain a 'mean_c' or 'mean_q' column")

        plt.figure(figsize=(10, 4))
        plt.plot(df[score_col], label=label)
        plt.title(label)
        plt.xlabel("Read index")
        plt.ylabel("Mean quality score")
        plt.grid(True)
        plt.legend()
        plt.tight_layout()
        plt.show()

    @staticmethod
    def visualize_mean_quality_histogram(
        df: Union[pd.DataFrame, pl.DataFrame],
        label: str = "mean quality scores histogram",
    ) -> None:
        """
        Visualize mean quality scores histogram

        Parameters:
            df: Pandas DataFrame or Polars DataFrame. The DataFrame containing the histogram in `bin_start` and `count` columns.
            label: Text used as the plot title.

        Raises:
            AssertionError: If `df` is neither a pandas nor a polars DataFrame.
            ValueError: If `bin_start` or `count` is missing.
        """
        df = QCUtils._as_pandas(df)

        if not {"bin_start", "count"}.issubset(df.columns):
            raise ValueError("DataFrame must contain 'bin_start' and 'count' columns")

        plt.figure(figsize=(10, 4))
        # One bar per row: centered on bin_start, one unit wide, height = count.
        plt.bar(df["bin_start"], df["count"], width=1.0, align="center")
        plt.title(label)
        plt.xlabel("Mean quality score (binned)")
        plt.ylabel("Count")
        plt.grid(axis="y")
        plt.tight_layout()
        plt.show()
Loading