Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
9fd6258
add merge function
Jan 28, 2025
6604486
add placeholder suffix
Jan 28, 2025
38907b9
fix test typo
Jan 28, 2025
6c01427
add merge test file
Jan 28, 2025
a630c1c
add missing import
Jan 29, 2025
2e9c7fe
allow reading parquet directory
Jan 31, 2025
5e7728c
fix type difference
Feb 8, 2025
75bf6ef
change start_end casting position
Feb 8, 2025
5b32fdb
fix input.csv so that it does not contain start==end (assumption of t…
Feb 8, 2025
d08bf0c
fix expected merge
Feb 8, 2025
df4e356
merge operation changes
Feb 8, 2025
791371d
cast final merge result to original types
Feb 8, 2025
fb40f74
fix placement of reading schema for the merge function
Feb 8, 2025
cb375c4
add workaround for bioframe type changes
Feb 8, 2025
e1367d1
add cluster function
Feb 9, 2025
531a67c
fix expected table types
Feb 9, 2025
0254864
add coverage and support returning datafusion dataframes
Feb 11, 2025
3d00948
Merge branch 'master' into coverage
zkeram Feb 11, 2025
1197f10
fix polars_ext closing parentheses
Feb 11, 2025
75ecb9f
remove unnecessary if
Feb 11, 2025
e0421e7
close delimeter
Feb 11, 2025
e46eab0
assure that if coverage's on_cols=None, then the underlying merge's o…
Feb 11, 2025
462fcce
add datafusion.DataFrame as accepted input
Feb 11, 2025
a5c4ebf
fix typo
Feb 11, 2025
aad0432
fix typo: pandas.LazyFrame -> polars.LazyFrame
Feb 11, 2025
13a0e14
fix test column suffixes
Feb 11, 2025
253ab61
Merge branch 'master' into coverage
zkeram Mar 2, 2025
bcc8e09
fix count_overlaps input
Mar 2, 2025
615d927
Merge branch 'master' into coverage
zkeram Mar 10, 2025
2ef2665
fix: coverage does not support overlap_filter
Mar 10, 2025
ddd09e7
TestCoverageNative to ignore types
Mar 10, 2025
f8cd0cb
ignore types in testing for count_overlaps
Mar 10, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion polars_bio/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
sql,
)
from .polars_ext import PolarsRangesOperations as LazyFrame
from .range_op import FilterOp, count_overlaps, coverage, merge, nearest, overlap
from .range_op import FilterOp, cluster, count_overlaps, coverage, merge, nearest, overlap
from .range_viz import visualize_intervals

POLARS_BIO_MAX_THREADS = "datafusion.execution.target_partitions"
Expand All @@ -24,7 +24,9 @@
__all__ = [
"overlap",
"nearest",
"coverage",
"merge",
"cluster",
"count_overlaps",
"coverage",
"ctx",
Expand Down
20 changes: 14 additions & 6 deletions polars_bio/interval_op_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,17 @@ def get_py_ctx() -> datafusion.context.SessionContext:

def read_df_to_datafusion(
py_ctx: datafusion.context.SessionContext,
df: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
) -> datafusion.dataframe:
df: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame, datafusion.dataframe.DataFrame]
) -> datafusion.dataframe.DataFrame:

if isinstance(df, pl.DataFrame):
return py_ctx.from_polars(df)
elif isinstance(df, pd.DataFrame):
return py_ctx.from_pandas(df)
elif isinstance(df, pl.LazyFrame):
return py_ctx.from_polars(df.collect())
elif isinstance(df, datafusion.dataframe.DataFrame):
return df
elif isinstance(df, str):
ext = Path(df).suffix
if ext == ".csv":
Expand All @@ -46,8 +49,9 @@ def read_df_to_datafusion(
return py_ctx.read_parquet(df)
raise ValueError("Invalid `df` argument.")


def df_to_lazyframe(df: datafusion.DataFrame) -> pl.LazyFrame:
def df_to_lazyframe(
df: datafusion.dataframe.DataFrame
) -> pl.LazyFrame:
# TODO: make it actually lazy
"""
def _get_lazy(
Expand All @@ -63,8 +67,10 @@ def _get_lazy(


def convert_result(
df: datafusion.DataFrame, output_type: str, streaming: bool
) -> Union[pl.LazyFrame, pl.DataFrame, pd.DataFrame]:
df: datafusion.dataframe.DataFrame,
output_type: str,
streaming: bool
) -> Union[pl.LazyFrame, pl.DataFrame, pd.DataFrame, datafusion.dataframe.DataFrame]:
# TODO: implement streaming
if streaming:
# raise NotImplementedError("streaming is not implemented")
Expand All @@ -75,4 +81,6 @@ def convert_result(
return df.to_pandas()
elif output_type == "polars.LazyFrame":
return df_to_lazyframe(df)
elif output_type == "datafusion.DataFrame":
return df
raise ValueError("Invalid `output_type` argument")
53 changes: 53 additions & 0 deletions polars_bio/polars_ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,59 @@ def nearest(
cols1=cols1,
cols2=cols2,
)

def coverage(
    self,
    other_df: pl.LazyFrame,
    suffixes: tuple[str, str] = ("", "_"),
    cols1=["chrom", "start", "end"],
    cols2=["chrom", "start", "end"],
) -> pl.LazyFrame:
    """
    Delegate to `pb.coverage`, using the wrapped LazyFrame as the first input.

    `other_df` is passed as the second positional argument; `suffixes`,
    `cols1` and `cols2` are forwarded unchanged as keyword arguments.

    !!! note
        Alias for [coverage](api.md#polars_bio.coverage)
    """
    # Collect the pass-through options once so the call site stays short.
    forwarded = {"suffixes": suffixes, "cols1": cols1, "cols2": cols2}
    return pb.coverage(self._ldf, other_df, **forwarded)

def merge(
    self,
    overlap_filter: FilterOp = FilterOp.Strict,
    min_dist: float = 0,
    cols: Union[list[str], None] = None,
) -> pl.LazyFrame:
    """
    Delegate to `pb.merge`, using the wrapped LazyFrame as the input.

    `overlap_filter`, `min_dist` and `cols` are forwarded unchanged as
    keyword arguments.

    !!! note
        Alias for [merge](api.md#polars_bio.merge)
    """
    # The accessor only supplies the frame; all options pass straight through.
    frame = self._ldf
    forwarded = {
        "overlap_filter": overlap_filter,
        "min_dist": min_dist,
        "cols": cols,
    }
    return pb.merge(frame, **forwarded)

def cluster(
    self,
    overlap_filter: FilterOp = FilterOp.Strict,
    min_dist: float = 0,
    cols: Union[list[str], None] = None,
) -> pl.LazyFrame:
    """
    Delegate to `pb.cluster`, using the wrapped LazyFrame as the input.

    `overlap_filter`, `min_dist` and `cols` are forwarded unchanged as
    keyword arguments.

    !!! note
        Alias for [cluster](api.md#polars_bio.cluster)
    """
    # The accessor only supplies the frame; all options pass straight through.
    frame = self._ldf
    forwarded = {
        "overlap_filter": overlap_filter,
        "min_dist": min_dist,
        "cols": cols,
    }
    return pb.cluster(frame, **forwarded)

def count_overlaps(
self,
Expand Down
Loading