Skip to content

Commit cace1a2

Browse files
authored
Merge pull request #1 from bscisel/dev-jwinter
Dev jwinter
2 parents 19a650b + 5113968 commit cace1a2

17 files changed

+2683
-5
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,4 +15,5 @@ benchmark/bin/env.sh
1515
benchmark/src/results
1616
benchmark/src/results/overlap
1717
mprofile*dat
18-
*csv
18+
*csv
19+
!docs/notebooks/example.csv

Cargo.lock

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

docs/notebooks/base_sequence_quality.ipynb

Lines changed: 493 additions & 0 deletions
Large diffs are not rendered by default.

docs/notebooks/example.csv

Lines changed: 201 additions & 0 deletions
Large diffs are not rendered by default.

docs/notebooks/example.fastq

Lines changed: 800 additions & 0 deletions
Large diffs are not rendered by default.

docs/notebooks/example.parquet

19.9 KB
Binary file not shown.

docs/notebooks/report.html

Lines changed: 250 additions & 0 deletions
Large diffs are not rendered by default.

polars_bio/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from polars_bio.polars_bio import InputFormat, ReadOptions, VcfReadOptions
22

3+
from .base_sequnce_quality_vis import visualize_base_sequence_quality
34
from .context import ctx, set_option
45
from .io import (
56
describe_vcf,
@@ -16,6 +17,7 @@
1617
from .polars_ext import PolarsRangesOperations as LazyFrame
1718
from .range_op import FilterOp, count_overlaps, coverage, merge, nearest, overlap
1819
from .range_viz import visualize_intervals
20+
from .quality_stats import base_sequence_quality
1921

2022
POLARS_BIO_MAX_THREADS = "datafusion.execution.target_partitions"
2123

@@ -29,6 +31,7 @@
2931
"coverage",
3032
"ctx",
3133
"FilterOp",
34+
"visualize_base_sequence_quality",
3235
"visualize_intervals",
3336
"read_bam",
3437
"read_vcf",
@@ -45,4 +48,5 @@
4548
"ReadOptions",
4649
"VcfReadOptions",
4750
"set_option",
51+
"base_sequence_quality",
4852
]
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
from typing import Union
2+
3+
import pandas as pd
4+
import polars as pl
5+
from matplotlib import pyplot as plt
6+
7+
8+
def visualize_base_sequence_quality(df: Union[pd.DataFrame, pl.DataFrame]) -> None:
9+
"""
10+
Visualize the overlapping intervals.
11+
12+
Parameters:
13+
df: Pandas DataFrame or Polars DataFrame. The DataFrame containing the base sequence quality results
14+
"""
15+
assert isinstance(
16+
df, (pd.DataFrame, pl.DataFrame)
17+
), "df must be a Pandas or Polars DataFrame"
18+
df = df if isinstance(df, pd.DataFrame) else df.to_pandas()
19+
df = df.sort_values(by="pos")
20+
21+
boxes = [
22+
{
23+
"label": int(row["pos"]),
24+
"whislo": row["lower"],
25+
"q1": row["q1"],
26+
"med": row["median"],
27+
"q3": row["q3"],
28+
"whishi": row["upper"],
29+
}
30+
for _, row in df.iterrows()
31+
]
32+
33+
fig, ax = plt.subplots()
34+
fig.set_size_inches(15, 5)
35+
36+
plot = ax.plot(df["pos"] + 1, df["avg"])
37+
box_plot = ax.bxp(boxes, showfliers=False)
38+
39+
ax.set_title("base sequence quality")
40+
ax.set_ylabel("Phred score")
41+
ax.set_xlabel("Position in read (bp)")
42+
43+
ax.legend(
44+
[plot[0], box_plot["medians"][0]],
45+
["Average of phred score", "Median of phred score"],
46+
)
47+
48+
for label in ax.get_xticklabels():
49+
label.set_fontsize(6)
50+
51+
plt.show()

polars_bio/quality_stats.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
from pathlib import Path
2+
from typing import Union
3+
import datafusion
4+
import polars as pl
5+
import pandas as pd
6+
import pyarrow as pa
7+
from .context import ctx
8+
from polars_bio.polars_bio import (
9+
base_sequance_quality_scan,
10+
base_sequance_quality_frame,
11+
)
12+
13+
14+
def base_sequence_quality(
15+
df: Union[str, Path, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
16+
quality_scores_column: str = "quality_scores",
17+
output_type: str = "polars.DataFrame",
18+
target_partitions: int = 8,
19+
) -> Union[pl.DataFrame, pd.DataFrame]:
20+
"""
21+
Compute base sequence quality statistics from various dataframe/file types.
22+
23+
Args:
24+
df: Input data as a file path or dataframe.
25+
quality_scores_column: Name of the column with quality scores.
26+
output_type: Output type, either "polars.DataFrame" or "pandas.DataFrame".
27+
28+
Returns:
29+
DataFrame with base sequence quality statistics.
30+
"""
31+
ctx.set_option(
32+
"datafusion.execution.target_partitions", str(target_partitions), False
33+
)
34+
35+
if isinstance(df, (str, Path)):
36+
df = str(df)
37+
supported_exts = {".parquet", ".csv", ".bed", ".vcf", ".fastq"}
38+
ext = set(Path(df).suffixes)
39+
if not (supported_exts & ext or not ext):
40+
raise ValueError(
41+
"Input file must be a Parquet, CSV, BED, VCF, or FASTQ file."
42+
)
43+
result: datafusion.DataFrame = base_sequance_quality_scan(
44+
ctx, df, quality_scores_column
45+
)
46+
else:
47+
if isinstance(df, pl.LazyFrame):
48+
arrow_table = df.collect().to_arrow()
49+
elif isinstance(df, pl.DataFrame):
50+
arrow_table = df.to_arrow()
51+
elif isinstance(df, pd.DataFrame):
52+
arrow_table = pa.Table.from_pandas(df)
53+
else:
54+
raise TypeError("Unsupported dataframe type.")
55+
result: datafusion.DataFrame = base_sequance_quality_frame(
56+
ctx, arrow_table, quality_scores_column
57+
)
58+
59+
if output_type == "polars.DataFrame":
60+
return result.to_polars()
61+
elif output_type == "pandas.DataFrame":
62+
return result.to_pandas()
63+
else:
64+
raise ValueError("output_type must be 'polars.DataFrame' or 'pandas.DataFrame'")

0 commit comments

Comments
 (0)