Skip to content

Commit 66d97e0

Browse files
committed
Move target partition outside base sequence quality function
1 parent cace1a2 commit 66d97e0

File tree

2 files changed

+39
-24
lines changed

2 files changed

+39
-24
lines changed

docs/notebooks/base_sequence_quality.ipynb

Lines changed: 39 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,20 @@
1010
},
1111
{
1212
"cell_type": "code",
13-
"execution_count": 17,
13+
"execution_count": 1,
1414
"id": "58b40aa6",
1515
"metadata": {},
16-
"outputs": [],
16+
"outputs": [
17+
{
18+
"name": "stderr",
19+
"output_type": "stream",
20+
"text": [
21+
"/home/jwinter/TBD/proj2/polars-bio/venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
22+
" from .autonotebook import tqdm as notebook_tqdm\n",
23+
"INFO:polars_bio:Creating BioSessionContext\n"
24+
]
25+
}
26+
],
1727
"source": [
1828
"import pandas as pd\n",
1929
"\n",
@@ -28,6 +38,16 @@
2838
"### Usage examples"
2939
]
3040
},
41+
{
42+
"cell_type": "code",
43+
"execution_count": 14,
44+
"id": "b0d81403",
45+
"metadata": {},
46+
"outputs": [],
47+
"source": [
48+
"pb.set_option(\"datafusion.execution.target_partitions\", \"2\")"
49+
]
50+
},
3151
{
3252
"cell_type": "markdown",
3353
"id": "b238193d",
@@ -38,33 +58,33 @@
3858
},
3959
{
4060
"cell_type": "code",
41-
"execution_count": 18,
61+
"execution_count": null,
4262
"id": "0420c240",
4363
"metadata": {},
4464
"outputs": [
4565
{
4666
"name": "stdout",
4767
"output_type": "stream",
4868
"text": [
49-
" pos avg q1 median q3 lower upper\n",
50-
"87 0 30.135 31.0 33.0 34.0 26.5 38.5\n",
51-
"66 1 31.210 31.0 34.0 34.0 26.5 38.5\n",
52-
"69 2 32.015 31.0 34.0 34.0 26.5 38.5\n",
53-
"45 3 35.690 35.0 37.0 37.0 32.0 40.0\n",
54-
"14 4 35.680 35.0 37.0 37.0 32.0 40.0\n",
55-
".. ... ... ... ... ... ... ...\n",
56-
"40 96 31.315 32.0 34.0 35.0 27.5 39.5\n",
57-
"23 97 30.670 31.0 34.0 35.0 25.0 41.0\n",
58-
"37 98 31.550 32.0 34.0 35.0 27.5 39.5\n",
59-
"6 99 31.250 32.0 34.0 35.0 27.5 39.5\n",
60-
"4 100 31.105 31.0 34.0 35.0 25.0 41.0\n",
69+
" pos avg q1 median q3 lower upper\n",
70+
"88 0 32.548723 31.0 34.0 34.0 26.5 38.5\n",
71+
"46 1 32.719772 31.0 34.0 34.0 26.5 38.5\n",
72+
"99 2 32.789697 31.0 34.0 34.0 26.5 38.5\n",
73+
"75 3 36.162011 37.0 37.0 37.0 37.0 37.0\n",
74+
"84 4 36.122733 37.0 37.0 37.0 37.0 37.0\n",
75+
".. ... ... ... ... ... ... ...\n",
76+
"19 96 32.998462 34.0 35.0 35.0 32.5 36.5\n",
77+
"64 97 32.922582 33.0 35.0 35.0 30.0 38.0\n",
78+
"70 98 32.883908 33.0 35.0 35.0 30.0 38.0\n",
79+
"80 99 32.836223 33.0 35.0 35.0 30.0 38.0\n",
80+
"51 100 31.190304 30.0 34.0 35.0 22.5 42.5\n",
6181
"\n",
6282
"[101 rows x 7 columns]\n"
6383
]
6484
}
6585
],
6686
"source": [
67-
"result = pb.base_sequence_quality(\"example.fastq\", output_type=\"pandas.DataFrame\", target_partitions=2).sort_values(by=\"pos\")\n",
87+
"result = pb.base_sequence_quality(\"example.fastq\", output_type=\"pandas.DataFrame\").sort_values(by=\"pos\")\n",
6888
"print(result)"
6989
]
7090
},
@@ -107,9 +127,9 @@
107127
"| -------------- | ----------------- | ----- |\n",
108128
"| fastqc-rs | - | 22.9s |\n",
109129
"| polars_bio | 1 | 9.0s |\n",
110-
"| polars_bio | 2 | 8.5s |\n",
111-
"| polars_bio | 4 | 15.6s |\n",
112-
"| polars_bio | 8 | 7.8s |\n",
130+
"| polars_bio | 2 | 7.8s |\n",
131+
"| polars_bio | 4 | 14.9s |\n",
132+
"| polars_bio | 8 | 7.4s |\n",
113133
"\n",
114134
"- Execution times were measured by running the algorithm on the file ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR194/ERR194147/ERR194147.fastq.gz, which contains about 8,240,000 records.\n",
115135
"- The `fastqc-rs` execution time covers only the base sequence quality task (code for its other tasks was removed for this comparison).\n"

polars_bio/quality_stats.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@ def base_sequence_quality(
1515
df: Union[str, Path, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
1616
quality_scores_column: str = "quality_scores",
1717
output_type: str = "polars.DataFrame",
18-
target_partitions: int = 8,
1918
) -> Union[pl.DataFrame, pd.DataFrame]:
2019
"""
2120
Compute base sequence quality statistics from various dataframe/file types.
@@ -28,10 +27,6 @@ def base_sequence_quality(
2827
Returns:
2928
DataFrame with base sequence quality statistics.
3029
"""
31-
ctx.set_option(
32-
"datafusion.execution.target_partitions", str(target_partitions), False
33-
)
34-
3530
if isinstance(df, (str, Path)):
3631
df = str(df)
3732
supported_exts = {".parquet", ".csv", ".bed", ".vcf", ".fastq"}

0 commit comments

Comments
 (0)