Skip to content

Commit 9e67e06

Browse files
authored
Fixing inconsistent suffixes handling in overlaps (#237)
1 parent 7e2908b commit 9e67e06

File tree

4 files changed

+277
-102
lines changed

4 files changed

+277
-102
lines changed

Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

polars_bio/range_op_helpers.py

Lines changed: 30 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,28 @@
2323
pd = None
2424

2525

26+
def _generate_overlap_schema(
27+
df1_schema: pl.Schema,
28+
df2_schema: pl.Schema,
29+
range_options: RangeOptions,
30+
) -> pl.Schema:
31+
"""Generate schema for overlap operations with correct suffix handling."""
32+
coord_cols = set(range_options.columns_1 + range_options.columns_2)
33+
merged_schema_dict = {}
34+
35+
# Add df1 columns with appropriate suffixes
36+
for col_name, col_type in df1_schema.items():
37+
# All df1 columns get suffix _1
38+
merged_schema_dict[f"{col_name}{range_options.suffixes[0]}"] = col_type
39+
40+
# Add df2 columns with appropriate suffixes
41+
for col_name, col_type in df2_schema.items():
42+
# All df2 columns get suffix _2
43+
merged_schema_dict[f"{col_name}{range_options.suffixes[1]}"] = col_type
44+
45+
return pl.Schema(merged_schema_dict)
46+
47+
2648
def _lazyframe_to_parquet(
2749
df: Union[pl.LazyFrame, "GffLazyFrameWrapper"], ctx: BioSessionContext
2850
) -> str:
@@ -129,39 +151,10 @@ def range_operation(
129151
df_schema1_base = _get_schema(df1, ctx, None, read_options1)
130152
df_schema2_base = _get_schema(df2, ctx, None, read_options2)
131153

132-
# Generate the correct schema based on actual DataFusion behavior
133-
# Coordinate columns get correct suffixes, non-coordinate columns get swapped suffixes
134-
coord_cols = set(range_options.columns_1 + range_options.columns_2)
135-
136-
merged_schema_dict = {}
137-
138-
# Add df1 columns with appropriate suffixes
139-
for col_name, col_type in df_schema1_base.items():
140-
if col_name in coord_cols:
141-
# Coordinate columns get suffix _1
142-
merged_schema_dict[f"{col_name}{range_options.suffixes[0]}"] = (
143-
col_type
144-
)
145-
else:
146-
# Non-coordinate columns get suffix _2 (swapped behavior)
147-
merged_schema_dict[f"{col_name}{range_options.suffixes[1]}"] = (
148-
col_type
149-
)
150-
151-
# Add df2 columns with appropriate suffixes
152-
for col_name, col_type in df_schema2_base.items():
153-
if col_name in coord_cols:
154-
# Coordinate columns get suffix _2
155-
merged_schema_dict[f"{col_name}{range_options.suffixes[1]}"] = (
156-
col_type
157-
)
158-
else:
159-
# Non-coordinate columns get suffix _1 (swapped behavior)
160-
merged_schema_dict[f"{col_name}{range_options.suffixes[0]}"] = (
161-
col_type
162-
)
163-
164-
merged_schema = pl.Schema(merged_schema_dict)
154+
# Generate the correct schema using common function
155+
merged_schema = _generate_overlap_schema(
156+
df_schema1_base, df_schema2_base, range_options
157+
)
165158
# Nearest adds an extra computed column
166159
if range_options.range_op == RangeOp.Nearest:
167160
merged_schema = pl.Schema({**merged_schema, **{"distance": pl.Int64}})
@@ -220,39 +213,10 @@ def range_operation(
220213
df1_base_schema = _rename_columns(df1, "").schema
221214
df2_base_schema = _rename_columns(df2, "").schema
222215

223-
# Generate correct schema based on actual DataFusion behavior
224-
# Coordinate columns get correct suffixes, non-coordinate columns get swapped suffixes
225-
coord_cols = set(range_options.columns_1 + range_options.columns_2)
226-
227-
merged_schema_dict = {}
228-
229-
# Add df1 columns with appropriate suffixes
230-
for col_name, col_type in df1_base_schema.items():
231-
if col_name in coord_cols:
232-
# Coordinate columns get suffix _1
233-
merged_schema_dict[f"{col_name}{range_options.suffixes[0]}"] = (
234-
col_type
235-
)
236-
else:
237-
# Non-coordinate columns get suffix _2 (swapped behavior)
238-
merged_schema_dict[f"{col_name}{range_options.suffixes[1]}"] = (
239-
col_type
240-
)
241-
242-
# Add df2 columns with appropriate suffixes
243-
for col_name, col_type in df2_base_schema.items():
244-
if col_name in coord_cols:
245-
# Coordinate columns get suffix _2
246-
merged_schema_dict[f"{col_name}{range_options.suffixes[1]}"] = (
247-
col_type
248-
)
249-
else:
250-
# Non-coordinate columns get suffix _1 (swapped behavior)
251-
merged_schema_dict[f"{col_name}{range_options.suffixes[0]}"] = (
252-
col_type
253-
)
254-
255-
merged_schema = pl.Schema(merged_schema_dict)
216+
# Generate the correct schema using common function
217+
merged_schema = _generate_overlap_schema(
218+
df1_base_schema, df2_base_schema, range_options
219+
)
256220
# Add computed columns for streaming outputs
257221
if range_options.range_op == RangeOp.Nearest:
258222
merged_schema = pl.Schema({**merged_schema, **{"distance": pl.Int64}})

src/query.rs

Lines changed: 35 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -107,40 +107,40 @@ pub(crate) fn overlap_query(query_params: QueryParams) -> String {
107107

108108
// Always include the required coordinate columns
109109
selected_columns.push(format!(
110-
"b.`{}` as `{}{}`",
111-
query_params.columns_2[0], query_params.columns_2[0], query_params.suffixes.0
110+
"a.`{}` as `{}{}`",
111+
query_params.columns_1[0], query_params.columns_1[0], query_params.suffixes.0
112112
));
113113
selected_columns.push(format!(
114-
"b.`{}` as `{}{}`",
115-
query_params.columns_2[1], query_params.columns_2[1], query_params.suffixes.0
114+
"a.`{}` as `{}{}`",
115+
query_params.columns_1[1], query_params.columns_1[1], query_params.suffixes.0
116116
));
117117
selected_columns.push(format!(
118-
"b.`{}` as `{}{}`",
119-
query_params.columns_2[2], query_params.columns_2[2], query_params.suffixes.0
118+
"a.`{}` as `{}{}`",
119+
query_params.columns_1[2], query_params.columns_1[2], query_params.suffixes.0
120120
));
121121
selected_columns.push(format!(
122-
"a.`{}` as `{}{}`",
123-
query_params.columns_1[0], query_params.columns_1[0], query_params.suffixes.1
122+
"b.`{}` as `{}{}`",
123+
query_params.columns_2[0], query_params.columns_2[0], query_params.suffixes.1
124124
));
125125
selected_columns.push(format!(
126-
"a.`{}` as `{}{}`",
127-
query_params.columns_1[1], query_params.columns_1[1], query_params.suffixes.1
126+
"b.`{}` as `{}{}`",
127+
query_params.columns_2[1], query_params.columns_2[1], query_params.suffixes.1
128128
));
129129
selected_columns.push(format!(
130-
"a.`{}` as `{}{}`",
131-
query_params.columns_1[2], query_params.columns_1[2], query_params.suffixes.1
130+
"b.`{}` as `{}{}`",
131+
query_params.columns_2[2], query_params.columns_2[2], query_params.suffixes.1
132132
));
133133

134134
// Add other columns only if they are in the projected columns
135-
for col in &query_params.other_columns_2 {
135+
for col in &query_params.other_columns_1 {
136136
if projected_cols.iter().any(|pc| pc.contains(col)) {
137137
selected_columns.push(format!(
138138
"a.`{}` as `{}{}`",
139139
col, col, query_params.suffixes.0
140140
));
141141
}
142142
}
143-
for col in &query_params.other_columns_1 {
143+
for col in &query_params.other_columns_2 {
144144
if projected_cols.iter().any(|pc| pc.contains(col)) {
145145
selected_columns.push(format!(
146146
"b.`{}` as `{}{}`",
@@ -153,38 +153,38 @@ pub(crate) fn overlap_query(query_params: QueryParams) -> String {
153153
} else {
154154
// Use original logic when no projection is specified
155155
format!(
156-
"b.`{}` as `{}{}`, -- contig
157-
b.`{}` as `{}{}`, -- pos_start
158-
b.`{}` as `{}{}`, -- pos_end
159-
a.`{}` as `{}{}`, -- contig
156+
"a.`{}` as `{}{}`, -- contig
160157
a.`{}` as `{}{}`, -- pos_start
161-
a.`{}` as `{}{}` -- pos_end
158+
a.`{}` as `{}{}`, -- pos_end
159+
b.`{}` as `{}{}`, -- contig
160+
b.`{}` as `{}{}`, -- pos_start
161+
b.`{}` as `{}{}` -- pos_end
162162
{}
163163
{}",
164-
query_params.columns_2[0],
165-
query_params.columns_2[0],
166-
query_params.suffixes.0,
167-
query_params.columns_2[1],
168-
query_params.columns_2[1],
169-
query_params.suffixes.0,
170-
query_params.columns_2[2],
171-
query_params.columns_2[2],
172-
query_params.suffixes.0,
173164
query_params.columns_1[0],
174165
query_params.columns_1[0],
175-
query_params.suffixes.1,
166+
query_params.suffixes.0,
176167
query_params.columns_1[1],
177168
query_params.columns_1[1],
178-
query_params.suffixes.1,
169+
query_params.suffixes.0,
179170
query_params.columns_1[2],
180171
query_params.columns_1[2],
172+
query_params.suffixes.0,
173+
query_params.columns_2[0],
174+
query_params.columns_2[0],
175+
query_params.suffixes.1,
176+
query_params.columns_2[1],
177+
query_params.columns_2[1],
178+
query_params.suffixes.1,
179+
query_params.columns_2[2],
180+
query_params.columns_2[2],
181181
query_params.suffixes.1,
182182
if !query_params.other_columns_2.is_empty() {
183183
",".to_string()
184184
+ &format_non_join_tables(
185185
query_params.other_columns_2.clone(),
186-
"a".to_string(),
187-
query_params.suffixes.0.clone(),
186+
"b".to_string(),
187+
query_params.suffixes.1.clone(),
188188
)
189189
} else {
190190
"".to_string()
@@ -193,8 +193,8 @@ pub(crate) fn overlap_query(query_params: QueryParams) -> String {
193193
",".to_string()
194194
+ &format_non_join_tables(
195195
query_params.other_columns_1.clone(),
196-
"b".to_string(),
197-
query_params.suffixes.1.clone(),
196+
"a".to_string(),
197+
query_params.suffixes.0.clone(),
198198
)
199199
} else {
200200
"".to_string()
@@ -207,7 +207,7 @@ pub(crate) fn overlap_query(query_params: QueryParams) -> String {
207207
SELECT
208208
{}
209209
FROM
210-
{} AS a, {} AS b
210+
{} AS b, {} AS a
211211
WHERE
212212
a.`{}`=b.`{}`
213213
AND

0 commit comments

Comments
 (0)