-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Support API for "pre-image" for pruning predicate evaluation #19722
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 8 commits
d94889a
4aa7f4e
2329c12
7ac8325
7a3e8b3
fbd5dcc
d920735
5ffb704
2fdc14c
c2b0cd3
a0b6564
0a24d60
59235de
9ae434e
9f845e7
510b5bc
b9f5c2c
ec8cc7e
47a18dc
01b254b
d8b4f0f
116d6e2
c0ed63c
5856150
c53a9fc
9b32843
53f72ed
ba5be8a
46a941f
798d88f
fb155f6
a070246
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -39,7 +39,7 @@ use datafusion_common::{ | |
| }; | ||
| use datafusion_expr::{ | ||
| BinaryExpr, Case, ColumnarValue, Expr, Like, Operator, Volatility, and, | ||
| binary::BinaryTypeCoercer, lit, or, | ||
| binary::BinaryTypeCoercer, interval_arithmetic::Interval, lit, or, | ||
| }; | ||
| use datafusion_expr::{Cast, TryCast, simplify::ExprSimplifyResult}; | ||
| use datafusion_expr::{expr::ScalarFunction, interval_arithmetic::NullableInterval}; | ||
|
|
@@ -51,14 +51,17 @@ use datafusion_physical_expr::{create_physical_expr, execution_props::ExecutionP | |
|
|
||
| use super::inlist_simplifier::ShortenInListSimplifier; | ||
| use super::utils::*; | ||
| use crate::analyzer::type_coercion::TypeCoercionRewriter; | ||
| use crate::simplify_expressions::SimplifyContext; | ||
| use crate::simplify_expressions::regex::simplify_regex_expr; | ||
| use crate::simplify_expressions::unwrap_cast::{ | ||
| is_cast_expr_and_support_unwrap_cast_in_comparison_for_binary, | ||
| is_cast_expr_and_support_unwrap_cast_in_comparison_for_inlist, | ||
| unwrap_cast_in_comparison_for_binary, | ||
| }; | ||
| use crate::{ | ||
| analyzer::type_coercion::TypeCoercionRewriter, | ||
| simplify_expressions::udf_preimage::rewrite_with_preimage, | ||
| }; | ||
| use datafusion_expr::expr_rewriter::rewrite_with_guarantees_map; | ||
| use datafusion_expr_common::casts::try_cast_literal_to_type; | ||
| use indexmap::IndexSet; | ||
|
|
@@ -1952,12 +1955,98 @@ impl TreeNodeRewriter for Simplifier<'_> { | |
| })) | ||
| } | ||
|
|
||
| // ======================================= | ||
sdf-jkl marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| // preimage_in_comparison | ||
| // ======================================= | ||
| // | ||
| // For case: | ||
| // date_part(expr as 'YEAR') op literal | ||
| // | ||
| // Background: | ||
| // Datasources such as Parquet can prune partitions using simple predicates, | ||
| // but they cannot do so for complex expressions. | ||
| // For a complex predicate like `date_part('YEAR', c1) < 2000`, pruning is not possible. | ||
| // After rewriting it to `c1 < 2000-01-01`, pruning becomes feasible. | ||
| // NOTE: we only consider immutable UDFs with literal RHS values | ||
| Expr::BinaryExpr(BinaryExpr { left, op, right }) => { | ||
| use datafusion_expr::Operator::*; | ||
| let is_preimage_op = matches!( | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it might be nice (as a follow on PR) to mention this list in the docs for preimage -- e.g. that it only applies to predicates |
||
| op, | ||
| Eq | NotEq | ||
| | Lt | ||
| | LtEq | ||
| | Gt | ||
| | GtEq | ||
| | IsDistinctFrom | ||
| | IsNotDistinctFrom | ||
| ); | ||
| if !is_preimage_op { | ||
| return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr { | ||
| left, | ||
| op, | ||
| right, | ||
| }))); | ||
| } | ||
|
|
||
| if let (Some(interval), Some(col_expr)) = | ||
| get_preimage(left.as_ref(), right.as_ref(), info)? | ||
| { | ||
| rewrite_with_preimage(info, interval, op, Box::new(col_expr))? | ||
| } else if let Some(swapped) = op.swap() { | ||
| if let (Some(interval), Some(col_expr)) = | ||
| get_preimage(right.as_ref(), left.as_ref(), info)? | ||
| { | ||
| rewrite_with_preimage( | ||
| info, | ||
| interval, | ||
| swapped, | ||
| Box::new(col_expr), | ||
| )? | ||
| } else { | ||
| Transformed::no(Expr::BinaryExpr(BinaryExpr { left, op, right })) | ||
| } | ||
| } else { | ||
| Transformed::no(Expr::BinaryExpr(BinaryExpr { left, op, right })) | ||
| } | ||
| } | ||
|
|
||
| // no additional rewrites possible | ||
| expr => Transformed::no(expr), | ||
| }) | ||
| } | ||
| } | ||
|
|
||
| fn get_preimage( | ||
| left_expr: &Expr, | ||
| right_expr: &Expr, | ||
| info: &SimplifyContext, | ||
| ) -> Result<(Option<Interval>, Option<Expr>)> { | ||
| let Expr::ScalarFunction(ScalarFunction { func, args }) = left_expr else { | ||
| return Ok((None, None)); | ||
| }; | ||
| if !is_literal_or_literal_cast(right_expr) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wonder if there is a reason to limit this to literal ? It seems like the call to
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this is still an open question, but it is ok to handle as a follow on PR (aka widen the expressions)
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you have an example where we could use a non-literal
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was thinking something like extracting the year from a computed value. For example, if we had a table with a base date and an interval, it seems like we could do something like): WHERE 2025 = date_part(YEAR, t.base_date + t.interval)rewrite to WHERE (t.base_date + t.interval) >= 2025-01-01 && (t.base_date + t.interval) < 2026-01-01However, in this case I agree there is a tradeoff that this actually might be worse to optimize (take longer to evaluate) 🤔
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this should already work. The |
||
| return Ok((None, None)); | ||
| } | ||
| if func.signature().volatility != Volatility::Immutable { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also for a follow on PR, I think it would be safe to rewrite stable functions (whose values don't change during the statement) |
||
| return Ok((None, None)); | ||
| } | ||
| Ok(( | ||
| func.preimage(args, right_expr, info)?, | ||
| func.column_expr(args), | ||
| )) | ||
| } | ||
|
|
||
| fn is_literal_or_literal_cast(expr: &Expr) -> bool { | ||
| match expr { | ||
| Expr::Literal(_, _) => true, | ||
| Expr::Cast(Cast { expr, .. }) => matches!(expr.as_ref(), Expr::Literal(_, _)), | ||
| Expr::TryCast(TryCast { expr, .. }) => { | ||
| matches!(expr.as_ref(), Expr::Literal(_, _)) | ||
| } | ||
| _ => false, | ||
| } | ||
| } | ||
|
|
||
| fn as_string_scalar(expr: &Expr) -> Option<(DataType, &Option<String>)> { | ||
| match expr { | ||
| Expr::Literal(ScalarValue::Utf8(s), _) => Some((DataType::Utf8, s)), | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,114 @@ | ||||||
| // Licensed to the Apache Software Foundation (ASF) under one | ||||||
| // or more contributor license agreements. See the NOTICE file | ||||||
| // distributed with this work for additional information | ||||||
| // regarding copyright ownership. The ASF licenses this file | ||||||
| // to you under the Apache License, Version 2.0 (the | ||||||
| // "License"); you may not use this file except in compliance | ||||||
| // with the License. You may obtain a copy of the License at | ||||||
| // | ||||||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||||||
| // | ||||||
| // Unless required by applicable law or agreed to in writing, | ||||||
| // software distributed under the License is distributed on an | ||||||
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||||||
| // KIND, either express or implied. See the License for the | ||||||
| // specific language governing permissions and limitations | ||||||
| // under the License. | ||||||
|
|
||||||
| use datafusion_common::{Result, internal_err, tree_node::Transformed}; | ||||||
| use datafusion_expr::{ | ||||||
| BinaryExpr, Expr, Operator, and, lit, or, simplify::SimplifyContext, | ||||||
| }; | ||||||
| use datafusion_expr_common::interval_arithmetic::Interval; | ||||||
|
|
||||||
| /// Rewrites a binary expression using its "preimage" | ||||||
| /// | ||||||
| /// Specifically it rewrites expressions of the form `<expr> OP x` (e.g. `<expr> = | ||||||
| /// x`) where `<expr>` is known to have a pre-image (aka the entire single | ||||||
| /// range for which it is valid) | ||||||
| /// | ||||||
| /// This rewrite is described in the [ClickHouse Paper] and is particularly | ||||||
sdf-jkl marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||||
| /// useful for simplifying expressions `date_part` or equivalent functions. The | ||||||
| /// idea is that if you have an expression like `date_part(YEAR, k) = 2024` and you | ||||||
| /// can find a [preimage] for `date_part(YEAR, k)`, which is the range of dates | ||||||
| /// covering the entire year of 2024. Thus, you can rewrite the expression to `k | ||||||
| /// >= '2024-01-01' AND k < '2025-01-01' which is often more optimizable. | ||||||
| /// | ||||||
| /// [ClickHouse Paper]: https://www.vldb.org/pvldb/vol17/p3731-schulze.pdf | ||||||
| /// [preimage]: https://en.wikipedia.org/wiki/Image_(mathematics)#Inverse_image | ||||||
| /// | ||||||
| pub(super) fn rewrite_with_preimage( | ||||||
sdf-jkl marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
| _info: &SimplifyContext, | ||||||
|
||||||
| preimage_interval: Interval, | ||||||
| op: Operator, | ||||||
| expr: Box<Expr>, | ||||||
| ) -> Result<Transformed<Expr>> { | ||||||
| let (lower, upper) = preimage_interval.into_bounds(); | ||||||
| let (lower, upper) = (lit(lower), lit(upper)); | ||||||
|
|
||||||
| let rewritten_expr = match op { | ||||||
| // <expr> < x ==> <expr> < lower | ||||||
| // <expr> >= x ==> <expr> >= lower | ||||||
sdf-jkl marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||||
| Operator::Lt | Operator::GtEq => Expr::BinaryExpr(BinaryExpr { | ||||||
sdf-jkl marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||||
| left: expr, | ||||||
| op, | ||||||
| right: Box::new(lower), | ||||||
| }), | ||||||
| // <expr> > x ==> <expr> >= upper | ||||||
| Operator::Gt => Expr::BinaryExpr(BinaryExpr { | ||||||
| left: expr, | ||||||
| op: Operator::GtEq, | ||||||
| right: Box::new(upper), | ||||||
| }), | ||||||
| // <expr> <= x ==> <expr> < upper | ||||||
| Operator::LtEq => Expr::BinaryExpr(BinaryExpr { | ||||||
| left: expr, | ||||||
| op: Operator::Lt, | ||||||
| right: Box::new(upper), | ||||||
| }), | ||||||
| // <expr> = x ==> (<expr> >= lower) and (<expr> < upper) | ||||||
sdf-jkl marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
| // | ||||||
| // <expr> is not distinct from x ==> (<expr> is NULL and x is NULL) or ((<expr> >= lower) and (<expr> < upper)) | ||||||
|
||||||
| // <expr> is not distinct from x ==> (<expr> is NULL and x is NULL) or ((<expr> >= lower) and (<expr> < upper)) | |
| // <expr> is not distinct from x ==> (<expr> is NULL) or ((<expr> >= lower) and (<expr> < upper)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am not sure this IS NOT DISTICNT rewrite is correctas it is rewritten to just the range predicate. If expr is NULL and the literal is non-NULL, the original expression is FALSE, but the rewrite evaluates to NULL (x >= lower AND x < upper), which is not equivalent and violates the “same nullability” expectation for simplified expressions.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@alamb In a WHERE clause, both FALSE and NULL might behave similarly (both filter out the row), so here may be safety?
If we want to keep false:
Operator::IsNotDistinctFrom => {
// expr IS NOT DISTINCT FROM x => must return FALSE if expr is NULL
// because we know x is NOT NULL.
expr.clone().is_not_null().and(
and(expr.clone().gt_eq(lower), expr.lt(upper))
)
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@xudong963 this solves the issue. Thanks!
Uh oh!
There was an error while loading. Please reload this page.