Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
d94889a
Add udf_preimage logic
sdf-jkl Jan 9, 2026
4aa7f4e
Cargo fmt
sdf-jkl Jan 10, 2026
2329c12
Fix err in rewrite_with_preimage
sdf-jkl Jan 10, 2026
7ac8325
Rewrite the preimage_in_comparison
sdf-jkl Jan 10, 2026
7a3e8b3
cargo fmt
sdf-jkl Jan 10, 2026
fbd5dcc
Fix ci
sdf-jkl Jan 10, 2026
d920735
Fix GtEq, Lt logic
sdf-jkl Jan 10, 2026
5ffb704
Merge branch 'main' into smaller-preimage-pr-1
sdf-jkl Jan 10, 2026
2fdc14c
Merge branch 'main' of https://github.com/apache/datafusion into smal…
sdf-jkl Jan 18, 2026
c2b0cd3
Replace BinaryExpression with binary_expr() fn
sdf-jkl Jan 18, 2026
a0b6564
Add unit tests + add doc part about upper bound
sdf-jkl Jan 19, 2026
0a24d60
Fix docs
sdf-jkl Jan 19, 2026
59235de
clippy
alamb Jan 19, 2026
9ae434e
Merge remote-tracking branch 'apache/main' into smaller-preimage-pr-1
alamb Jan 19, 2026
9f845e7
Make test field nullable
sdf-jkl Jan 19, 2026
510b5bc
Add tests for additional cases
alamb Jan 20, 2026
b9f5c2c
simplify
alamb Jan 20, 2026
ec8cc7e
Simplfy
alamb Jan 20, 2026
47a18dc
Merge pull request #1 from alamb/alamb/more_tests
sdf-jkl Jan 20, 2026
01b254b
Add rhs Null guard
sdf-jkl Jan 20, 2026
d8b4f0f
Fix comment
sdf-jkl Jan 20, 2026
116d6e2
Update API
sdf-jkl Jan 20, 2026
c0ed63c
clippy
sdf-jkl Jan 20, 2026
5856150
Fix null handling unit test
sdf-jkl Jan 20, 2026
c53a9fc
Fix null handling test
sdf-jkl Jan 20, 2026
9b32843
Update datafusion/expr/src/preimage.rs
sdf-jkl Jan 21, 2026
53f72ed
Fix docs
sdf-jkl Jan 21, 2026
ba5be8a
Fix comment
sdf-jkl Jan 21, 2026
46a941f
Fix is_not_distinct_from rewrite
sdf-jkl Jan 21, 2026
798d88f
Merge remote-tracking branch 'apache/main' into smaller-preimage-pr-1
alamb Jan 22, 2026
fb155f6
Simplify the API
alamb Jan 22, 2026
a070246
Merge branch 'smaller-preimage-pr-1' of https://github.com/sdf-jkl/da…
alamb Jan 22, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 62 additions & 0 deletions datafusion/expr/src/udf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,25 @@ impl ScalarUDF {
self.inner.is_nullable(args, schema)
}

/// Returns the preimage of this function for `lit_expr`, if one exists.
///
/// This simply forwards `args`, `lit_expr` and `info` to the wrapped
/// implementation and returns its result unchanged.
///
/// See [`ScalarUDFImpl::preimage`] for more details.
pub fn preimage(
&self,
args: &[Expr],
lit_expr: &Expr,
info: &SimplifyContext,
) -> Result<Option<Interval>> {
self.inner.preimage(args, lit_expr, info)
}

/// Returns the inner expression (typically a column) taken from the
/// function's `args`, or `None` if the function does not provide one.
///
/// This simply forwards to the wrapped implementation.
///
/// See [`ScalarUDFImpl::column_expr`]
pub fn column_expr(&self, args: &[Expr]) -> Option<Expr> {
self.inner.column_expr(args)
}

/// Invoke the function on `args`, returning the appropriate result.
///
/// See [`ScalarUDFImpl::invoke_with_args`] for details.
Expand Down Expand Up @@ -696,6 +715,36 @@ pub trait ScalarUDFImpl: Debug + DynEq + DynHash + Send + Sync {
Ok(ExprSimplifyResult::Original(args))
}

/// Returns the [preimage] for this function and the specified scalar value, if any.
///
/// A preimage is a single contiguous [`Interval`] of input values for which
/// the function will always return the value of `lit_expr`.
///
/// This rewrite is described in the [ClickHouse Paper] and is particularly
/// useful for simplifying expressions such as `date_part` or equivalent
/// functions. The idea is that if you have an expression like
/// `date_part(YEAR, k) = 2024`, you can find a [preimage] for
/// `date_part(YEAR, k)`, which is the range of dates covering the entire year
/// of 2024. Thus, you can rewrite the expression to
/// `k >= '2024-01-01' AND k < '2025-01-01'`, which is often more optimizable.
///
/// As in the example above, the rewritten predicate compares with
/// `>= lower` and `< upper`, so the upper bound of the returned interval is
/// treated as exclusive.
///
/// At the time of writing, the expression simplifier only consults this method
/// when all of the following hold:
/// - the function call is compared with `=`, `!=`, `<`, `<=`, `>`, `>=`,
///   `IS DISTINCT FROM` or `IS NOT DISTINCT FROM`
/// - the other side of the comparison is a literal, or a cast applied directly
///   to a literal
/// - the function's volatility is `Immutable`
///
/// The rewrite is applied only if [`Self::column_expr`] also returns `Some`.
///
/// The default implementation returns `Ok(None)` (no preimage known).
///
/// This should only return a preimage if the function takes a single argument
///
/// [ClickHouse Paper]: https://www.vldb.org/pvldb/vol17/p3731-schulze.pdf
/// [preimage]: https://en.wikipedia.org/wiki/Image_(mathematics)#Inverse_image
fn preimage(
&self,
_args: &[Expr],
_lit_expr: &Expr,
_info: &SimplifyContext,
) -> Result<Option<Interval>> {
Ok(None)
}

/// Returns the inner column expression from this function's arguments, if any.
///
/// When a preimage rewrite is applied, the function call is replaced by
/// comparisons between this expression and the bounds returned by
/// [`Self::preimage`] (for example `k` in `date_part(YEAR, k)`).
///
/// The default implementation returns `None`.
fn column_expr(&self, _args: &[Expr]) -> Option<Expr> {
None
}

/// Returns true if some of this `exprs` subexpressions may not be evaluated
/// and thus any side effects (like divide by zero) may not be encountered.
///
Expand Down Expand Up @@ -926,6 +975,19 @@ impl ScalarUDFImpl for AliasedScalarUDFImpl {
self.inner.simplify(args, info)
}

// Forward to the wrapped implementation so that the alias wrapper returns
// exactly the same preimage as the function it wraps.
fn preimage(
&self,
args: &[Expr],
lit_expr: &Expr,
info: &SimplifyContext,
) -> Result<Option<Interval>> {
self.inner.preimage(args, lit_expr, info)
}

// Forward to the wrapped implementation, mirroring `preimage`.
fn column_expr(&self, args: &[Expr]) -> Option<Expr> {
self.inner.column_expr(args)
}

fn conditional_arguments<'a>(
&self,
args: &'a [Expr],
Expand Down
93 changes: 91 additions & 2 deletions datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ use datafusion_common::{
};
use datafusion_expr::{
BinaryExpr, Case, ColumnarValue, Expr, Like, Operator, Volatility, and,
binary::BinaryTypeCoercer, lit, or,
binary::BinaryTypeCoercer, interval_arithmetic::Interval, lit, or,
};
use datafusion_expr::{Cast, TryCast, simplify::ExprSimplifyResult};
use datafusion_expr::{expr::ScalarFunction, interval_arithmetic::NullableInterval};
Expand All @@ -51,14 +51,17 @@ use datafusion_physical_expr::{create_physical_expr, execution_props::ExecutionP

use super::inlist_simplifier::ShortenInListSimplifier;
use super::utils::*;
use crate::analyzer::type_coercion::TypeCoercionRewriter;
use crate::simplify_expressions::SimplifyContext;
use crate::simplify_expressions::regex::simplify_regex_expr;
use crate::simplify_expressions::unwrap_cast::{
is_cast_expr_and_support_unwrap_cast_in_comparison_for_binary,
is_cast_expr_and_support_unwrap_cast_in_comparison_for_inlist,
unwrap_cast_in_comparison_for_binary,
};
use crate::{
analyzer::type_coercion::TypeCoercionRewriter,
simplify_expressions::udf_preimage::rewrite_with_preimage,
};
use datafusion_expr::expr_rewriter::rewrite_with_guarantees_map;
use datafusion_expr_common::casts::try_cast_literal_to_type;
use indexmap::IndexSet;
Expand Down Expand Up @@ -1952,12 +1955,98 @@ impl TreeNodeRewriter for Simplifier<'_> {
}))
}

// =======================================
// preimage_in_comparison
// =======================================
//
// For case:
// date_part(expr as 'YEAR') op literal
//
// Background:
// Datasources such as Parquet can prune partitions using simple predicates,
// but they cannot do so for complex expressions.
// For a complex predicate like `date_part('YEAR', c1) < 2000`, pruning is not possible.
// After rewriting it to `c1 < 2000-01-01`, pruning becomes feasible.
// NOTE: we only consider immutable UDFs with literal RHS values
Expr::BinaryExpr(BinaryExpr { left, op, right }) => {
use datafusion_expr::Operator::*;
let is_preimage_op = matches!(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it might be nice (as a follow on PR) to mention this list in the docs for preimage -- e.g. that it only applies to predicates =, !=, ...

op,
Eq | NotEq
| Lt
| LtEq
| Gt
| GtEq
| IsDistinctFrom
| IsNotDistinctFrom
);
if !is_preimage_op {
return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr {
left,
op,
right,
})));
}

if let (Some(interval), Some(col_expr)) =
get_preimage(left.as_ref(), right.as_ref(), info)?
{
rewrite_with_preimage(info, interval, op, Box::new(col_expr))?
} else if let Some(swapped) = op.swap() {
if let (Some(interval), Some(col_expr)) =
get_preimage(right.as_ref(), left.as_ref(), info)?
{
rewrite_with_preimage(
info,
interval,
swapped,
Box::new(col_expr),
)?
} else {
Transformed::no(Expr::BinaryExpr(BinaryExpr { left, op, right }))
}
} else {
Transformed::no(Expr::BinaryExpr(BinaryExpr { left, op, right }))
}
}

// no additional rewrites possible
expr => Transformed::no(expr),
})
}
}

/// Looks up the preimage information needed to rewrite `left_expr OP right_expr`.
///
/// Returns `(None, None)` unless all of the following hold:
/// - `left_expr` is an `Expr::ScalarFunction`
/// - `right_expr` is a literal, or a cast / try-cast applied directly to a literal
/// - the function's signature volatility is `Volatility::Immutable`
///
/// When they do hold, returns the results of the function's `preimage` and
/// `column_expr` methods; either element may still be `None`. An error from
/// `preimage` is propagated to the caller.
fn get_preimage(
left_expr: &Expr,
right_expr: &Expr,
info: &SimplifyContext,
) -> Result<(Option<Interval>, Option<Expr>)> {
let Expr::ScalarFunction(ScalarFunction { func, args }) = left_expr else {
return Ok((None, None));
};
if !is_literal_or_literal_cast(right_expr) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if there is a reason to limit this to literal ? It seems like the call to pre_image could handle this (and basically return if it wasn't a literal)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is still an open question, but it is ok to handle as a follow on PR (aka widen the expressions)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you have an example where we could use a non-literal expr on rhs for a comparison with preimage? I can't come up with one, but if there is, we could move expression matching into preimage impl

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was thinking something like extracting the year from a computed value. For example, if we had a table with a base date and an interval, it seems like we could do something like):

WHERE 2025 = date_part(YEAR, t.base_date + t.interval)

rewrite to

WHERE (t.base_date + t.interval) >= 2025-01-01 && (t.base_date + t.interval) < 2026-01-01

However, in this case I agree there is a tradeoff that this actually might be worse to optimize (take longer to evaluate) 🤔

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this should already work.

The is_literal_or_literal_cast checks the 2025 we are comparing to, not the expression inside the function(date_part(expr))

return Ok((None, None));
}
if func.signature().volatility != Volatility::Immutable {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also for a follow on PR, I think it would be safe to rewrite stable functions (whose values don't change during the statement)

return Ok((None, None));
}
Ok((
func.preimage(args, right_expr, info)?,
func.column_expr(args),
))
}

/// Reports whether `expr` is a literal, or a single `Cast` / `TryCast`
/// applied directly to a literal.
///
/// Only one cast layer is looked through; a cast of a cast is not accepted.
fn is_literal_or_literal_cast(expr: &Expr) -> bool {
    // Peel off at most one cast layer, then test whatever remains.
    let candidate = match expr {
        Expr::Cast(Cast { expr: inner, .. })
        | Expr::TryCast(TryCast { expr: inner, .. }) => inner.as_ref(),
        other => other,
    };
    matches!(candidate, Expr::Literal(_, _))
}

fn as_string_scalar(expr: &Expr) -> Option<(DataType, &Option<String>)> {
match expr {
Expr::Literal(ScalarValue::Utf8(s), _) => Some((DataType::Utf8, s)),
Expand Down
1 change: 1 addition & 0 deletions datafusion/optimizer/src/simplify_expressions/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ mod regex;
pub mod simplify_exprs;
pub mod simplify_literal;
mod simplify_predicates;
mod udf_preimage;
mod unwrap_cast;
mod utils;

Expand Down
114 changes: 114 additions & 0 deletions datafusion/optimizer/src/simplify_expressions/udf_preimage.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use datafusion_common::{Result, internal_err, tree_node::Transformed};
use datafusion_expr::{
BinaryExpr, Expr, Operator, and, lit, or, simplify::SimplifyContext,
};
use datafusion_expr_common::interval_arithmetic::Interval;

/// Rewrites a binary expression using its "preimage"
///
/// Specifically it rewrites predicates of the form `f(<expr>) OP x` (e.g.
/// `f(<expr>) = x`) where `f(<expr>)` is known to have a preimage for `x`,
/// i.e. a single contiguous range `[lower, upper)` of `<expr>` values for
/// which `f` returns `x`. The result compares `<expr>` directly against
/// `lower` / `upper`.
///
/// This rewrite is described in the [ClickHouse Paper] and is particularly
/// useful for simplifying expressions such as `date_part` or equivalent
/// functions. The idea is that if you have an expression like
/// `date_part(YEAR, k) = 2024`, you can find a [preimage] for
/// `date_part(YEAR, k)`, which is the range of dates covering the entire year
/// of 2024. Thus, you can rewrite the expression to
/// `k >= '2024-01-01' AND k < '2025-01-01'`, which is often more optimizable.
///
/// # Arguments
/// * `_info` - currently unused
/// * `preimage_interval` - the preimage bounds; the lower bound is used
///   inclusively and the upper bound exclusively
/// * `op` - the comparison operator of the original predicate
/// * `expr` - the expression that is compared against the bounds in the
///   rewritten predicate
///
/// # Errors
/// Returns an internal error if `op` is not one of `=`, `!=`, `<`, `<=`, `>`,
/// `>=`, `IS DISTINCT FROM` or `IS NOT DISTINCT FROM`.
///
/// NOTE(review): the rewrites for the ordering operators presume that `f` is
/// monotonically non-decreasing in `<expr>`, and the NULL handling presumes
/// that `x` is never NULL -- confirm that callers guarantee both.
///
/// [ClickHouse Paper]: https://www.vldb.org/pvldb/vol17/p3731-schulze.pdf
/// [preimage]: https://en.wikipedia.org/wiki/Image_(mathematics)#Inverse_image
///
pub(super) fn rewrite_with_preimage(
    _info: &SimplifyContext,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need this arg?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@alamb mentioned that we should keep it in #18789 (comment), but it was a while ago.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it is important to pass to ScalarUDFImpl::preimage but probably it can be removed from this method call

Since I want to merge this PR up from main anyways before merge, I'll clean it up too

    preimage_interval: Interval,
    op: Operator,
    expr: Box<Expr>,
) -> Result<Transformed<Expr>> {
    let (lower, upper) = preimage_interval.into_bounds();
    let (lower, upper) = (lit(lower), lit(upper));

    let rewritten_expr = match op {
        // f(<expr>) < x ==> <expr> < lower
        // f(<expr>) >= x ==> <expr> >= lower
        Operator::Lt | Operator::GtEq => Expr::BinaryExpr(BinaryExpr {
            left: expr,
            op,
            right: Box::new(lower),
        }),
        // f(<expr>) > x ==> <expr> >= upper
        Operator::Gt => Expr::BinaryExpr(BinaryExpr {
            left: expr,
            op: Operator::GtEq,
            right: Box::new(upper),
        }),
        // f(<expr>) <= x ==> <expr> < upper
        Operator::LtEq => Expr::BinaryExpr(BinaryExpr {
            left: expr,
            op: Operator::Lt,
            right: Box::new(upper),
        }),
        // f(<expr>) = x ==> (<expr> >= lower) and (<expr> < upper)
        Operator::Eq => and(
            Expr::BinaryExpr(BinaryExpr {
                left: expr.clone(),
                op: Operator::GtEq,
                right: Box::new(lower),
            }),
            Expr::BinaryExpr(BinaryExpr {
                left: expr,
                op: Operator::Lt,
                right: Box::new(upper),
            }),
        ),
        // <expr> is not distinct from x ==> (<expr> is NULL and x is NULL) or ((<expr> >= lower) and (<expr> < upper))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// <expr> is not distinct from x ==> (<expr> is NULL and x is NULL) or ((<expr> >= lower) and (<expr> < upper))
// <expr> is not distinct from x ==> (<expr> is NULL) or ((<expr> >= lower) and (<expr> < upper))

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure this IS NOT DISTICNT rewrite is correctas it is rewritten to just the range predicate. If expr is NULL and the literal is non-NULL, the original expression is FALSE, but the rewrite evaluates to NULL (x >= lower AND x < upper), which is not equivalent and violates the “same nullability” expectation for simplified expressions.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@alamb In a WHERE clause, both FALSE and NULL might behave similarly (both filter out the row), so here may be safety?

If we want to keep false:

Operator::IsNotDistinctFrom => {
    // expr IS NOT DISTINCT FROM x => must return FALSE if expr is NULL
    // because we know x is NOT NULL.
    expr.clone().is_not_null().and(
        and(expr.clone().gt_eq(lower), expr.lt(upper))
    )
}

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@xudong963 this solves the issue. Thanks!

        // but since x is always not NULL =>
        //   (<expr> is not NULL) and (<expr> >= lower) and (<expr> < upper)
        //
        // The `is not NULL` guard keeps the result FALSE (rather than NULL) when
        // <expr> is NULL, matching the original `IS NOT DISTINCT FROM` predicate.
        Operator::IsNotDistinctFrom => and(
            (*expr).clone().is_not_null(),
            and(
                Expr::BinaryExpr(BinaryExpr {
                    left: expr.clone(),
                    op: Operator::GtEq,
                    right: Box::new(lower),
                }),
                Expr::BinaryExpr(BinaryExpr {
                    left: expr,
                    op: Operator::Lt,
                    right: Box::new(upper),
                }),
            ),
        ),
        // f(<expr>) != x ==> (<expr> < lower) or (<expr> >= upper)
        Operator::NotEq => or(
            Expr::BinaryExpr(BinaryExpr {
                left: expr.clone(),
                op: Operator::Lt,
                right: Box::new(lower),
            }),
            Expr::BinaryExpr(BinaryExpr {
                left: expr,
                op: Operator::GtEq,
                right: Box::new(upper),
            }),
        ),
        // f(<expr>) is distinct from x ==> (<expr> < lower) or (<expr> >= upper) or (<expr> is NULL and x is not NULL) or (<expr> is not NULL and x is NULL)
        // but given that x is always not NULL => (<expr> < lower) or (<expr> >= upper) or (<expr> is NULL)
        Operator::IsDistinctFrom => Expr::BinaryExpr(BinaryExpr {
            left: expr.clone(),
            op: Operator::Lt,
            right: Box::new(lower),
        })
        .or(Expr::BinaryExpr(BinaryExpr {
            left: expr.clone(),
            op: Operator::GtEq,
            right: Box::new(upper),
        }))
        .or(expr.is_null()),
        _ => return internal_err!("Expect comparison operators"),
    };
    Ok(Transformed::yes(rewritten_expr))
}