Skip to content

speedup date_trunc (~7x faster) in some cases #16859

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jul 28, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 87 additions & 1 deletion datafusion/functions/src/datetime/date_trunc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ use arrow::array::types::{
ArrowTimestampType, TimestampMicrosecondType, TimestampMillisecondType,
TimestampNanosecondType, TimestampSecondType,
};
use arrow::array::{Array, PrimitiveArray};
use arrow::array::{Array, ArrayRef, Int64Array, PrimitiveArray};
use arrow::datatypes::DataType::{self, Null, Timestamp, Utf8, Utf8View};
use arrow::datatypes::TimeUnit::{self, Microsecond, Millisecond, Nanosecond, Second};
use datafusion_common::cast::as_primitive_array;
Expand Down Expand Up @@ -60,6 +60,8 @@ use chrono::{
- hour / HOUR
- minute / MINUTE
- second / SECOND
- millisecond / MILLISECOND
- microsecond / MICROSECOND
"#
),
argument(
Expand Down Expand Up @@ -185,6 +187,26 @@ impl ScalarUDFImpl for DateTruncFunc {
) -> Result<ColumnarValue> {
let parsed_tz = parse_tz(tz_opt)?;
let array = as_primitive_array::<T>(array)?;

// fast path for fine granularities
if matches!(
granularity.as_str(),
// For modern timezones, it's correct to truncate "minute" in this way.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

morden -> modern?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fix them all 😎 #17135

// Both datafusion and arrow are ignoring historical timezone's non-minute granularity
// bias (e.g., Asia/Kathmandu before 1919 is UTC+05:41:16).
"second" | "minute" | "millisecond" | "microsecond"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

minutes is correct for modern zones, but not historical dates.
The old code apparently has some issues with them too though (https://github.com/apache/datafusion/pull/16859/files#r2229547803)
Let's add a code comment about our conscious ignorance of historical zone offsets here.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add some comments in 346ae59

) ||
// In UTC, "hour" and "day" have uniform durations and can be truncated with simple arithmetic
(parsed_tz.is_none() && matches!(granularity.as_str(), "hour" | "day"))
{
let result = general_date_trunc_array_fine_granularity(
T::UNIT,
array,
granularity.as_str(),
)?;
return Ok(ColumnarValue::Array(result));
}

let array: PrimitiveArray<T> = array
.try_unary(|x| {
general_date_trunc(T::UNIT, x, parsed_tz, granularity.as_str())
Expand Down Expand Up @@ -423,6 +445,55 @@ fn date_trunc_coarse(granularity: &str, value: i64, tz: Option<Tz>) -> Result<i6
Ok(value.unwrap())
}

/// Fast path for fine granularities (hour and smaller) that can be handled
/// with simple arithmetic operations without calendar complexity.
///
/// This function is timezone-agnostic and should only be used when:
/// - No timezone is specified in the input, OR
/// - The granularity is less than hour as hour can be affected by DST transitions in some cases
fn general_date_trunc_array_fine_granularity<T: ArrowTimestampType>(
tu: TimeUnit,
array: &PrimitiveArray<T>,
granularity: &str,
) -> Result<ArrayRef> {
let unit = match (tu, granularity) {
(Second, "minute") => Some(Int64Array::new_scalar(60)),
(Second, "hour") => Some(Int64Array::new_scalar(3600)),
(Second, "day") => Some(Int64Array::new_scalar(86400)),

(Millisecond, "second") => Some(Int64Array::new_scalar(1_000)),
(Millisecond, "minute") => Some(Int64Array::new_scalar(60_000)),
(Millisecond, "hour") => Some(Int64Array::new_scalar(3_600_000)),
(Millisecond, "day") => Some(Int64Array::new_scalar(86_400_000)),

(Microsecond, "millisecond") => Some(Int64Array::new_scalar(1_000)),
(Microsecond, "second") => Some(Int64Array::new_scalar(1_000_000)),
(Microsecond, "minute") => Some(Int64Array::new_scalar(60_000_000)),
(Microsecond, "hour") => Some(Int64Array::new_scalar(3_600_000_000)),
(Microsecond, "day") => Some(Int64Array::new_scalar(86_400_000_000)),

(Nanosecond, "microsecond") => Some(Int64Array::new_scalar(1_000)),
(Nanosecond, "millisecond") => Some(Int64Array::new_scalar(1_000_000)),
(Nanosecond, "second") => Some(Int64Array::new_scalar(1_000_000_000)),
(Nanosecond, "minute") => Some(Int64Array::new_scalar(60_000_000_000)),
(Nanosecond, "hour") => Some(Int64Array::new_scalar(3_600_000_000_000)),
(Nanosecond, "day") => Some(Int64Array::new_scalar(86_400_000_000_000)),
_ => None,
};

if let Some(unit) = unit {
let original_type = array.data_type();
let array = arrow::compute::cast(array, &DataType::Int64)?;
let array = arrow::compute::kernels::numeric::div(&array, &unit)?;
let array = arrow::compute::kernels::numeric::mul(&array, &unit)?;
let array = arrow::compute::cast(&array, original_type)?;
Ok(array)
} else {
// truncate to the same or smaller unit
Ok(Arc::new(array.clone()))
}
}

// truncates a single value with the given timeunit to the specified granularity
fn general_date_trunc(
tu: TimeUnit,
Expand Down Expand Up @@ -884,6 +955,21 @@ mod tests {
"2018-11-04T02:00:00-02",
],
),
(
vec![
"2024-10-26T23:30:00Z",
"2024-10-27T00:30:00Z",
"2024-10-27T01:30:00Z",
"2024-10-27T02:30:00Z",
],
Some("Asia/Kathmandu".into()), // UTC+5:45
vec![
"2024-10-27T05:00:00+05:45",
"2024-10-27T06:00:00+05:45",
"2024-10-27T07:00:00+05:45",
"2024-10-27T08:00:00+05:45",
],
),
];

cases.iter().for_each(|(original, tz_opt, expected)| {
Expand Down
2 changes: 2 additions & 0 deletions docs/source/user-guide/sql/scalar_functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -2150,6 +2150,8 @@ date_trunc(precision, expression)
- hour / HOUR
- minute / MINUTE
- second / SECOND
- millisecond / MILLISECOND
- microsecond / MICROSECOND

- **expression**: Time expression to operate on. Can be a constant, column, or function.

Expand Down