-
Notifications
You must be signed in to change notification settings - Fork 1.6k
speedup date_trunc
(~7x faster) in some cases
#16859
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -28,7 +28,7 @@ use arrow::array::types::{ | |
ArrowTimestampType, TimestampMicrosecondType, TimestampMillisecondType, | ||
TimestampNanosecondType, TimestampSecondType, | ||
}; | ||
use arrow::array::{Array, PrimitiveArray}; | ||
use arrow::array::{Array, ArrayRef, Int64Array, PrimitiveArray}; | ||
use arrow::datatypes::DataType::{self, Null, Timestamp, Utf8, Utf8View}; | ||
use arrow::datatypes::TimeUnit::{self, Microsecond, Millisecond, Nanosecond, Second}; | ||
use datafusion_common::cast::as_primitive_array; | ||
|
@@ -60,6 +60,8 @@ use chrono::{ | |
- hour / HOUR | ||
- minute / MINUTE | ||
- second / SECOND | ||
- millisecond / MILLISECOND | ||
- microsecond / MICROSECOND | ||
"# | ||
), | ||
argument( | ||
|
@@ -185,6 +187,26 @@ impl ScalarUDFImpl for DateTruncFunc { | |
) -> Result<ColumnarValue> { | ||
let parsed_tz = parse_tz(tz_opt)?; | ||
let array = as_primitive_array::<T>(array)?; | ||
|
||
// fast path for fine granularities | ||
if matches!( | ||
granularity.as_str(), | ||
// For morden timezones, it's correct to truncate "minute" in this way. | ||
// Both datafusion and arrow are ignoring historical timezone's non-minute granularity | ||
// bias (e.g., Asia/Kathmandu before 1919 is UTC+05:41:16). | ||
"second" | "minute" | "millisecond" | "microsecond" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add some comments in 346ae59 |
||
) || | ||
// In UTC, "hour" and "day" have uniform durations and can be truncated with simple arithmetic | ||
(parsed_tz.is_none() && matches!(granularity.as_str(), "hour" | "day")) | ||
{ | ||
let result = general_date_trunc_array_fine_granularity( | ||
T::UNIT, | ||
array, | ||
granularity.as_str(), | ||
)?; | ||
return Ok(ColumnarValue::Array(result)); | ||
} | ||
|
||
let array: PrimitiveArray<T> = array | ||
.try_unary(|x| { | ||
general_date_trunc(T::UNIT, x, parsed_tz, granularity.as_str()) | ||
|
@@ -423,6 +445,55 @@ fn date_trunc_coarse(granularity: &str, value: i64, tz: Option<Tz>) -> Result<i6 | |
Ok(value.unwrap()) | ||
} | ||
|
||
/// Fast path for fine granularities (hour and smaller) that can be handled | ||
/// with simple arithmetic operations without calendar complexity. | ||
/// | ||
/// This function is timezone-agnostic and should only be used when: | ||
/// - No timezone is specified in the input, OR | ||
/// - The granularity is less than hour as hour can be affected by DST transitions in some cases | ||
fn general_date_trunc_array_fine_granularity<T: ArrowTimestampType>( | ||
tu: TimeUnit, | ||
array: &PrimitiveArray<T>, | ||
granularity: &str, | ||
) -> Result<ArrayRef> { | ||
let unit = match (tu, granularity) { | ||
(Second, "minute") => Some(Int64Array::new_scalar(60)), | ||
(Second, "hour") => Some(Int64Array::new_scalar(3600)), | ||
(Second, "day") => Some(Int64Array::new_scalar(86400)), | ||
|
||
(Millisecond, "second") => Some(Int64Array::new_scalar(1_000)), | ||
(Millisecond, "minute") => Some(Int64Array::new_scalar(60_000)), | ||
(Millisecond, "hour") => Some(Int64Array::new_scalar(3_600_000)), | ||
(Millisecond, "day") => Some(Int64Array::new_scalar(86_400_000)), | ||
|
||
(Microsecond, "millisecond") => Some(Int64Array::new_scalar(1_000)), | ||
(Microsecond, "second") => Some(Int64Array::new_scalar(1_000_000)), | ||
(Microsecond, "minute") => Some(Int64Array::new_scalar(60_000_000)), | ||
(Microsecond, "hour") => Some(Int64Array::new_scalar(3_600_000_000)), | ||
(Microsecond, "day") => Some(Int64Array::new_scalar(86_400_000_000)), | ||
|
||
(Nanosecond, "microsecond") => Some(Int64Array::new_scalar(1_000)), | ||
(Nanosecond, "millisecond") => Some(Int64Array::new_scalar(1_000_000)), | ||
(Nanosecond, "second") => Some(Int64Array::new_scalar(1_000_000_000)), | ||
(Nanosecond, "minute") => Some(Int64Array::new_scalar(60_000_000_000)), | ||
(Nanosecond, "hour") => Some(Int64Array::new_scalar(3_600_000_000_000)), | ||
(Nanosecond, "day") => Some(Int64Array::new_scalar(86_400_000_000_000)), | ||
_ => None, | ||
}; | ||
|
||
if let Some(unit) = unit { | ||
let original_type = array.data_type(); | ||
let array = arrow::compute::cast(array, &DataType::Int64)?; | ||
let array = arrow::compute::kernels::numeric::div(&array, &unit)?; | ||
let array = arrow::compute::kernels::numeric::mul(&array, &unit)?; | ||
let array = arrow::compute::cast(&array, original_type)?; | ||
Ok(array) | ||
} else { | ||
// truncate to the same or smaller unit | ||
Ok(Arc::new(array.clone())) | ||
} | ||
} | ||
|
||
// truncates a single value with the given timeunit to the specified granularity | ||
fn general_date_trunc( | ||
tu: TimeUnit, | ||
|
@@ -884,6 +955,21 @@ mod tests { | |
"2018-11-04T02:00:00-02", | ||
], | ||
), | ||
( | ||
vec![ | ||
"2024-10-26T23:30:00Z", | ||
"2024-10-27T00:30:00Z", | ||
"2024-10-27T01:30:00Z", | ||
"2024-10-27T02:30:00Z", | ||
], | ||
Some("Asia/Kathmandu".into()), // UTC+5:45 | ||
vec![ | ||
"2024-10-27T05:00:00+05:45", | ||
"2024-10-27T06:00:00+05:45", | ||
"2024-10-27T07:00:00+05:45", | ||
"2024-10-27T08:00:00+05:45", | ||
], | ||
), | ||
]; | ||
|
||
cases.iter().for_each(|(original, tz_opt, expected)| { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
morden -> modern?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
fix them all 😎 #17135