diff --git a/datafusion/functions/src/datetime/date_trunc.rs b/datafusion/functions/src/datetime/date_trunc.rs index 8963ef77a53b..d3d52e237e15 100644 --- a/datafusion/functions/src/datetime/date_trunc.rs +++ b/datafusion/functions/src/datetime/date_trunc.rs @@ -28,7 +28,7 @@ use arrow::array::types::{ ArrowTimestampType, TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, }; -use arrow::array::{Array, PrimitiveArray}; +use arrow::array::{Array, ArrayRef, Int64Array, PrimitiveArray}; use arrow::datatypes::DataType::{self, Null, Timestamp, Utf8, Utf8View}; use arrow::datatypes::TimeUnit::{self, Microsecond, Millisecond, Nanosecond, Second}; use datafusion_common::cast::as_primitive_array; @@ -60,6 +60,8 @@ use chrono::{ - hour / HOUR - minute / MINUTE - second / SECOND + - millisecond / MILLISECOND + - microsecond / MICROSECOND "# ), argument( @@ -185,6 +187,26 @@ impl ScalarUDFImpl for DateTruncFunc { ) -> Result { let parsed_tz = parse_tz(tz_opt)?; let array = as_primitive_array::(array)?; + + // fast path for fine granularities + if matches!( + granularity.as_str(), + // For modern timezones, it's correct to truncate "minute" in this way. + // Both datafusion and arrow ignore historical timezones' offsets that are not a whole + // number of minutes (e.g., Asia/Kathmandu before 1919 is UTC+05:41:16). 
+ "second" | "minute" | "millisecond" | "microsecond" + ) || + // In UTC, "hour" and "day" have uniform durations and can be truncated with simple arithmetic + (parsed_tz.is_none() && matches!(granularity.as_str(), "hour" | "day")) + { + let result = general_date_trunc_array_fine_granularity( + T::UNIT, + array, + granularity.as_str(), + )?; + return Ok(ColumnarValue::Array(result)); + } + let array: PrimitiveArray = array .try_unary(|x| { general_date_trunc(T::UNIT, x, parsed_tz, granularity.as_str()) @@ -423,6 +445,55 @@ fn date_trunc_coarse(granularity: &str, value: i64, tz: Option) -> Result( + tu: TimeUnit, + array: &PrimitiveArray, + granularity: &str, +) -> Result { + let unit = match (tu, granularity) { + (Second, "minute") => Some(Int64Array::new_scalar(60)), + (Second, "hour") => Some(Int64Array::new_scalar(3600)), + (Second, "day") => Some(Int64Array::new_scalar(86400)), + + (Millisecond, "second") => Some(Int64Array::new_scalar(1_000)), + (Millisecond, "minute") => Some(Int64Array::new_scalar(60_000)), + (Millisecond, "hour") => Some(Int64Array::new_scalar(3_600_000)), + (Millisecond, "day") => Some(Int64Array::new_scalar(86_400_000)), + + (Microsecond, "millisecond") => Some(Int64Array::new_scalar(1_000)), + (Microsecond, "second") => Some(Int64Array::new_scalar(1_000_000)), + (Microsecond, "minute") => Some(Int64Array::new_scalar(60_000_000)), + (Microsecond, "hour") => Some(Int64Array::new_scalar(3_600_000_000)), + (Microsecond, "day") => Some(Int64Array::new_scalar(86_400_000_000)), + + (Nanosecond, "microsecond") => Some(Int64Array::new_scalar(1_000)), + (Nanosecond, "millisecond") => Some(Int64Array::new_scalar(1_000_000)), + (Nanosecond, "second") => Some(Int64Array::new_scalar(1_000_000_000)), + (Nanosecond, "minute") => Some(Int64Array::new_scalar(60_000_000_000)), + (Nanosecond, "hour") => Some(Int64Array::new_scalar(3_600_000_000_000)), + (Nanosecond, "day") => Some(Int64Array::new_scalar(86_400_000_000_000)), + _ => None, + }; + + if let 
Some(unit) = unit { + let original_type = array.data_type(); + let array = arrow::compute::cast(array, &DataType::Int64)?; + let array = arrow::compute::kernels::numeric::div(&array, &unit)?; + let array = arrow::compute::kernels::numeric::mul(&array, &unit)?; + let array = arrow::compute::cast(&array, original_type)?; + Ok(array) + } else { + // truncate to the same or smaller unit + Ok(Arc::new(array.clone())) + } +} + // truncates a single value with the given timeunit to the specified granularity fn general_date_trunc( tu: TimeUnit, @@ -884,6 +955,21 @@ mod tests { "2018-11-04T02:00:00-02", ], ), + ( + vec![ + "2024-10-26T23:30:00Z", + "2024-10-27T00:30:00Z", + "2024-10-27T01:30:00Z", + "2024-10-27T02:30:00Z", + ], + Some("Asia/Kathmandu".into()), // UTC+5:45 + vec![ + "2024-10-27T05:00:00+05:45", + "2024-10-27T06:00:00+05:45", + "2024-10-27T07:00:00+05:45", + "2024-10-27T08:00:00+05:45", + ], + ), ]; cases.iter().for_each(|(original, tz_opt, expected)| { diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index 7b4bb71d1c59..d49fc22dabb4 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -2150,6 +2150,8 @@ date_trunc(precision, expression) - hour / HOUR - minute / MINUTE - second / SECOND + - millisecond / MILLISECOND + - microsecond / MICROSECOND - **expression**: Time expression to operate on. Can be a constant, column, or function.