From 73f7128b62b2ae070960fa8efccdf2bb305b7276 Mon Sep 17 00:00:00 2001 From: Matthew Kim <38759997+friendlymatthew@users.noreply.github.com> Date: Sat, 22 Mar 2025 15:50:53 -0400 Subject: [PATCH 01/14] Format Date32 to string given timestamp specifiers --- datafusion/functions/src/datetime/to_char.rs | 64 +++++++++++++++++++- 1 file changed, 62 insertions(+), 2 deletions(-) diff --git a/datafusion/functions/src/datetime/to_char.rs b/datafusion/functions/src/datetime/to_char.rs index 8b2e5ad87471..3e5f1551bff9 100644 --- a/datafusion/functions/src/datetime/to_char.rs +++ b/datafusion/functions/src/datetime/to_char.rs @@ -20,6 +20,7 @@ use std::sync::Arc; use arrow::array::cast::AsArray; use arrow::array::{new_null_array, Array, ArrayRef, StringArray}; +use arrow::compute::cast; use arrow::datatypes::DataType; use arrow::datatypes::DataType::{ Date32, Date64, Duration, Time32, Time64, Timestamp, Utf8, @@ -210,7 +211,7 @@ fn _to_char_scalar( // of the implementation in arrow-rs we need to convert it to an array let data_type = &expression.data_type(); let is_scalar_expression = matches!(&expression, ColumnarValue::Scalar(_)); - let array = expression.into_array(1)?; + let array = expression.clone().into_array(1)?; if format.is_none() { if is_scalar_expression { @@ -247,6 +248,10 @@ fn _to_char_scalar( )) } } else { + if data_type == &Date32 { + return _to_char_scalar(expression.clone().cast_to(&Date64, None)?, format); + } + exec_err!("{}", formatted.unwrap_err()) } } @@ -277,7 +282,25 @@ fn _to_char_array(args: &[ColumnarValue]) -> Result { let result = formatter.value(idx).try_to_string(); match result { Ok(value) => results.push(Some(value)), - Err(e) => return exec_err!("{}", e), + Err(e) => { + if data_type == &Date32 { + let format_options = match _build_format_options(&Date64, format) { + Ok(value) => value, + Err(value) => return value, + }; + + let array = cast(arrays[0].as_ref(), &Date64)?; + let formatter = + ArrayFormatter::try_new(array.as_ref(), &format_options)?; + let result = formatter.value(idx).try_to_string(); + match result { + Ok(value) => results.push(Some(value)), + Err(e) => return exec_err!("{}", e), + } + } else { + return exec_err!("{}", e); + } + } } } @@ -328,6 +351,11 @@ mod tests { ScalarValue::Utf8(Some("%Y::%m::%d".to_string())), "2020::09::01".to_string(), ), + ( + ScalarValue::Date32(Some(18506)), + ScalarValue::Utf8(Some("%Y::%m::%d %S::%M::%H %f".to_string())), + "2020::09::01 00::00::00 000000000".to_string(), + ), ( ScalarValue::Date64(Some(date.and_utc().timestamp_millis())), ScalarValue::Utf8(Some("%Y::%m::%d".to_string())), @@ -407,6 +435,11 @@ mod tests { StringArray::from(vec!["%Y::%m::%d".to_string()]), "2020::09::01".to_string(), ), + ( + ScalarValue::Date32(Some(18506)), + StringArray::from(vec!["%Y::%m::%d %S::%M::%H %f".to_string()]), + "2020::09::01 00::00::00 000000000".to_string(), + ), ( ScalarValue::Date64(Some(date.and_utc().timestamp_millis())), StringArray::from(vec!["%Y::%m::%d".to_string()]), @@ -490,6 +523,14 @@ mod tests { ScalarValue::Utf8(Some("%Y::%m::%d".to_string())), StringArray::from(vec!["2020::09::01", "2020::09::02"]), ), + ( + Arc::new(Date32Array::from(vec![18506, 18507])) as ArrayRef, + ScalarValue::Utf8(Some("%Y::%m::%d %S::%M::%H %f".to_string())), + StringArray::from(vec![ + "2020::09::01 00::00::00 000000000", + "2020::09::02 00::00::00 000000000", + ]), + ), ( Arc::new(Date64Array::from(vec![ date.and_utc().timestamp_millis(), @@ -506,6 +547,25 @@ mod tests { StringArray::from(vec!["%Y::%m::%d", "%d::%m::%Y"]), StringArray::from(vec!["2020::09::01", "02::09::2020"]), ), + ( + Arc::new(Date32Array::from(vec![18506, 18507])) as ArrayRef, + StringArray::from(vec![ + "%Y::%m::%d %S::%M::%H %f", + "%Y::%m::%d %S::%M::%H %f", + ]), + StringArray::from(vec![ + "2020::09::01 00::00::00 000000000", + "2020::09::02 00::00::00 000000000", + ]), + ), + ( + Arc::new(Date32Array::from(vec![18506, 18507])) as ArrayRef, + StringArray::from(vec!["%Y::%m::%d", "%Y::%m::%d %S::%M::%H %f"]), + StringArray::from(vec![ + "2020::09::01", + "2020::09::02 00::00::00 000000000", + ]), + ), ( Arc::new(Date64Array::from(vec![ date.and_utc().timestamp_millis(), From 3324c9466ad9e916cd3f1e3a4d4739d6ce31c7ef Mon Sep 17 00:00:00 2001 From: Matthew Kim <38759997+friendlymatthew@users.noreply.github.com> Date: Wed, 26 Mar 2025 22:59:44 -0400 Subject: [PATCH 02/14] Eagerly cast Date32 to Date64 before formatting --- datafusion/functions/src/datetime/to_char.rs | 50 +++++++++----------- 1 file changed, 22 insertions(+), 28 deletions(-) diff --git a/datafusion/functions/src/datetime/to_char.rs b/datafusion/functions/src/datetime/to_char.rs index 3e5f1551bff9..869c4062a6aa 100644 --- a/datafusion/functions/src/datetime/to_char.rs +++ b/datafusion/functions/src/datetime/to_char.rs @@ -209,9 +209,9 @@ fn _to_char_scalar( ) -> Result { // it's possible that the expression is a scalar however because // of the implementation in arrow-rs we need to convert it to an array - let data_type = &expression.data_type(); + let mut data_type = &expression.data_type(); let is_scalar_expression = matches!(&expression, ColumnarValue::Scalar(_)); - let array = expression.clone().into_array(1)?; + let mut array = expression.clone().into_array(1)?; if format.is_none() { if is_scalar_expression { @@ -221,6 +221,13 @@ fn _to_char_scalar( } } + // eagerly cast Date32 values to Date64 to support date formatting with time-related specifiers + // without error. + if data_type == &Date32 { + data_type = &Date64; + array = cast(array.as_ref(), data_type)?; + } + let format_options = match _build_format_options(data_type, format) { Ok(value) => value, Err(value) => return value, @@ -248,10 +255,6 @@ fn _to_char_scalar( )) } } else { - if data_type == &Date32 { - return _to_char_scalar(expression.clone().cast_to(&Date64, None)?, format); - } - exec_err!("{}", formatted.unwrap_err()) } } @@ -260,9 +263,18 @@ fn _to_char_array(args: &[ColumnarValue]) -> Result { let arrays = ColumnarValue::values_to_arrays(args)?; let mut results: Vec> = vec![]; let format_array = arrays[1].as_string::(); - let data_type = arrays[0].data_type(); - for idx in 0..arrays[0].len() { + let mut values = Arc::clone(&arrays[0]); + let mut data_type = arrays[0].data_type(); + + // eagerly cast Date32 values to Date64 to support date formatting with time-related specifiers + // without error. + if data_type == &Date32 { + data_type = &Date64; + values = cast(values.as_ref(), data_type)?; + } + + for idx in 0..values.len() { let format = if format_array.is_null(idx) { None } else { @@ -278,29 +290,11 @@ fn _to_char_array(args: &[ColumnarValue]) -> Result { }; // this isn't ideal but this can't use ValueFormatter as it isn't independent // from ArrayFormatter - let formatter = ArrayFormatter::try_new(arrays[0].as_ref(), &format_options)?; + let formatter = ArrayFormatter::try_new(values.as_ref(), &format_options)?; let result = formatter.value(idx).try_to_string(); match result { Ok(value) => results.push(Some(value)), - Err(e) => { - if data_type == &Date32 { - let format_options = match _build_format_options(&Date64, format) { - Ok(value) => value, - Err(value) => return value, - }; - - let array = cast(arrays[0].as_ref(), &Date64)?; - let formatter = - ArrayFormatter::try_new(array.as_ref(), &format_options)?; - let result = formatter.value(idx).try_to_string(); - match result { - Ok(value) => results.push(Some(value)), - Err(e) => return exec_err!("{}", e), - } - } else { - return exec_err!("{}", e); - } - } + Err(e) => return exec_err!("{}", e), } } From 6930caae8d8f2226d1ba563826130a90fa688841 Mon Sep 17 00:00:00 2001 From: Matthew Kim <38759997+friendlymatthew@users.noreply.github.com> Date: Thu, 27 Mar 2025 14:46:07 -0400 Subject: [PATCH 03/14] Remove superfluous clone --- datafusion/functions/src/datetime/to_char.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/functions/src/datetime/to_char.rs b/datafusion/functions/src/datetime/to_char.rs index 869c4062a6aa..92f806b9dd66 100644 --- a/datafusion/functions/src/datetime/to_char.rs +++ b/datafusion/functions/src/datetime/to_char.rs @@ -211,7 +211,7 @@ fn _to_char_scalar( // of the implementation in arrow-rs we need to convert it to an array let mut data_type = &expression.data_type(); let is_scalar_expression = matches!(&expression, ColumnarValue::Scalar(_)); - let mut array = expression.clone().into_array(1)?; + let mut array = expression.into_array(1)?; if format.is_none() { if is_scalar_expression { From deaabe49b682b9b108c2add6a091fd0b93340ea8 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Wed, 2 Apr 2025 21:31:42 +0000 Subject: [PATCH 04/14] POC for parsing format with StrftimeItems to determine if there are any time specifiers present. --- datafusion/functions/benches/to_char.rs | 65 ++++++++++- datafusion/functions/src/datetime/to_char.rs | 101 +++++++++++++----- .../sqllogictest/test_files/timestamps.slt | 12 +++ 3 files changed, 150 insertions(+), 28 deletions(-) diff --git a/datafusion/functions/benches/to_char.rs b/datafusion/functions/benches/to_char.rs index 6f20a20dc219..eefe172c8f13 100644 --- a/datafusion/functions/benches/to_char.rs +++ b/datafusion/functions/benches/to_char.rs @@ -80,8 +80,27 @@ fn patterns(rng: &mut ThreadRng) -> StringArray { StringArray::from(data) } +fn patterns_with_time_specifiers(rng: &mut ThreadRng) -> StringArray { + let samples = [ + "%Y:%m:%d %H:%M%S".to_string(), + "%Y:%m:%d %_H:%M%S".to_string(), + "%Y:%m:%d %k:%M%S".to_string(), + "%d-%m-%Y %I%P-%M-%S %f".to_string(), + "%d%m%Y %H".to_string(), + "%Y%m%d %M-%S %.3f".to_string(), + "%Y...%m...%d %T%3f".to_string(), + "%c".to_string(), + ]; + let mut data: Vec = vec![]; + for _ in 0..1000 { + data.push(samples.choose(rng).unwrap().to_string()); + } + + StringArray::from(data) +} + fn criterion_benchmark(c: &mut Criterion) { - c.bench_function("to_char_array_array_1000", |b| { + c.bench_function("to_char_array_array_date_only_patterns_1000", |b| { let mut rng = rand::thread_rng(); let data_arr = data(&mut rng); let batch_len = data_arr.len(); @@ -101,7 +120,29 @@ fn criterion_benchmark(c: &mut Criterion) { }) }); - c.bench_function("to_char_array_scalar_1000", |b| { + c.bench_function("to_char_array_array_datetime_patterns_1000", |b| { + let mut rng = rand::thread_rng(); + let data_arr = data(&mut rng); + let batch_len = data_arr.len(); + let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef); + let patterns = ColumnarValue::Array(Arc::new(patterns_with_time_specifiers( + &mut rng, + )) as ArrayRef); + + b.iter(|| { + black_box( + to_char() + .invoke_with_args(ScalarFunctionArgs { + args: vec![data.clone(), patterns.clone()], + number_rows: batch_len, + return_type: &DataType::Utf8, + }) + .expect("to_char should work on valid values"), + ) + }) + }); + + c.bench_function("to_char_array_scalar_date_only_pattern_1000", |b| { let mut rng = rand::thread_rng(); let data_arr = data(&mut rng); let batch_len = data_arr.len(); @@ -122,6 +163,26 @@ fn criterion_benchmark(c: &mut Criterion) { }) }); + c.bench_function("to_char_array_scalar_datetime_pattern_1000", |b| { + let mut rng = rand::thread_rng(); + let data_arr = data(&mut rng); + let batch_len = data_arr.len(); + let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef); + let patterns = ColumnarValue::Scalar(ScalarValue::Utf8(Some("%c".to_string()))); + + b.iter(|| { + black_box( + to_char() + .invoke_with_args(ScalarFunctionArgs { + args: vec![data.clone(), patterns.clone()], + number_rows: batch_len, + return_type: &DataType::Utf8, + }) + .expect("to_char should work on valid values"), + ) + }) + }); + c.bench_function("to_char_scalar_scalar_1000", |b| { let timestamp = "2026-07-08T09:10:11" .parse::() diff --git a/datafusion/functions/src/datetime/to_char.rs b/datafusion/functions/src/datetime/to_char.rs index 92f806b9dd66..b9988ad81f5d 100644 --- a/datafusion/functions/src/datetime/to_char.rs +++ b/datafusion/functions/src/datetime/to_char.rs @@ -19,7 +19,7 @@ use std::any::Any; use std::sync::Arc; use arrow::array::cast::AsArray; -use arrow::array::{new_null_array, Array, ArrayRef, StringArray}; +use arrow::array::{new_null_array, Array, ArrayRef, GenericStringArray, StringArray}; use arrow::compute::cast; use arrow::datatypes::DataType; use arrow::datatypes::DataType::{ @@ -28,7 +28,7 @@ use arrow::datatypes::DataType::{ use arrow::datatypes::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second}; use arrow::error::ArrowError; use arrow::util::display::{ArrayFormatter, DurationFormat, FormatOptions}; - +use chrono::format::{Fixed, Item, Numeric, StrftimeItems}; use datafusion_common::{exec_err, utils::take_function_args, Result, ScalarValue}; use datafusion_expr::TypeSignature::Exact; use datafusion_expr::{ @@ -146,14 +146,14 @@ impl ScalarUDFImpl for ToCharFunc { match format { ColumnarValue::Scalar(ScalarValue::Utf8(None)) | ColumnarValue::Scalar(ScalarValue::Null) => { - _to_char_scalar(date_time.clone(), None) + to_char_scalar(date_time.clone(), None) } // constant format ColumnarValue::Scalar(ScalarValue::Utf8(Some(format))) => { // invoke to_char_scalar with the known string, without converting to array - _to_char_scalar(date_time.clone(), Some(format)) + to_char_scalar(date_time.clone(), Some(format)) } - ColumnarValue::Array(_) => _to_char_array(&args), + ColumnarValue::Array(_) => to_char_array(&args), _ => { exec_err!( "Format for `to_char` must be non-null Utf8, received {:?}", @@ -171,7 +171,7 @@ impl ScalarUDFImpl for ToCharFunc { } } -fn _build_format_options<'a>( +fn build_format_options<'a>( data_type: &DataType, format: Option<&'a str>, ) -> Result, Result> { @@ -203,7 +203,7 @@ fn _build_format_options<'a>( } /// Special version when arg\[1] is a scalar -fn _to_char_scalar( +fn to_char_scalar( expression: ColumnarValue, format: Option<&str>, ) -> Result { @@ -221,14 +221,12 @@ fn _to_char_scalar( } } - // eagerly cast Date32 values to Date64 to support date formatting with time-related specifiers - // without error. - if data_type == &Date32 { + if data_type == &Date32 && has_time_specifier(format.unwrap()) { data_type = &Date64; array = cast(array.as_ref(), data_type)?; } - let format_options = match _build_format_options(data_type, format) { + let format_options = match build_format_options(data_type, format) { Ok(value) => value, Err(value) => return value, }; @@ -259,22 +257,21 @@ fn _to_char_scalar( } } -fn _to_char_array(args: &[ColumnarValue]) -> Result { - let arrays = ColumnarValue::values_to_arrays(args)?; - let mut results: Vec> = vec![]; - let format_array = arrays[1].as_string::(); - - let mut values = Arc::clone(&arrays[0]); - let mut data_type = arrays[0].data_type(); +fn to_char_array(args: &[ColumnarValue]) -> Result { + let mut arrays = ColumnarValue::values_to_arrays(args)?; + let (data, right) = arrays.split_at_mut(1); + let format_array = right[0].as_string::(); + let mut data_type = data[0].data_type(); - // eagerly cast Date32 values to Date64 to support date formatting with time-related specifiers - // without error. - if data_type == &Date32 { + if data_type == &Date32 && has_any_time_specifiers(format_array) { data_type = &Date64; - values = cast(values.as_ref(), data_type)?; + data[0] = cast(data[0].as_ref(), data_type)?; } - for idx in 0..values.len() { + let data = &data[0]; + let mut results: Vec> = vec![]; + + for idx in 0..data.len() { let format = if format_array.is_null(idx) { None } else { @@ -284,13 +281,13 @@ fn _to_char_array(args: &[ColumnarValue]) -> Result { results.push(None); continue; } - let format_options = match _build_format_options(data_type, format) { + let format_options = match build_format_options(data_type, format) { Ok(value) => value, Err(value) => return value, }; // this isn't ideal but this can't use ValueFormatter as it isn't independent - // from ArrayFormatter - let formatter = ArrayFormatter::try_new(values.as_ref(), &format_options)?; + // of ArrayFormatter + let formatter = ArrayFormatter::try_new(data.as_ref(), &format_options)?; let result = formatter.value(idx).try_to_string(); match result { Ok(value) => results.push(Some(value)), @@ -311,6 +308,58 @@ fn _to_char_array(args: &[ColumnarValue]) -> Result { } } +fn has_time_specifier(format: &str) -> bool { + let items = StrftimeItems::new(format); + for item in items { + match item { + Item::Fixed(v) => match v { + Fixed::LowerAmPm => return true, + Fixed::UpperAmPm => return true, + Fixed::RFC2822 => return true, + Fixed::RFC3339 => return true, + Fixed::Nanosecond => return true, + Fixed::Nanosecond3 => return true, + Fixed::Nanosecond6 => return true, + Fixed::Nanosecond9 => return true, + Fixed::TimezoneName => return true, + Fixed::TimezoneOffset => return true, + Fixed::TimezoneOffsetZ => return true, + Fixed::TimezoneOffsetColon => return true, + Fixed::TimezoneOffsetColonZ => return true, + Fixed::TimezoneOffsetDoubleColon => return true, + Fixed::TimezoneOffsetTripleColon => return true, + _ => (), + }, + Item::Numeric(v, _pad) => match v { + Numeric::Hour => return true, + Numeric::Hour12 => return true, + Numeric::Minute => return true, + Numeric::Second => return true, + Numeric::Nanosecond => return true, + Numeric::Timestamp => return true, + _ => (), + }, + _ => (), + } + } + + false +} + +fn has_any_time_specifiers(format_array: &GenericStringArray) -> bool { + for idx in 0..format_array.len() { + if format_array.is_null(idx) { + continue; + } + + if has_time_specifier(format_array.value(idx)) { + return true; + } + } + + false +} + #[cfg(test)] mod tests { use crate::datetime::to_char::ToCharFunc; diff --git a/datafusion/sqllogictest/test_files/timestamps.slt b/datafusion/sqllogictest/test_files/timestamps.slt index 16168eeae4a1..8ffd7237ed5b 100644 --- a/datafusion/sqllogictest/test_files/timestamps.slt +++ b/datafusion/sqllogictest/test_files/timestamps.slt @@ -2794,6 +2794,18 @@ select date_format(dates, date_format) from formats; 01:01:2000 05:04:2003 +query T +select date_format(dates, time_format) from formats; +---- +00-00-00 +00::00::00 + +query T +select date_format(dates, timestamp_format) from formats; +---- +01:01:2000 00-00-00 +05:04:2003 00-00-00 + query T select to_char(times, time_format) from formats; ---- From 01b1bc09dffe52b6511f73d5f207b4f99506d21b Mon Sep 17 00:00:00 2001 From: Matthew Kim <38759997+friendlymatthew@users.noreply.github.com> Date: Sun, 6 Apr 2025 12:17:29 -0400 Subject: [PATCH 05/14] Update benchmark code --- datafusion/functions/benches/to_char.rs | 141 ++++++++++++++++-------- 1 file changed, 96 insertions(+), 45 deletions(-) diff --git a/datafusion/functions/benches/to_char.rs b/datafusion/functions/benches/to_char.rs index eefe172c8f13..4ddfe2d65287 100644 --- a/datafusion/functions/benches/to_char.rs +++ b/datafusion/functions/benches/to_char.rs @@ -33,7 +33,7 @@ use datafusion_common::ScalarValue::TimestampNanosecond; use datafusion_expr::{ColumnarValue, ScalarFunctionArgs}; use datafusion_functions::datetime::to_char; -fn random_date_in_range( +fn pick_date_in_range( rng: &mut ThreadRng, start_date: NaiveDate, end_date: NaiveDate, @@ -43,7 +43,7 @@ fn random_date_in_range( start_date + TimeDelta::try_days(random_days).unwrap() } -fn data(rng: &mut ThreadRng) -> Date32Array { +fn generate_date32_array(rng: &mut ThreadRng) -> Date32Array { let mut data: Vec = vec![]; let unix_days_from_ce = NaiveDate::from_ymd_opt(1970, 1, 1) .unwrap() @@ -56,7 +56,7 @@ fn data(rng: &mut ThreadRng) -> Date32Array { .expect("Date should parse"); for _ in 0..1000 { data.push( - random_date_in_range(rng, start_date, end_date).num_days_from_ce() + pick_date_in_range(rng, start_date, end_date).num_days_from_ce() - unix_days_from_ce, ); } @@ -64,48 +64,75 @@ fn data(rng: &mut ThreadRng) -> Date32Array { Date32Array::from(data) } -fn patterns(rng: &mut ThreadRng) -> StringArray { - let samples = [ - "%Y:%m:%d".to_string(), - "%d-%m-%Y".to_string(), - "%d%m%Y".to_string(), - "%Y%m%d".to_string(), - "%Y...%m...%d".to_string(), - ]; - let mut data: Vec = vec![]; - for _ in 0..1000 { - data.push(samples.choose(rng).unwrap().to_string()); - } +const DATE_PATTERNS: [&str; 5] = + ["%Y:%m:%d", "%d-%m-%Y", "%d%m%Y", "%Y%m%d", "%Y...%m...%d"]; - StringArray::from(data) +const DATETIME_PATTERNS: [&str; 8] = [ + "%Y:%m:%d %H:%M%S", + "%Y:%m:%d %_H:%M%S", + "%Y:%m:%d %k:%M%S", + "%d-%m-%Y %I%P-%M-%S %f", + "%d%m%Y %H", + "%Y%m%d %M-%S %.3f", + "%Y...%m...%d %T%3f", + "%c", +]; + +fn pick_date_pattern(rng: &mut ThreadRng) -> String { + DATE_PATTERNS + .choose(rng) + .expect("Empty list of date patterns") + .to_string() +} + +fn pick_date_time_pattern(rng: &mut ThreadRng) -> String { + DATETIME_PATTERNS + .choose(rng) + .expect("Empty list of date time patterns") + .to_string() } -fn patterns_with_time_specifiers(rng: &mut ThreadRng) -> StringArray { - let samples = [ - "%Y:%m:%d %H:%M%S".to_string(), - "%Y:%m:%d %_H:%M%S".to_string(), - "%Y:%m:%d %k:%M%S".to_string(), - "%d-%m-%Y %I%P-%M-%S %f".to_string(), - "%d%m%Y %H".to_string(), - "%Y%m%d %M-%S %.3f".to_string(), - "%Y...%m...%d %T%3f".to_string(), - "%c".to_string(), - ]; - let mut data: Vec = vec![]; +fn pick_date_and_date_time_mixed_pattern(rng: &mut ThreadRng) -> String { + match rng.gen_bool(0.5) { + true => pick_date_pattern(rng), + false => pick_date_time_pattern(rng), + } +} + +fn generate_pattern_array( + rng: &mut ThreadRng, + pick_fn: impl Fn(&mut ThreadRng) -> String, +) -> StringArray { + let mut data = Vec::with_capacity(1000); + for _ in 0..1000 { - data.push(samples.choose(rng).unwrap().to_string()); + data.push(pick_fn(rng)); } StringArray::from(data) } +fn generate_date_pattern_array(rng: &mut ThreadRng) -> StringArray { + generate_pattern_array(rng, pick_date_pattern) +} + +fn generate_datetime_pattern_array(rng: &mut ThreadRng) -> StringArray { + generate_pattern_array(rng, pick_date_time_pattern) +} + +fn generate_mixed_pattern_array(rng: &mut ThreadRng) -> StringArray { + generate_pattern_array(rng, pick_date_and_date_time_mixed_pattern) +} + fn criterion_benchmark(c: &mut Criterion) { - c.bench_function("to_char_array_array_date_only_patterns_1000", |b| { + c.bench_function("to_char_array_date_only_patterns_1000", |b| { let mut rng = rand::thread_rng(); - let data_arr = data(&mut rng); + let data_arr = generate_date32_array(&mut rng); let batch_len = data_arr.len(); let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef); - let patterns = ColumnarValue::Array(Arc::new(patterns(&mut rng)) as ArrayRef); + let patterns = ColumnarValue::Array(Arc::new(generate_date_pattern_array( + &mut rng, + )) as ArrayRef); b.iter(|| { black_box( @@ -120,12 +147,12 @@ fn criterion_benchmark(c: &mut Criterion) { }) }); - c.bench_function("to_char_array_array_datetime_patterns_1000", |b| { + c.bench_function("to_char_array_datetime_patterns_1000", |b| { let mut rng = rand::thread_rng(); - let data_arr = data(&mut rng); + let data_arr = generate_date32_array(&mut rng); let batch_len = data_arr.len(); let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef); - let patterns = ColumnarValue::Array(Arc::new(patterns_with_time_specifiers( + let patterns = ColumnarValue::Array(Arc::new(generate_datetime_pattern_array( &mut rng, )) as ArrayRef); @@ -142,13 +169,35 @@ fn criterion_benchmark(c: &mut Criterion) { }) }); - c.bench_function("to_char_array_scalar_date_only_pattern_1000", |b| { + c.bench_function("to_char_array_mixed_patterns_1000", |b| { let mut rng = rand::thread_rng(); - let data_arr = data(&mut rng); + let data_arr = generate_date32_array(&mut rng); + let batch_len = data_arr.len(); + let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef); + let patterns = ColumnarValue::Array(Arc::new(generate_mixed_pattern_array( + &mut rng, + )) as ArrayRef); + + b.iter(|| { + black_box( + to_char() + .invoke_with_args(ScalarFunctionArgs { + args: vec![data.clone(), patterns.clone()], + number_rows: batch_len, + return_type: &DataType::Utf8, + }) + .expect("to_char should work on valid values"), + ) + }) + }); + + c.bench_function("to_char_scalar_date_only_pattern_1000", |b| { + let mut rng = rand::thread_rng(); + let data_arr = generate_date32_array(&mut rng); let batch_len = data_arr.len(); let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef); let patterns = - ColumnarValue::Scalar(ScalarValue::Utf8(Some("%Y-%m-%d".to_string()))); + ColumnarValue::Scalar(ScalarValue::Utf8(Some(pick_date_pattern(&mut rng)))); b.iter(|| { black_box( @@ -163,12 +212,14 @@ fn criterion_benchmark(c: &mut Criterion) { }) }); - c.bench_function("to_char_array_scalar_datetime_pattern_1000", |b| { + c.bench_function("to_char_scalar_datetime_pattern_1000", |b| { let mut rng = rand::thread_rng(); - let data_arr = data(&mut rng); + let data_arr = generate_date32_array(&mut rng); let batch_len = data_arr.len(); let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef); - let patterns = ColumnarValue::Scalar(ScalarValue::Utf8(Some("%c".to_string()))); + let patterns = ColumnarValue::Scalar(ScalarValue::Utf8(Some( + pick_date_time_pattern(&mut rng), + ))); b.iter(|| { black_box( @@ -183,7 +234,8 @@ fn criterion_benchmark(c: &mut Criterion) { }) }); - c.bench_function("to_char_scalar_scalar_1000", |b| { + c.bench_function("to_char_scalar_1000", |b| { + let mut rng = rand::thread_rng(); let timestamp = "2026-07-08T09:10:11" .parse::() .unwrap() @@ -193,9 +245,8 @@ fn criterion_benchmark(c: &mut Criterion) { .timestamp_nanos_opt() .unwrap(); let data = ColumnarValue::Scalar(TimestampNanosecond(Some(timestamp), None)); - let pattern = ColumnarValue::Scalar(ScalarValue::Utf8(Some( - "%d-%m-%Y %H:%M:%S".to_string(), - ))); + let pattern = + ColumnarValue::Scalar(ScalarValue::Utf8(Some(pick_date_pattern(&mut rng)))); b.iter(|| { black_box( From f906af55df1b9c9bbfa15513da2e8ca17580e783 Mon Sep 17 00:00:00 2001 From: Matthew Kim <38759997+friendlymatthew@users.noreply.github.com> Date: Thu, 10 Apr 2025 14:43:43 -0400 Subject: [PATCH 06/14] Stage initial selective retry --- datafusion/functions/src/datetime/to_char.rs | 158 ++++++++++--------- 1 file changed, 81 insertions(+), 77 deletions(-) diff --git a/datafusion/functions/src/datetime/to_char.rs b/datafusion/functions/src/datetime/to_char.rs index b9988ad81f5d..985ac13a8376 100644 --- a/datafusion/functions/src/datetime/to_char.rs +++ b/datafusion/functions/src/datetime/to_char.rs @@ -19,7 +19,7 @@ use std::any::Any; use std::sync::Arc; use arrow::array::cast::AsArray; -use arrow::array::{new_null_array, Array, ArrayRef, GenericStringArray, StringArray}; +use arrow::array::{new_null_array, Array, ArrayRef, StringArray}; use arrow::compute::cast; use arrow::datatypes::DataType; use arrow::datatypes::DataType::{ @@ -28,7 +28,6 @@ use arrow::datatypes::DataType::{ use arrow::datatypes::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second}; use arrow::error::ArrowError; use arrow::util::display::{ArrayFormatter, DurationFormat, FormatOptions}; -use chrono::format::{Fixed, Item, Numeric, StrftimeItems}; use datafusion_common::{exec_err, utils::take_function_args, Result, ScalarValue}; use datafusion_expr::TypeSignature::Exact; use datafusion_expr::{ @@ -179,7 +178,9 @@ fn build_format_options<'a>( return Ok(FormatOptions::new()); }; let format_options = match data_type { - Date32 => FormatOptions::new().with_date_format(Some(format)), + Date32 => FormatOptions::new() + .with_date_format(Some(format)) + .with_datetime_format(Some(format)), Date64 => FormatOptions::new().with_datetime_format(Some(format)), Time32(_) => FormatOptions::new().with_time_format(Some(format)), Time64(_) => FormatOptions::new().with_time_format(Some(format)), @@ -209,21 +210,16 @@ fn to_char_scalar( ) -> Result { // it's possible that the expression is a scalar however because // of the implementation in arrow-rs we need to convert it to an array - let mut data_type = &expression.data_type(); + let data_type = &expression.data_type(); let is_scalar_expression = matches!(&expression, ColumnarValue::Scalar(_)); - let mut array = expression.into_array(1)?; + let array = expression.clone().into_array(1)?; if format.is_none() { - if is_scalar_expression { - return Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None))); + return if is_scalar_expression { + Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None))) } else { - return Ok(ColumnarValue::Array(new_null_array(&Utf8, array.len()))); - } - } - - if data_type == &Date32 && has_time_specifier(format.unwrap()) { - data_type = &Date64; - array = cast(array.as_ref(), data_type)?; + Ok(ColumnarValue::Array(new_null_array(&Utf8, array.len()))) + }; } let format_options = match build_format_options(data_type, format) { @@ -253,25 +249,23 @@ fn to_char_scalar( )) } } else { + // if the format attempt is with a Date32, it's possible the attempt failed because + // the format string contained time-specifiers, so we'll retry casting as Date64 + if data_type == &Date32 { + return to_char_scalar(expression.clone().cast_to(&Date64, None)?, format); + } + exec_err!("{}", formatted.unwrap_err()) } } fn to_char_array(args: &[ColumnarValue]) -> Result { - let mut arrays = ColumnarValue::values_to_arrays(args)?; - let (data, right) = arrays.split_at_mut(1); - let format_array = right[0].as_string::(); - let mut data_type = data[0].data_type(); - - if data_type == &Date32 && has_any_time_specifiers(format_array) { - data_type = &Date64; - data[0] = cast(data[0].as_ref(), data_type)?; - } - - let data = &data[0]; + let arrays = ColumnarValue::values_to_arrays(args)?; let mut results: Vec> = vec![]; + let format_array = arrays[1].as_string::(); + let data_type = arrays[0].data_type(); - for idx in 0..data.len() { + for idx in 0..arrays[0].len() { let format = if format_array.is_null(idx) { None } else { @@ -286,12 +280,30 @@ fn to_char_array(args: &[ColumnarValue]) -> Result { Err(value) => return value, }; // this isn't ideal but this can't use ValueFormatter as it isn't independent - // of ArrayFormatter - let formatter = ArrayFormatter::try_new(data.as_ref(), &format_options)?; + // from ArrayFormatter + let formatter = ArrayFormatter::try_new(arrays[0].as_ref(), &format_options)?; let result = formatter.value(idx).try_to_string(); match result { Ok(value) => results.push(Some(value)), - Err(e) => return exec_err!("{}", e), + Err(e) => { + // if the format attempt is with a Date32, it's possible the attempt failed because + // the format string contained time-specifiers, so we'll retry the specific failed Date32 as a Date64 + if data_type == &Date32 { + let failed_date_value = arrays[0].slice(idx, 1); + + match retry_date_as_timestamp(failed_date_value, &format_options) { + Ok(value) => { + results.push(Some(value)); + continue; + } + Err(e) => { + return exec_err!("{}", e); + } + } + } + + return exec_err!("{}", e); + } } } @@ -308,56 +320,17 @@ fn to_char_array(args: &[ColumnarValue]) -> Result { } } -fn has_time_specifier(format: &str) -> bool { - let items = StrftimeItems::new(format); - for item in items { - match item { - Item::Fixed(v) => match v { - Fixed::LowerAmPm => return true, - Fixed::UpperAmPm => return true, - Fixed::RFC2822 => return true, - Fixed::RFC3339 => return true, - Fixed::Nanosecond => return true, - Fixed::Nanosecond3 => return true, - Fixed::Nanosecond6 => return true, - Fixed::Nanosecond9 => return true, - Fixed::TimezoneName => return true, - Fixed::TimezoneOffset => return true, - Fixed::TimezoneOffsetZ => return true, - Fixed::TimezoneOffsetColon => return true, - Fixed::TimezoneOffsetColonZ => return true, - Fixed::TimezoneOffsetDoubleColon => return true, - Fixed::TimezoneOffsetTripleColon => return true, - _ => (), - }, - Item::Numeric(v, _pad) => match v { - Numeric::Hour => return true, - Numeric::Hour12 => return true, - Numeric::Minute => return true, - Numeric::Second => return true, - Numeric::Nanosecond => return true, - Numeric::Timestamp => return true, - _ => (), - }, - _ => (), - } - } - - false -} +fn retry_date_as_timestamp( + array_ref: ArrayRef, + format_options: &FormatOptions, +) -> Result { + let target_data_type = Date64; -fn has_any_time_specifiers(format_array: &GenericStringArray) -> bool { - for idx in 0..format_array.len() { - if format_array.is_null(idx) { - continue; - } + let date_value = cast(&array_ref, &target_data_type)?; + let formatter = ArrayFormatter::try_new(date_value.as_ref(), format_options)?; + let result = formatter.value(0).try_to_string()?; - if has_time_specifier(format_array.value(idx)) { - return true; - } - } - - false + Ok(result) } #[cfg(test)] @@ -375,6 +348,37 @@ mod tests { use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; use std::sync::Arc; + #[test] + fn test_array_array() { + let array_array_data = vec![( + Arc::new(Date32Array::from(vec![18506, 18507])) as ArrayRef, + StringArray::from(vec!["%Y::%m::%d", "%Y::%m::%d %S::%M::%H %f"]), + StringArray::from(vec!["2020::09::01", "2020::09::02 00::00::00 000000000"]), + )]; + + for (value, format, expected) in array_array_data { + let batch_len = value.len(); + let args = datafusion_expr::ScalarFunctionArgs { + args: vec![ + ColumnarValue::Array(value), + ColumnarValue::Array(Arc::new(format) as ArrayRef), + ], + number_rows: batch_len, + return_type: &DataType::Utf8, + }; + let result = ToCharFunc::new() + .invoke_with_args(args) + .expect("that to_char parsed values without error"); + + if let ColumnarValue::Array(result) = result { + assert_eq!(result.len(), 2); + assert_eq!(&expected as &dyn Array, result.as_ref()); + } else { + panic!("Expected an array value") + } + } + } + #[test] fn test_to_char() { let date = "2020-01-02T03:04:05" From d99c6377b52c5c761ab0734fb2a4f5542c3f1b6c Mon Sep 17 00:00:00 2001 From: Matthew Kim <38759997+friendlymatthew@users.noreply.github.com> Date: Fri, 11 Apr 2025 10:04:31 -0400 Subject: [PATCH 07/14] Update comments --- datafusion/functions/src/datetime/to_char.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/datafusion/functions/src/datetime/to_char.rs b/datafusion/functions/src/datetime/to_char.rs index 985ac13a8376..7836e04c0c4d 100644 --- a/datafusion/functions/src/datetime/to_char.rs +++ b/datafusion/functions/src/datetime/to_char.rs @@ -249,8 +249,8 @@ fn to_char_scalar( )) } } else { - // if the format attempt is with a Date32, it's possible the attempt failed because - // the format string contained time-specifiers, so we'll retry casting as Date64 + // if the data type was a Date32, formatting could have failed because the format string + // contained datetime specifiers, so we'll retry by casting the date array as a timestamp array if data_type == &Date32 { return to_char_scalar(expression.clone().cast_to(&Date64, None)?, format); } @@ -286,8 +286,8 @@ fn to_char_array(args: &[ColumnarValue]) -> Result { match result { Ok(value) => results.push(Some(value)), Err(e) => { - // if the format attempt is with a Date32, it's possible the attempt failed because - // the format string contained time-specifiers, so we'll retry the specific failed Date32 as a Date64 + // if the data type was a Date32, formatting could have failed because the format string + // contained datetime specifiers, so we'll treat this specific date element as a timestamp if data_type == &Date32 { let failed_date_value = arrays[0].slice(idx, 1); From b379fe1b2927fc5c275670c70721df947a433c13 Mon Sep 17 00:00:00 2001 From: Matthew Kim <38759997+friendlymatthew@users.noreply.github.com> Date: Fri, 11 Apr 2025 13:56:05 -0400 Subject: [PATCH 08/14] Keep track of consecutive Date32 failures --- datafusion/functions/src/datetime/to_char.rs | 135 +++++++++---------- 1 file changed, 65 insertions(+), 70 deletions(-) diff --git a/datafusion/functions/src/datetime/to_char.rs b/datafusion/functions/src/datetime/to_char.rs index 7836e04c0c4d..8715983934b3 100644 --- a/datafusion/functions/src/datetime/to_char.rs +++ b/datafusion/functions/src/datetime/to_char.rs @@ -19,7 +19,7 @@ use std::any::Any; use std::sync::Arc; use arrow::array::cast::AsArray; -use arrow::array::{new_null_array, Array, ArrayRef, StringArray}; +use arrow::array::{new_null_array, Array, ArrayRef, GenericStringArray, StringArray}; use arrow::compute::cast; use arrow::datatypes::DataType; use arrow::datatypes::DataType::{ @@ -28,7 +28,9 @@ use arrow::datatypes::DataType::{ use arrow::datatypes::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second}; use arrow::error::ArrowError; use arrow::util::display::{ArrayFormatter, DurationFormat, FormatOptions}; -use datafusion_common::{exec_err, utils::take_function_args, Result, ScalarValue}; +use datafusion_common::{ + exec_datafusion_err, exec_err, utils::take_function_args, Result, ScalarValue, +}; use datafusion_expr::TypeSignature::Exact; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, TIMEZONE_WILDCARD, @@ -259,13 +261,31 @@ fn to_char_scalar( } } -fn to_char_array(args: &[ColumnarValue]) -> Result { - let arrays = ColumnarValue::values_to_arrays(args)?; +fn to_char_array_inner( + values: ArrayRef, + format_array: &GenericStringArray, +) -> Result>, Result> { let mut results: Vec> = vec![]; - let format_array = arrays[1].as_string::(); - let data_type = arrays[0].data_type(); - for idx in 0..arrays[0].len() { + let data_type = values.data_type(); + + // track consecutive Date32 values that fail with datetime formats as a single range. + // this allows us to perform just one batch cast to Date64 instead of individual casts. + let mut date_retry_range = None; + + for idx in 0..values.len() { + if let Some((start_offset, length)) = date_retry_range { + let value_slice = cast(values.slice(start_offset, length).as_ref(), &Date64) + .map_err(|e| Err(exec_datafusion_err!("{}", e)))?; + + results.extend(to_char_array_inner( + value_slice, + &format_array.slice(start_offset, length), + )?); + + date_retry_range = None; + } + let format = if format_array.is_null(idx) { None } else { @@ -275,38 +295,57 @@ fn to_char_array(args: &[ColumnarValue]) -> Result { results.push(None); continue; } - let format_options = match build_format_options(data_type, format) { - Ok(value) => value, - Err(value) => return value, - }; + let format_options = build_format_options(data_type, format)?; + // this isn't ideal but this can't use ValueFormatter as it isn't independent // from ArrayFormatter - let formatter = ArrayFormatter::try_new(arrays[0].as_ref(), &format_options)?; + let formatter = ArrayFormatter::try_new(values.as_ref(), &format_options) + .map_err(|e| Err(exec_datafusion_err!("{}", e)))?; + let result = formatter.value(idx).try_to_string(); match result { Ok(value) => results.push(Some(value)), Err(e) => { - // if the data type was a Date32, formatting could have failed because the format string - // contained datetime specifiers, so we'll treat this specific date element as a timestamp + // when a Date32 fails formatting due to datetime specifiers in the format string: + // - if we already have a retry range, extend it by one more element + // - if this is the first failure, start a new retry range at this index + // we'll later convert this entire range to Date64 timestamps in a single batch operation if data_type == &Date32 { - let failed_date_value = arrays[0].slice(idx, 1); - - match retry_date_as_timestamp(failed_date_value, &format_options) { - Ok(value) => { - results.push(Some(value)); - continue; - } - Err(e) => { - return exec_err!("{}", e); - } - } + date_retry_range = match date_retry_range { + Some((offset, length)) => Some((offset, length + 1)), + None => Some((idx, 1)), + }; + + continue; } - return exec_err!("{}", e); + return Err(exec_err!("{}", e)); } } } + if let Some((start_offset, length)) = date_retry_range { + let value_slice = cast(values.slice(start_offset, length).as_ref(), &Date64) + .map_err(|e| exec_err!("{}", e))?; + + results.extend(to_char_array_inner( + value_slice, + &format_array.slice(start_offset, length), + )?); + } + + Ok(results) +} + +fn to_char_array(args: &[ColumnarValue]) -> Result { + let arrays = ColumnarValue::values_to_arrays(args)?; + let format_array = arrays[1].as_string::(); + let results = match to_char_array_inner(Arc::clone(&arrays[0]), format_array) { + Ok(results) => results, + Err(Ok(columnar_value)) => return Ok(columnar_value), + Err(Err(e)) => return exec_err!("{}", e), + }; + match args[0] { ColumnarValue::Array(_) => Ok(ColumnarValue::Array(Arc::new(StringArray::from( results, @@ -320,19 +359,6 @@ fn to_char_array(args: &[ColumnarValue]) -> Result { } } -fn retry_date_as_timestamp( - array_ref: ArrayRef, - format_options: &FormatOptions, -) -> Result { - let target_data_type = Date64; - - let date_value = cast(&array_ref, &target_data_type)?; - let formatter = ArrayFormatter::try_new(date_value.as_ref(), format_options)?; - let result = formatter.value(0).try_to_string()?; - - Ok(result) -} - #[cfg(test)] mod tests { use crate::datetime::to_char::ToCharFunc; @@ -348,37 +374,6 @@ mod tests { use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; use std::sync::Arc; - #[test] - fn test_array_array() { - let array_array_data = vec![( - Arc::new(Date32Array::from(vec![18506, 18507])) as ArrayRef, - StringArray::from(vec!["%Y::%m::%d", "%Y::%m::%d %S::%M::%H %f"]), - StringArray::from(vec!["2020::09::01", "2020::09::02 00::00::00 000000000"]), - )]; - - for (value, format, expected) in array_array_data { - let batch_len = value.len(); - let args = datafusion_expr::ScalarFunctionArgs { - args: vec![ - ColumnarValue::Array(value), - ColumnarValue::Array(Arc::new(format) as ArrayRef), - ], - number_rows: batch_len, - return_type: &DataType::Utf8, - }; - let result = ToCharFunc::new() - .invoke_with_args(args) - .expect("that to_char parsed values without error"); - - if let ColumnarValue::Array(result) = result { - assert_eq!(result.len(), 2); - assert_eq!(&expected as &dyn Array, result.as_ref()); - } else { - panic!("Expected an array value") - } - } - } - #[test] fn test_to_char() { let date = "2020-01-02T03:04:05" From ed961cb63d9032485effa4b14fe4016724053d90 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Sun, 10 Aug 2025 21:50:46 +0000 Subject: [PATCH 09/14] Revert "Keep track of consecutive Date32 failures" This reverts commit b379fe1b2927fc5c275670c70721df947a433c13. --- datafusion/functions/src/datetime/to_char.rs | 135 ++++++++++--------- 1 file changed, 70 insertions(+), 65 deletions(-) diff --git a/datafusion/functions/src/datetime/to_char.rs b/datafusion/functions/src/datetime/to_char.rs index 8715983934b3..7836e04c0c4d 100644 --- a/datafusion/functions/src/datetime/to_char.rs +++ b/datafusion/functions/src/datetime/to_char.rs @@ -19,7 +19,7 @@ use std::any::Any; use std::sync::Arc; use arrow::array::cast::AsArray; -use arrow::array::{new_null_array, Array, ArrayRef, GenericStringArray, StringArray}; +use arrow::array::{new_null_array, Array, ArrayRef, StringArray}; use arrow::compute::cast; use arrow::datatypes::DataType; use arrow::datatypes::DataType::{ @@ -28,9 +28,7 @@ use arrow::datatypes::DataType::{ use arrow::datatypes::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second}; use arrow::error::ArrowError; use arrow::util::display::{ArrayFormatter, DurationFormat, FormatOptions}; -use datafusion_common::{ - exec_datafusion_err, exec_err, utils::take_function_args, Result, ScalarValue, -}; +use datafusion_common::{exec_err, utils::take_function_args, Result, ScalarValue}; use datafusion_expr::TypeSignature::Exact; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, TIMEZONE_WILDCARD, @@ -261,31 +259,13 @@ fn to_char_scalar( } } -fn to_char_array_inner( - values: ArrayRef, - format_array: &GenericStringArray, -) -> Result>, Result> { +fn to_char_array(args: &[ColumnarValue]) -> Result { + let arrays = ColumnarValue::values_to_arrays(args)?; let mut results: Vec> = vec![]; + let format_array = arrays[1].as_string::(); + let data_type = arrays[0].data_type(); - let data_type = values.data_type(); - - // track consecutive Date32 values that fail with datetime formats as a single range. - // this allows us to perform just one batch cast to Date64 instead of individual casts. - let mut date_retry_range = None; - - for idx in 0..values.len() { - if let Some((start_offset, length)) = date_retry_range { - let value_slice = cast(values.slice(start_offset, length).as_ref(), &Date64) - .map_err(|e| Err(exec_datafusion_err!("{}", e)))?; - - results.extend(to_char_array_inner( - value_slice, - &format_array.slice(start_offset, length), - )?); - - date_retry_range = None; - } - + for idx in 0..arrays[0].len() { let format = if format_array.is_null(idx) { None } else { @@ -295,57 +275,38 @@ fn to_char_array_inner( results.push(None); continue; } - let format_options = build_format_options(data_type, format)?; - + let format_options = match build_format_options(data_type, format) { + Ok(value) => value, + Err(value) => return value, + }; // this isn't ideal but this can't use ValueFormatter as it isn't independent // from ArrayFormatter - let formatter = ArrayFormatter::try_new(values.as_ref(), &format_options) - .map_err(|e| Err(exec_datafusion_err!("{}", e)))?; - + let formatter = ArrayFormatter::try_new(arrays[0].as_ref(), &format_options)?; let result = formatter.value(idx).try_to_string(); match result { Ok(value) => results.push(Some(value)), Err(e) => { - // when a Date32 fails formatting due to datetime specifiers in the format string: - // - if we already have a retry range, extend it by one more element - // - if this is the first failure, start a new retry range at this index - // we'll later convert this entire range to Date64 timestamps in a single batch operation + // if the data type was a Date32, formatting could have failed because the format string + // contained datetime specifiers, so we'll treat this specific date element as a timestamp if data_type == &Date32 { - date_retry_range = match date_retry_range { - Some((offset, length)) => Some((offset, length + 1)), - None => Some((idx, 1)), - }; - - continue; + let failed_date_value = arrays[0].slice(idx, 1); + + match retry_date_as_timestamp(failed_date_value, &format_options) { + Ok(value) => { + results.push(Some(value)); + continue; + } + Err(e) => { + return exec_err!("{}", e); + } + } } - return Err(exec_err!("{}", e)); + return exec_err!("{}", e); } } } - if let Some((start_offset, length)) = date_retry_range { - let value_slice = cast(values.slice(start_offset, length).as_ref(), &Date64) - .map_err(|e| exec_err!("{}", e))?; - - results.extend(to_char_array_inner( - value_slice, - &format_array.slice(start_offset, length), - )?); - } - - Ok(results) -} - -fn to_char_array(args: &[ColumnarValue]) -> Result { - let arrays = ColumnarValue::values_to_arrays(args)?; - let format_array = arrays[1].as_string::(); - let results = match to_char_array_inner(Arc::clone(&arrays[0]), format_array) { - Ok(results) => results, - Err(Ok(columnar_value)) => return Ok(columnar_value), - Err(Err(e)) => return exec_err!("{}", e), - }; - match args[0] { ColumnarValue::Array(_) => Ok(ColumnarValue::Array(Arc::new(StringArray::from( results, @@ -359,6 +320,19 @@ fn to_char_array(args: &[ColumnarValue]) -> Result { } } +fn retry_date_as_timestamp( + array_ref: ArrayRef, + format_options: &FormatOptions, +) -> Result { + let target_data_type = Date64; + + let date_value = cast(&array_ref, &target_data_type)?; + let formatter = ArrayFormatter::try_new(date_value.as_ref(), format_options)?; + let result = formatter.value(0).try_to_string()?; + + Ok(result) +} + #[cfg(test)] mod tests { use crate::datetime::to_char::ToCharFunc; @@ -374,6 +348,37 @@ mod tests { use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; use std::sync::Arc; + #[test] + fn test_array_array() { + let array_array_data = vec![( + Arc::new(Date32Array::from(vec![18506, 18507])) as ArrayRef, + StringArray::from(vec!["%Y::%m::%d", "%Y::%m::%d %S::%M::%H %f"]), + StringArray::from(vec!["2020::09::01", "2020::09::02 00::00::00 000000000"]), + )]; + + for (value, format, expected) in array_array_data { + let batch_len = value.len(); + let args = datafusion_expr::ScalarFunctionArgs { + args: vec![ + ColumnarValue::Array(value), + ColumnarValue::Array(Arc::new(format) as ArrayRef), + ], + number_rows: batch_len, + return_type: &DataType::Utf8, + }; + let result = ToCharFunc::new() + .invoke_with_args(args) + .expect("that to_char parsed values without error"); + + if let ColumnarValue::Array(result) = result { + assert_eq!(result.len(), 2); + assert_eq!(&expected as &dyn Array, result.as_ref()); + } else { + panic!("Expected an array value") + } + } + } + #[test] fn test_to_char() { let date = "2020-01-02T03:04:05" From 6c8d1ebb13c0732a474dddefcd32a8db6ee319ca Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Sun, 10 Aug 2025 22:01:09 +0000 Subject: [PATCH 10/14] Updated example filename. --- datafusion/functions/src/datetime/to_char.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/functions/src/datetime/to_char.rs b/datafusion/functions/src/datetime/to_char.rs index 7836e04c0c4d..74183ab5268d 100644 --- a/datafusion/functions/src/datetime/to_char.rs +++ b/datafusion/functions/src/datetime/to_char.rs @@ -48,7 +48,7 @@ use datafusion_macros::user_doc; +----------------------------------------------+ ``` -Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/to_char.rs) +Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/date_time_functions.rs) "#, argument( name = "expression", From 22cb09a7a09e56463d3f1923c6ab20f92d0fe434 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Sun, 10 Aug 2025 23:08:26 +0000 Subject: [PATCH 11/14] Updates to reflect changes in main. --- datafusion/functions/benches/to_char.rs | 31 +++++++++++++++----- datafusion/functions/src/datetime/to_char.rs | 10 ++++++- 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/datafusion/functions/benches/to_char.rs b/datafusion/functions/benches/to_char.rs index 064f4c01511c..f1151b6db2e9 100644 --- a/datafusion/functions/benches/to_char.rs +++ b/datafusion/functions/benches/to_char.rs @@ -93,7 +93,7 @@ fn pick_date_time_pattern(rng: &mut ThreadRng) -> String { } fn pick_date_and_date_time_mixed_pattern(rng: &mut ThreadRng) -> String { - match rng.gen_bool(0.5) { + match rng.random_bool(0.5) { true => pick_date_pattern(rng), false => pick_date_time_pattern(rng), } @@ -168,8 +168,13 @@ fn criterion_benchmark(c: &mut Criterion) { to_char() .invoke_with_args(ScalarFunctionArgs { args: vec![data.clone(), patterns.clone()], + arg_fields: vec![ + Field::new("a", data.data_type(), true).into(), + Field::new("b", patterns.data_type(), true).into(), + ], number_rows: batch_len, - return_type: &DataType::Utf8, + return_field: Field::new("f", DataType::Utf8, true).into(), + config_options: Arc::clone(&config_options), }) .expect("to_char should work on valid values"), ) @@ -177,7 +182,7 @@ fn criterion_benchmark(c: &mut Criterion) { }); c.bench_function("to_char_array_mixed_patterns_1000", |b| { - let mut rng = rand::thread_rng(); + let mut rng = rand::rng(); let data_arr = generate_date32_array(&mut rng); let batch_len = data_arr.len(); let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef); @@ -190,8 +195,13 @@ fn criterion_benchmark(c: &mut Criterion) { to_char() .invoke_with_args(ScalarFunctionArgs { args: vec![data.clone(), patterns.clone()], + arg_fields: vec![ + Field::new("a", data.data_type(), true).into(), + Field::new("b", patterns.data_type(), true).into(), + ], number_rows: batch_len, - return_type: &DataType::Utf8, + return_field: Field::new("f", DataType::Utf8, true).into(), + config_options: Arc::clone(&config_options), }) .expect("to_char should work on valid values"), ) @@ -199,7 +209,7 @@ fn criterion_benchmark(c: &mut Criterion) { }); c.bench_function("to_char_scalar_date_only_pattern_1000", |b| { - let mut rng = rand::thread_rng(); + let mut rng = rand::rng(); let data_arr = generate_date32_array(&mut rng); let batch_len = data_arr.len(); let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef); @@ -211,8 +221,13 @@ fn criterion_benchmark(c: &mut Criterion) { to_char() .invoke_with_args(ScalarFunctionArgs { args: vec![data.clone(), patterns.clone()], + arg_fields: vec![ + Field::new("a", data.data_type(), true).into(), + Field::new("b", patterns.data_type(), true).into(), + ], number_rows: batch_len, - return_type: &DataType::Utf8, + return_field: Field::new("f", DataType::Utf8, true).into(), + config_options: Arc::clone(&config_options), }) .expect("to_char should work on valid values"), ) @@ -220,7 +235,7 @@ fn criterion_benchmark(c: &mut Criterion) { }); c.bench_function("to_char_scalar_datetime_pattern_1000", |b| { - let mut rng = rand::thread_rng(); + let mut rng = rand::rng(); let data_arr = generate_date32_array(&mut rng); let batch_len = data_arr.len(); let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef); @@ -247,7 +262,7 @@ fn criterion_benchmark(c: &mut Criterion) { }); c.bench_function("to_char_scalar_1000", |b| { - let mut rng = rand::thread_rng(); + let mut rng = rand::rng(); let timestamp = "2026-07-08T09:10:11" .parse::() .unwrap() diff --git a/datafusion/functions/src/datetime/to_char.rs b/datafusion/functions/src/datetime/to_char.rs index 19bf3206c86c..0c13af1c3938 100644 --- a/datafusion/functions/src/datetime/to_char.rs +++ b/datafusion/functions/src/datetime/to_char.rs @@ -360,13 +360,21 @@ mod tests { for (value, format, expected) in array_array_data { let batch_len = value.len(); + let value_data_type = value.data_type().clone(); + let format_data_type = format.data_type().clone(); + let args = datafusion_expr::ScalarFunctionArgs { args: vec![ ColumnarValue::Array(value), ColumnarValue::Array(Arc::new(format) as ArrayRef), ], + arg_fields: vec![ + Field::new("a", value_data_type, true).into(), + Field::new("b", format_data_type, true).into(), + ], number_rows: batch_len, - return_type: &DataType::Utf8, + return_field: Field::new("f", DataType::Utf8, true).into(), + config_options: Arc::clone(&Arc::new(ConfigOptions::default())), }; let result = ToCharFunc::new() .invoke_with_args(args) From 50da84579a8119874efba35a77dcb8ec6e4e2629 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Sun, 10 Aug 2025 23:33:25 +0000 Subject: [PATCH 12/14] Documentation update. --- docs/source/user-guide/sql/scalar_functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index d49fc22dabb4..06594ad9a5ed 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -2263,7 +2263,7 @@ to_char(expression, format) +----------------------------------------------+ ``` -Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/to_char.rs) +Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/date_time_functions.rs) #### Aliases From a301e9b7177ff435cd22d7c706f184e164068edb Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 12 Aug 2025 11:49:04 -0400 Subject: [PATCH 13/14] Optimize to_char --- datafusion/functions/src/datetime/to_char.rs | 30 +++++++++++--------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/datafusion/functions/src/datetime/to_char.rs b/datafusion/functions/src/datetime/to_char.rs index 0c13af1c3938..7a244b6d76a3 100644 --- a/datafusion/functions/src/datetime/to_char.rs +++ b/datafusion/functions/src/datetime/to_char.rs @@ -139,20 +139,23 @@ impl ScalarUDFImpl for ToCharFunc { &self, args: datafusion_expr::ScalarFunctionArgs, ) -> Result { + let num_rows = args.number_rows; let args = args.args; - let [date_time, format] = take_function_args(self.name(), &args)?; + let [date_time, format] = take_function_args(self.name(), args)?; match format { ColumnarValue::Scalar(ScalarValue::Utf8(None)) | ColumnarValue::Scalar(ScalarValue::Null) => { - to_char_scalar(date_time.clone(), None) + to_char_scalar(date_time, None) } // constant format ColumnarValue::Scalar(ScalarValue::Utf8(Some(format))) => { // invoke to_char_scalar with the known string, without converting to array - to_char_scalar(date_time.clone(), Some(format)) + to_char_scalar(date_time, Some(&format)) } - ColumnarValue::Array(_) => to_char_array(&args), + ColumnarValue::Array(_) => to_char_array( + date_time, format.to_array(num_rows)? + ), _ => { exec_err!( "Format for `to_char` must be non-null Utf8, received {:?}", @@ -253,20 +256,21 @@ fn to_char_scalar( // if the data type was a Date32, formatting could have failed because the format string // contained datetime specifiers, so we'll retry by casting the date array as a timestamp array if data_type == &Date32 { - return to_char_scalar(expression.clone().cast_to(&Date64, None)?, format); + return to_char_scalar(expression.cast_to(&Date64, None)?, format); } exec_err!("{}", formatted.unwrap_err()) } } -fn to_char_array(args: &[ColumnarValue]) -> Result { - let arrays = ColumnarValue::values_to_arrays(args)?; +fn to_char_array(date_time: ColumnarValue, format_array: ArrayRef) -> Result { let mut results: Vec> = vec![]; - let format_array = arrays[1].as_string::(); - let data_type = arrays[0].data_type(); + let format_array = format_array.as_string::(); + let num_rows = format_array.len(); + let date_time_array = date_time.to_array(num_rows)?; + let data_type = date_time_array.data_type(); - for idx in 0..arrays[0].len() { + for idx in 0..num_rows { let format = if format_array.is_null(idx) { None } else { @@ -282,7 +286,7 @@ fn to_char_array(args: &[ColumnarValue]) -> Result { }; // this isn't ideal but this can't use ValueFormatter as it isn't independent // from ArrayFormatter - let formatter = ArrayFormatter::try_new(arrays[0].as_ref(), &format_options)?; + let formatter = ArrayFormatter::try_new(date_time_array.as_ref(), &format_options)?; let result = formatter.value(idx).try_to_string(); match result { Ok(value) => results.push(Some(value)), @@ -290,7 +294,7 @@ fn to_char_array(args: &[ColumnarValue]) -> Result { // if the data type was a Date32, formatting could have failed because the format string // contained datetime specifiers, so we'll treat this specific date element as a timestamp if data_type == &Date32 { - let failed_date_value = arrays[0].slice(idx, 1); + let failed_date_value = date_time_array.slice(idx, 1); match retry_date_as_timestamp(failed_date_value, &format_options) { Ok(value) => { @@ -308,7 +312,7 @@ fn to_char_array(args: &[ColumnarValue]) -> Result { } } - match args[0] { + match date_time { ColumnarValue::Array(_) => Ok(ColumnarValue::Array(Arc::new(StringArray::from( results, )) as ArrayRef)), From 43f70b7b1d143ae06a0e8b3c8a9c0d05e302ef5e Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 12 Aug 2025 12:37:20 -0400 Subject: [PATCH 14/14] fmt --- datafusion/functions/src/datetime/to_char.rs | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/datafusion/functions/src/datetime/to_char.rs b/datafusion/functions/src/datetime/to_char.rs index 7a244b6d76a3..bad8422206e7 100644 --- a/datafusion/functions/src/datetime/to_char.rs +++ b/datafusion/functions/src/datetime/to_char.rs @@ -145,17 +145,15 @@ impl ScalarUDFImpl for ToCharFunc { match format { ColumnarValue::Scalar(ScalarValue::Utf8(None)) - | ColumnarValue::Scalar(ScalarValue::Null) => { - to_char_scalar(date_time, None) - } + | ColumnarValue::Scalar(ScalarValue::Null) => to_char_scalar(date_time, None), // constant format ColumnarValue::Scalar(ScalarValue::Utf8(Some(format))) => { // invoke to_char_scalar with the known string, without converting to array to_char_scalar(date_time, Some(&format)) } - ColumnarValue::Array(_) => to_char_array( - date_time, format.to_array(num_rows)? - ), + ColumnarValue::Array(_) => { + to_char_array(date_time, format.to_array(num_rows)?) + } _ => { exec_err!( "Format for `to_char` must be non-null Utf8, received {:?}", @@ -263,7 +261,10 @@ fn to_char_scalar( } } -fn to_char_array(date_time: ColumnarValue, format_array: ArrayRef) -> Result { +fn to_char_array( + date_time: ColumnarValue, + format_array: ArrayRef, +) -> Result { let mut results: Vec> = vec![]; let format_array = format_array.as_string::(); let num_rows = format_array.len(); @@ -286,7 +287,8 @@ fn to_char_array(date_time: ColumnarValue, format_array: ArrayRef) -> Result results.push(Some(value)),