diff --git a/datafusion/functions/benches/to_char.rs b/datafusion/functions/benches/to_char.rs index 0f4cc264cbe0..f1151b6db2e9 100644 --- a/datafusion/functions/benches/to_char.rs +++ b/datafusion/functions/benches/to_char.rs @@ -33,7 +33,7 @@ use rand::prelude::IndexedRandom; use rand::rngs::ThreadRng; use rand::Rng; -fn random_date_in_range( +fn pick_date_in_range( rng: &mut ThreadRng, start_date: NaiveDate, end_date: NaiveDate, @@ -43,7 +43,7 @@ fn random_date_in_range( start_date + TimeDelta::try_days(random_days).unwrap() } -fn data(rng: &mut ThreadRng) -> Date32Array { +fn generate_date32_array(rng: &mut ThreadRng) -> Date32Array { let mut data: Vec = vec![]; let unix_days_from_ce = NaiveDate::from_ymd_opt(1970, 1, 1) .unwrap() @@ -56,7 +56,7 @@ fn data(rng: &mut ThreadRng) -> Date32Array { .expect("Date should parse"); for _ in 0..1000 { data.push( - random_date_in_range(rng, start_date, end_date).num_days_from_ce() + pick_date_in_range(rng, start_date, end_date).num_days_from_ce() - unix_days_from_ce, ); } @@ -64,31 +64,131 @@ fn data(rng: &mut ThreadRng) -> Date32Array { Date32Array::from(data) } -fn patterns(rng: &mut ThreadRng) -> StringArray { - let samples = [ - "%Y:%m:%d".to_string(), - "%d-%m-%Y".to_string(), - "%d%m%Y".to_string(), - "%Y%m%d".to_string(), - "%Y...%m...%d".to_string(), - ]; - let mut data: Vec = vec![]; +const DATE_PATTERNS: [&str; 5] = + ["%Y:%m:%d", "%d-%m-%Y", "%d%m%Y", "%Y%m%d", "%Y...%m...%d"]; + +const DATETIME_PATTERNS: [&str; 8] = [ + "%Y:%m:%d %H:%M%S", + "%Y:%m:%d %_H:%M%S", + "%Y:%m:%d %k:%M%S", + "%d-%m-%Y %I%P-%M-%S %f", + "%d%m%Y %H", + "%Y%m%d %M-%S %.3f", + "%Y...%m...%d %T%3f", + "%c", +]; + +fn pick_date_pattern(rng: &mut ThreadRng) -> String { + DATE_PATTERNS + .choose(rng) + .expect("Empty list of date patterns") + .to_string() +} + +fn pick_date_time_pattern(rng: &mut ThreadRng) -> String { + DATETIME_PATTERNS + .choose(rng) + .expect("Empty list of date time patterns") + .to_string() +} + +fn pick_date_and_date_time_mixed_pattern(rng: &mut ThreadRng) -> String { + match rng.random_bool(0.5) { + true => pick_date_pattern(rng), + false => pick_date_time_pattern(rng), + } +} + +fn generate_pattern_array( + rng: &mut ThreadRng, + pick_fn: impl Fn(&mut ThreadRng) -> String, +) -> StringArray { + let mut data = Vec::with_capacity(1000); + for _ in 0..1000 { - data.push(samples.choose(rng).unwrap().to_string()); + data.push(pick_fn(rng)); } StringArray::from(data) } +fn generate_date_pattern_array(rng: &mut ThreadRng) -> StringArray { + generate_pattern_array(rng, pick_date_pattern) +} + +fn generate_datetime_pattern_array(rng: &mut ThreadRng) -> StringArray { + generate_pattern_array(rng, pick_date_time_pattern) +} + +fn generate_mixed_pattern_array(rng: &mut ThreadRng) -> StringArray { + generate_pattern_array(rng, pick_date_and_date_time_mixed_pattern) +} + fn criterion_benchmark(c: &mut Criterion) { let config_options = Arc::new(ConfigOptions::default()); - c.bench_function("to_char_array_array_1000", |b| { + c.bench_function("to_char_array_date_only_patterns_1000", |b| { + let mut rng = rand::rng(); + let data_arr = generate_date32_array(&mut rng); + let batch_len = data_arr.len(); + let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef); + let patterns = ColumnarValue::Array(Arc::new(generate_date_pattern_array( + &mut rng, + )) as ArrayRef); + + b.iter(|| { + black_box( + to_char() + .invoke_with_args(ScalarFunctionArgs { + args: vec![data.clone(), patterns.clone()], + arg_fields: vec![ + Field::new("a", data.data_type(), true).into(), + Field::new("b", patterns.data_type(), true).into(), + ], + number_rows: batch_len, + return_field: Field::new("f", DataType::Utf8, true).into(), + config_options: Arc::clone(&config_options), + }) + .expect("to_char should work on valid values"), + ) + }) + }); + + c.bench_function("to_char_array_datetime_patterns_1000", |b| { + let mut rng = rand::rng(); + let data_arr = generate_date32_array(&mut rng); + let batch_len = data_arr.len(); + let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef); + let patterns = ColumnarValue::Array(Arc::new(generate_datetime_pattern_array( + &mut rng, + )) as ArrayRef); + + b.iter(|| { + black_box( + to_char() + .invoke_with_args(ScalarFunctionArgs { + args: vec![data.clone(), patterns.clone()], + arg_fields: vec![ + Field::new("a", data.data_type(), true).into(), + Field::new("b", patterns.data_type(), true).into(), + ], + number_rows: batch_len, + return_field: Field::new("f", DataType::Utf8, true).into(), + config_options: Arc::clone(&config_options), + }) + .expect("to_char should work on valid values"), + ) + }) + }); + + c.bench_function("to_char_array_mixed_patterns_1000", |b| { let mut rng = rand::rng(); - let data_arr = data(&mut rng); + let data_arr = generate_date32_array(&mut rng); let batch_len = data_arr.len(); let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef); - let patterns = ColumnarValue::Array(Arc::new(patterns(&mut rng)) as ArrayRef); + let patterns = ColumnarValue::Array(Arc::new(generate_mixed_pattern_array( + &mut rng, + )) as ArrayRef); b.iter(|| { black_box( @@ -108,13 +208,13 @@ fn criterion_benchmark(c: &mut Criterion) { }) }); - c.bench_function("to_char_array_scalar_1000", |b| { + c.bench_function("to_char_scalar_date_only_pattern_1000", |b| { let mut rng = rand::rng(); - let data_arr = data(&mut rng); + let data_arr = generate_date32_array(&mut rng); let batch_len = data_arr.len(); let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef); let patterns = - ColumnarValue::Scalar(ScalarValue::Utf8(Some("%Y-%m-%d".to_string()))); + ColumnarValue::Scalar(ScalarValue::Utf8(Some(pick_date_pattern(&mut rng)))); b.iter(|| { black_box( @@ -134,7 +234,35 @@ fn criterion_benchmark(c: &mut Criterion) { }) }); - c.bench_function("to_char_scalar_scalar_1000", |b| { + c.bench_function("to_char_scalar_datetime_pattern_1000", |b| { + let mut rng = rand::rng(); + let data_arr = generate_date32_array(&mut rng); + let batch_len = data_arr.len(); + let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef); + let patterns = ColumnarValue::Scalar(ScalarValue::Utf8(Some( + pick_date_time_pattern(&mut rng), + ))); + + b.iter(|| { + black_box( + to_char() + .invoke_with_args(ScalarFunctionArgs { + args: vec![data.clone(), patterns.clone()], + arg_fields: vec![ + Field::new("a", data.data_type(), true).into(), + Field::new("b", patterns.data_type(), true).into(), + ], + number_rows: batch_len, + return_field: Field::new("f", DataType::Utf8, true).into(), + config_options: Arc::clone(&config_options), + }) + .expect("to_char should work on valid values"), + ) + }) + }); + + c.bench_function("to_char_scalar_1000", |b| { + let mut rng = rand::rng(); let timestamp = "2026-07-08T09:10:11" .parse::() .unwrap() @@ -144,9 +272,8 @@ fn criterion_benchmark(c: &mut Criterion) { .timestamp_nanos_opt() .unwrap(); let data = ColumnarValue::Scalar(TimestampNanosecond(Some(timestamp), None)); - let pattern = ColumnarValue::Scalar(ScalarValue::Utf8(Some( - "%d-%m-%Y %H:%M:%S".to_string(), - ))); + let pattern = + ColumnarValue::Scalar(ScalarValue::Utf8(Some(pick_date_pattern(&mut rng)))); b.iter(|| { black_box( diff --git a/datafusion/functions/src/datetime/to_char.rs b/datafusion/functions/src/datetime/to_char.rs index 2f7e5fa56eb1..bad8422206e7 100644 --- a/datafusion/functions/src/datetime/to_char.rs +++ b/datafusion/functions/src/datetime/to_char.rs @@ -20,6 +20,7 @@ use std::sync::Arc; use arrow::array::cast::AsArray; use arrow::array::{new_null_array, Array, ArrayRef, StringArray}; +use arrow::compute::cast; use arrow::datatypes::DataType; use arrow::datatypes::DataType::{ Date32, Date64, Duration, Time32, Time64, Timestamp, Utf8, @@ -27,7 +28,6 @@ use arrow::datatypes::DataType::{ use arrow::datatypes::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second}; use arrow::error::ArrowError; use arrow::util::display::{ArrayFormatter, DurationFormat, FormatOptions}; - use datafusion_common::{exec_err, utils::take_function_args, Result, ScalarValue}; use datafusion_expr::TypeSignature::Exact; use datafusion_expr::{ @@ -48,7 +48,7 @@ use datafusion_macros::user_doc; +----------------------------------------------+ ``` -Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/to_char.rs) +Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/date_time_functions.rs) "#, argument( name = "expression", @@ -139,20 +139,21 @@ impl ScalarUDFImpl for ToCharFunc { &self, args: datafusion_expr::ScalarFunctionArgs, ) -> Result { + let num_rows = args.number_rows; let args = args.args; - let [date_time, format] = take_function_args(self.name(), &args)?; + let [date_time, format] = take_function_args(self.name(), args)?; match format { ColumnarValue::Scalar(ScalarValue::Utf8(None)) - | ColumnarValue::Scalar(ScalarValue::Null) => { - _to_char_scalar(date_time.clone(), None) - } + | ColumnarValue::Scalar(ScalarValue::Null) => to_char_scalar(date_time, None), // constant format ColumnarValue::Scalar(ScalarValue::Utf8(Some(format))) => { // invoke to_char_scalar with the known string, without converting to array - _to_char_scalar(date_time.clone(), Some(format)) + to_char_scalar(date_time, Some(&format)) + } + ColumnarValue::Array(_) => { + to_char_array(date_time, format.to_array(num_rows)?) } - ColumnarValue::Array(_) => _to_char_array(&args), _ => { exec_err!( "Format for `to_char` must be non-null Utf8, received {:?}", @@ -171,7 +172,7 @@ impl ScalarUDFImpl for ToCharFunc { } } -fn _build_format_options<'a>( +fn build_format_options<'a>( data_type: &DataType, format: Option<&'a str>, ) -> Result, Result> { @@ -179,7 +180,9 @@ fn _build_format_options<'a>( return Ok(FormatOptions::new()); }; let format_options = match data_type { - Date32 => FormatOptions::new().with_date_format(Some(format)), + Date32 => FormatOptions::new() + .with_date_format(Some(format)) + .with_datetime_format(Some(format)), Date64 => FormatOptions::new().with_datetime_format(Some(format)), Time32(_) => FormatOptions::new().with_time_format(Some(format)), Time64(_) => FormatOptions::new().with_time_format(Some(format)), @@ -203,7 +206,7 @@ fn _build_format_options<'a>( } /// Special version when arg\[1] is a scalar -fn _to_char_scalar( +fn to_char_scalar( expression: ColumnarValue, format: Option<&str>, ) -> Result { @@ -211,17 +214,17 @@ fn _to_char_scalar( // of the implementation in arrow-rs we need to convert it to an array let data_type = &expression.data_type(); let is_scalar_expression = matches!(&expression, ColumnarValue::Scalar(_)); - let array = expression.into_array(1)?; + let array = expression.clone().into_array(1)?; if format.is_none() { - if is_scalar_expression { - return Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None))); + return if is_scalar_expression { + Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None))) } else { - return Ok(ColumnarValue::Array(new_null_array(&Utf8, array.len()))); - } + Ok(ColumnarValue::Array(new_null_array(&Utf8, array.len()))) + }; } - let format_options = match _build_format_options(data_type, format) { + let format_options = match build_format_options(data_type, format) { Ok(value) => value, Err(value) => return value, }; @@ -248,17 +251,27 @@ fn _to_char_scalar( )) } } else { + // if the data type was a Date32, formatting could have failed because the format string + // contained datetime specifiers, so we'll retry by casting the date array as a timestamp array + if data_type == &Date32 { + return to_char_scalar(expression.cast_to(&Date64, None)?, format); + } + exec_err!("{}", formatted.unwrap_err()) } } -fn _to_char_array(args: &[ColumnarValue]) -> Result { - let arrays = ColumnarValue::values_to_arrays(args)?; +fn to_char_array( + date_time: ColumnarValue, + format_array: ArrayRef, +) -> Result { let mut results: Vec> = vec![]; - let format_array = arrays[1].as_string::(); - let data_type = arrays[0].data_type(); + let format_array = format_array.as_string::(); + let num_rows = format_array.len(); + let date_time_array = date_time.to_array(num_rows)?; + let data_type = date_time_array.data_type(); - for idx in 0..arrays[0].len() { + for idx in 0..num_rows { let format = if format_array.is_null(idx) { None } else { @@ -268,21 +281,40 @@ fn _to_char_array(args: &[ColumnarValue]) -> Result { results.push(None); continue; } - let format_options = match _build_format_options(data_type, format) { + let format_options = match build_format_options(data_type, format) { Ok(value) => value, Err(value) => return value, }; // this isn't ideal but this can't use ValueFormatter as it isn't independent // from ArrayFormatter - let formatter = ArrayFormatter::try_new(arrays[0].as_ref(), &format_options)?; + let formatter = + ArrayFormatter::try_new(date_time_array.as_ref(), &format_options)?; let result = formatter.value(idx).try_to_string(); match result { Ok(value) => results.push(Some(value)), - Err(e) => return exec_err!("{}", e), + Err(e) => { + // if the data type was a Date32, formatting could have failed because the format string + // contained datetime specifiers, so we'll treat this specific date element as a timestamp + if data_type == &Date32 { + let failed_date_value = date_time_array.slice(idx, 1); + + match retry_date_as_timestamp(failed_date_value, &format_options) { + Ok(value) => { + results.push(Some(value)); + continue; + } + Err(e) => { + return exec_err!("{}", e); + } + } + } + + return exec_err!("{}", e); + } } } - match args[0] { + match date_time { ColumnarValue::Array(_) => Ok(ColumnarValue::Array(Arc::new(StringArray::from( results, )) as ArrayRef)), @@ -295,6 +327,19 @@ fn _to_char_array(args: &[ColumnarValue]) -> Result { } } +fn retry_date_as_timestamp( + array_ref: ArrayRef, + format_options: &FormatOptions, +) -> Result { + let target_data_type = Date64; + + let date_value = cast(&array_ref, &target_data_type)?; + let formatter = ArrayFormatter::try_new(date_value.as_ref(), format_options)?; + let result = formatter.value(0).try_to_string()?; + + Ok(result) +} + #[cfg(test)] mod tests { use crate::datetime::to_char::ToCharFunc; @@ -311,6 +356,45 @@ mod tests { use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; use std::sync::Arc; + #[test] + fn test_array_array() { + let array_array_data = vec![( + Arc::new(Date32Array::from(vec![18506, 18507])) as ArrayRef, + StringArray::from(vec!["%Y::%m::%d", "%Y::%m::%d %S::%M::%H %f"]), + StringArray::from(vec!["2020::09::01", "2020::09::02 00::00::00 000000000"]), + )]; + + for (value, format, expected) in array_array_data { + let batch_len = value.len(); + let value_data_type = value.data_type().clone(); + let format_data_type = format.data_type().clone(); + + let args = datafusion_expr::ScalarFunctionArgs { + args: vec![ + ColumnarValue::Array(value), + ColumnarValue::Array(Arc::new(format) as ArrayRef), + ], + arg_fields: vec![ + Field::new("a", value_data_type, true).into(), + Field::new("b", format_data_type, true).into(), + ], + number_rows: batch_len, + return_field: Field::new("f", DataType::Utf8, true).into(), + config_options: Arc::clone(&Arc::new(ConfigOptions::default())), + }; + let result = ToCharFunc::new() + .invoke_with_args(args) + .expect("that to_char parsed values without error"); + + if let ColumnarValue::Array(result) = result { + assert_eq!(result.len(), 2); + assert_eq!(&expected as &dyn Array, result.as_ref()); + } else { + panic!("Expected an array value") + } + } + } + #[test] fn test_to_char() { let date = "2020-01-02T03:04:05" @@ -330,6 +414,11 @@ mod tests { ScalarValue::Utf8(Some("%Y::%m::%d".to_string())), "2020::09::01".to_string(), ), + ( + ScalarValue::Date32(Some(18506)), + ScalarValue::Utf8(Some("%Y::%m::%d %S::%M::%H %f".to_string())), + "2020::09::01 00::00::00 000000000".to_string(), + ), ( ScalarValue::Date64(Some(date.and_utc().timestamp_millis())), ScalarValue::Utf8(Some("%Y::%m::%d".to_string())), @@ -415,6 +504,11 @@ mod tests { StringArray::from(vec!["%Y::%m::%d".to_string()]), "2020::09::01".to_string(), ), + ( + ScalarValue::Date32(Some(18506)), + StringArray::from(vec!["%Y::%m::%d %S::%M::%H %f".to_string()]), + "2020::09::01 00::00::00 000000000".to_string(), + ), ( ScalarValue::Date64(Some(date.and_utc().timestamp_millis())), StringArray::from(vec!["%Y::%m::%d".to_string()]), @@ -504,6 +598,14 @@ mod tests { ScalarValue::Utf8(Some("%Y::%m::%d".to_string())), StringArray::from(vec!["2020::09::01", "2020::09::02"]), ), + ( + Arc::new(Date32Array::from(vec![18506, 18507])) as ArrayRef, + ScalarValue::Utf8(Some("%Y::%m::%d %S::%M::%H %f".to_string())), + StringArray::from(vec![ + "2020::09::01 00::00::00 000000000", + "2020::09::02 00::00::00 000000000", + ]), + ), ( Arc::new(Date64Array::from(vec![ date.and_utc().timestamp_millis(), @@ -520,6 +622,25 @@ mod tests { StringArray::from(vec!["%Y::%m::%d", "%d::%m::%Y"]), StringArray::from(vec!["2020::09::01", "02::09::2020"]), ), + ( + Arc::new(Date32Array::from(vec![18506, 18507])) as ArrayRef, + StringArray::from(vec![ + "%Y::%m::%d %S::%M::%H %f", + "%Y::%m::%d %S::%M::%H %f", + ]), + StringArray::from(vec![ + "2020::09::01 00::00::00 000000000", + "2020::09::02 00::00::00 000000000", + ]), + ), + ( + Arc::new(Date32Array::from(vec![18506, 18507])) as ArrayRef, + StringArray::from(vec!["%Y::%m::%d", "%Y::%m::%d %S::%M::%H %f"]), + StringArray::from(vec![ + "2020::09::01", + "2020::09::02 00::00::00 000000000", + ]), + ), ( Arc::new(Date64Array::from(vec![ date.and_utc().timestamp_millis(), diff --git a/datafusion/sqllogictest/test_files/timestamps.slt b/datafusion/sqllogictest/test_files/timestamps.slt index bff955d528ef..f48994431d54 100644 --- a/datafusion/sqllogictest/test_files/timestamps.slt +++ b/datafusion/sqllogictest/test_files/timestamps.slt @@ -2903,6 +2903,18 @@ select date_format(dates, date_format) from formats; 01:01:2000 05:04:2003 +query T +select date_format(dates, time_format) from formats; +---- +00-00-00 +00::00::00 + +query T +select date_format(dates, timestamp_format) from formats; +---- +01:01:2000 00-00-00 +05:04:2003 00-00-00 + query T select to_char(times, time_format) from formats; ---- diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index 6de2c411b0d2..04266457c5b9 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -2264,7 +2264,7 @@ to_char(expression, format) +----------------------------------------------+ ``` -Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/to_char.rs) +Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/date_time_functions.rs) #### Aliases