Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions datafusion/spark/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -79,3 +79,7 @@ name = "slice"
[[bench]]
harness = false
name = "substring"

[[bench]]
harness = false
name = "unhex"
148 changes: 148 additions & 0 deletions datafusion/spark/benches/unhex.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

extern crate criterion;

use arrow::array::{
Array, LargeStringArray, LargeStringBuilder, StringArray, StringBuilder,
StringViewArray, StringViewBuilder,
};
use arrow::datatypes::{DataType, Field};
use criterion::{Criterion, criterion_group, criterion_main};
use datafusion_common::config::ConfigOptions;
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
use datafusion_spark::function::math::unhex::SparkUnhex;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use std::hint::black_box;
use std::sync::Arc;

/// Alphabet of valid hexadecimal digits (both cases), as used by Spark's `unhex`.
const HEX_CHARS: &[u8] = b"0123456789abcdefABCDEF";

/// Produces a deterministic stream of `size` optional hex strings.
///
/// Each element is `None` with probability `null_density`; otherwise it is a
/// random hex string of 2..=100 characters drawn from [`HEX_CHARS`]. The RNG
/// is seeded with a fixed value (42) so every benchmark run — and every typed
/// generator below — sees the exact same value sequence.
fn random_hex_values(
    size: usize,
    null_density: f32,
) -> impl Iterator<Item = Option<String>> {
    let mut rng = StdRng::seed_from_u64(42);
    (0..size).map(move |_| {
        if rng.random::<f32>() < null_density {
            None
        } else {
            let len = rng.random_range::<usize, _>(2..=100);
            Some(
                std::iter::repeat_with(|| {
                    HEX_CHARS[rng.random_range(0..HEX_CHARS.len())] as char
                })
                .take(len)
                .collect(),
            )
        }
    })
}

/// Builds a `Utf8` array of random hex strings (with nulls) for benchmarking.
fn generate_hex_string_data(size: usize, null_density: f32) -> StringArray {
    let mut builder = StringBuilder::with_capacity(size, 0);
    for value in random_hex_values(size, null_density) {
        builder.append_option(value);
    }
    builder.finish()
}

/// Builds a `LargeUtf8` array of random hex strings (with nulls) for benchmarking.
fn generate_hex_large_string_data(size: usize, null_density: f32) -> LargeStringArray {
    let mut builder = LargeStringBuilder::with_capacity(size, 0);
    for value in random_hex_values(size, null_density) {
        builder.append_option(value);
    }
    builder.finish()
}

/// Builds a `Utf8View` array of random hex strings (with nulls) for benchmarking.
fn generate_hex_utf8view_data(size: usize, null_density: f32) -> StringViewArray {
    let mut builder = StringViewBuilder::with_capacity(size);
    for value in random_hex_values(size, null_density) {
        builder.append_option(value);
    }
    builder.finish()
}

fn run_benchmark(c: &mut Criterion, name: &str, size: usize, array: Arc<dyn Array>) {
let unhex_func = SparkUnhex::new();
let args = vec![ColumnarValue::Array(array)];
let arg_fields: Vec<_> = args
.iter()
.enumerate()
.map(|(idx, arg)| Field::new(format!("arg_{idx}"), arg.data_type(), true).into())
.collect();
let config_options = Arc::new(ConfigOptions::default());

c.bench_function(&format!("{name}/size={size}"), |b| {
b.iter(|| {
black_box(
unhex_func
.invoke_with_args(ScalarFunctionArgs {
args: args.clone(),
arg_fields: arg_fields.clone(),
number_rows: size,
return_field: Arc::new(Field::new("f", DataType::Binary, true)),
config_options: Arc::clone(&config_options),
})
.unwrap(),
)
})
});
}

/// Benchmarks `unhex` across the three Arrow string encodings
/// (`Utf8`, `LargeUtf8`, `Utf8View`) at several input sizes.
fn criterion_benchmark(c: &mut Criterion) {
    const SIZES: [usize; 3] = [1024, 4096, 8192];
    const NULL_DENSITY: f32 = 0.1;

    // Utf8 input
    for size in SIZES {
        let data = generate_hex_string_data(size, NULL_DENSITY);
        run_benchmark(c, "unhex_utf8", size, Arc::new(data));
    }

    // LargeUtf8 input
    for size in SIZES {
        let data = generate_hex_large_string_data(size, NULL_DENSITY);
        run_benchmark(c, "unhex_large_utf8", size, Arc::new(data));
    }

    // Utf8View input
    for size in SIZES {
        let data = generate_hex_utf8view_data(size, NULL_DENSITY);
        run_benchmark(c, "unhex_utf8view", size, Arc::new(data));
    }
}

criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
4 changes: 4 additions & 0 deletions datafusion/spark/src/function/math/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ pub mod hex;
pub mod modulus;
pub mod rint;
pub mod trigonometry;
pub mod unhex;
pub mod width_bucket;

use datafusion_expr::ScalarUDF;
Expand All @@ -35,6 +36,7 @@ make_udf_function!(hex::SparkHex, hex);
make_udf_function!(modulus::SparkMod, modulus);
make_udf_function!(modulus::SparkPmod, pmod);
make_udf_function!(rint::SparkRint, rint);
make_udf_function!(unhex::SparkUnhex, unhex);
make_udf_function!(width_bucket::SparkWidthBucket, width_bucket);
make_udf_function!(trigonometry::SparkCsc, csc);
make_udf_function!(trigonometry::SparkSec, sec);
Expand All @@ -57,6 +59,7 @@ pub mod expr_fn {
"Returns the double value that is closest in value to the argument and is equal to a mathematical integer.",
arg1
));
export_functions!((unhex, "Converts hexadecimal string to binary.", arg1));
export_functions!((width_bucket, "Returns the bucket number into which the value of this expression would fall after being evaluated.", arg1 arg2 arg3 arg4));
export_functions!((csc, "Returns the cosecant of expr.", arg1));
export_functions!((sec, "Returns the secant of expr.", arg1));
Expand All @@ -71,6 +74,7 @@ pub fn functions() -> Vec<Arc<ScalarUDF>> {
modulus(),
pmod(),
rint(),
unhex(),
width_bucket(),
csc(),
sec(),
Expand Down
Loading