From 93bfdc25a2dcec378e22a4d1e268ef2df07e8c60 Mon Sep 17 00:00:00 2001 From: Kumar Ujjawal Date: Fri, 30 Jan 2026 15:14:24 +0530 Subject: [PATCH 1/2] perf: Optimize scalar path for chr function --- datafusion/functions/benches/chr.rs | 31 ++++++++++++++--- datafusion/functions/src/string/chr.rs | 46 ++++++++++++++++++++++++-- 2 files changed, 69 insertions(+), 8 deletions(-) diff --git a/datafusion/functions/benches/chr.rs b/datafusion/functions/benches/chr.rs index 9a6342ca40bb6..fcb72771acc42 100644 --- a/datafusion/functions/benches/chr.rs +++ b/datafusion/functions/benches/chr.rs @@ -19,6 +19,7 @@ extern crate criterion; use arrow::{array::PrimitiveArray, datatypes::Int64Type}; use criterion::{Criterion, criterion_group, criterion_main}; +use datafusion_common::ScalarValue; use datafusion_expr::{ColumnarValue, ScalarFunctionArgs}; use datafusion_functions::string::chr; use rand::{Rng, SeedableRng}; @@ -35,11 +36,32 @@ pub fn seedable_rng() -> StdRng { } fn criterion_benchmark(c: &mut Criterion) { - let cot_fn = chr(); + let chr_fn = chr(); + let config_options = Arc::new(ConfigOptions::default()); + + // Scalar benchmarks + c.bench_function("chr/scalar", |b| { + let args = vec![ColumnarValue::Scalar(ScalarValue::Int64(Some(65)))]; + let arg_fields = vec![Field::new("arg_0", DataType::Int64, true).into()]; + b.iter(|| { + black_box( + chr_fn + .invoke_with_args(ScalarFunctionArgs { + args: args.clone(), + arg_fields: arg_fields.clone(), + number_rows: 1, + return_field: Field::new("f", DataType::Utf8, true).into(), + config_options: Arc::clone(&config_options), + }) + .unwrap(), + ) + }) + }); + let size = 1024; let input: PrimitiveArray<Int64Type> = { let null_density = 0.2; - let mut rng = StdRng::seed_from_u64(42); + let mut rng = seedable_rng(); (0..size) .map(|_| { if rng.random::() < null_density { @@ -57,12 +79,11 @@ fn criterion_benchmark(c: &mut Criterion) { .enumerate() .map(|(idx, arg)| Field::new(format!("arg_{idx}"), arg.data_type(), true).into())
.collect::<Vec<_>>(); - let config_options = Arc::new(ConfigOptions::default()); - c.bench_function("chr", |b| { + c.bench_function("chr/array", |b| { b.iter(|| { black_box( - cot_fn + chr_fn .invoke_with_args(ScalarFunctionArgs { args: args.clone(), arg_fields: arg_fields.clone(), diff --git a/datafusion/functions/src/string/chr.rs b/datafusion/functions/src/string/chr.rs index ba011b94367e3..a8bfe67fd21f4 100644 --- a/datafusion/functions/src/string/chr.rs +++ b/datafusion/functions/src/string/chr.rs @@ -24,9 +24,9 @@ use arrow::datatypes::DataType; use arrow::datatypes::DataType::Int64; use arrow::datatypes::DataType::Utf8; -use crate::utils::make_scalar_function; use datafusion_common::cast::as_int64_array; -use datafusion_common::{Result, exec_err}; +use datafusion_common::utils::take_function_args; +use datafusion_common::{Result, ScalarValue, exec_err, internal_err}; use datafusion_expr::{ColumnarValue, Documentation, Volatility}; use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl, Signature}; use datafusion_macros::user_doc; @@ -119,7 +119,47 @@ impl ScalarUDFImpl for ChrFunc { } fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> { - make_scalar_function(chr, vec![])(&args.args) + let return_type = args.return_field.data_type(); + let [arg] = take_function_args(self.name(), args.args)?; + + match arg { + ColumnarValue::Scalar(scalar) => { + if scalar.is_null() { + return Ok(ColumnarValue::Scalar(ScalarValue::try_from( + return_type, + )?)); + } + + let code_point = match scalar { + ScalarValue::Int64(Some(v)) => v, + _ => { + return internal_err!( + "Unexpected data type {:?} for function chr", + scalar.data_type() + ); + } + }; + + if let Ok(u) = u32::try_from(code_point) + && let Some(c) = core::char::from_u32(u) + { + Ok(ColumnarValue::Scalar(ScalarValue::Utf8(Some( + c.to_string(), + )))) + } else { + exec_err!("invalid Unicode scalar value: {code_point}") + } + } + ColumnarValue::Array(array) => { + if !matches!(array.data_type(), Int64) { +
return internal_err!( + "Unexpected data type {:?} for function chr", + array.data_type() + ); + } + Ok(ColumnarValue::Array(chr(&[array])?)) + } + } } fn documentation(&self) -> Option<&Documentation> { From c3b33ca6700521381e9c29639726ef5338cefab9 Mon Sep 17 00:00:00 2001 From: Kumar Ujjawal Date: Sat, 31 Jan 2026 12:28:16 +0530 Subject: [PATCH 2/2] refactor for match and add slt --- datafusion/functions/src/string/chr.rs | 130 +++++++++++--------- datafusion/sqllogictest/test_files/expr.slt | 10 ++ 2 files changed, 83 insertions(+), 57 deletions(-) diff --git a/datafusion/functions/src/string/chr.rs b/datafusion/functions/src/string/chr.rs index a8bfe67fd21f4..2f432c838e010 100644 --- a/datafusion/functions/src/string/chr.rs +++ b/datafusion/functions/src/string/chr.rs @@ -18,8 +18,7 @@ use std::any::Any; use std::sync::Arc; -use arrow::array::ArrayRef; -use arrow::array::GenericStringBuilder; +use arrow::array::{ArrayRef, GenericStringBuilder, Int64Array}; use arrow::datatypes::DataType; use arrow::datatypes::DataType::Int64; use arrow::datatypes::DataType::Utf8; @@ -33,9 +32,7 @@ use datafusion_macros::user_doc; /// Returns the character with the given code. 
/// chr(65) = 'A' -fn chr(args: &[ArrayRef]) -> Result<ArrayRef> { - let integer_array = as_int64_array(&args[0])?; - +fn chr_array(integer_array: &Int64Array) -> Result<ArrayRef> { let mut builder = GenericStringBuilder::<i32>::with_capacity( integer_array.len(), // 1 byte per character, assuming that is the common case @@ -56,15 +53,11 @@ fn chr(args: &[ArrayRef]) -> Result<ArrayRef> { return exec_err!("invalid Unicode scalar value: {integer}"); } - None => { - builder.append_null(); - } + None => builder.append_null(), } } - let result = builder.finish(); - - Ok(Arc::new(result) as ArrayRef) + Ok(Arc::new(builder.finish()) as ArrayRef) } #[user_doc( @@ -119,27 +112,10 @@ impl ScalarUDFImpl for ChrFunc { } fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> { - let return_type = args.return_field.data_type(); let [arg] = take_function_args(self.name(), args.args)?; match arg { - ColumnarValue::Scalar(scalar) => { - if scalar.is_null() { - return Ok(ColumnarValue::Scalar(ScalarValue::try_from( - return_type, - )?)); - } - - let code_point = match scalar { - ScalarValue::Int64(Some(v)) => v, - _ => { - return internal_err!( - "Unexpected data type {:?} for function chr", - scalar.data_type() - ); - } - }; - + ColumnarValue::Scalar(ScalarValue::Int64(Some(code_point))) => { if let Ok(u) = u32::try_from(code_point) && let Some(c) = core::char::from_u32(u) { @@ -150,15 +126,17 @@ impl ScalarUDFImpl for ChrFunc { exec_err!("invalid Unicode scalar value: {code_point}") } } + ColumnarValue::Scalar(ScalarValue::Int64(None)) => { + Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None))) + } ColumnarValue::Array(array) => { - if !matches!(array.data_type(), Int64) { - return internal_err!( - "Unexpected data type {:?} for function chr", - array.data_type() - ); - } - Ok(ColumnarValue::Array(chr(&[array])?)) + let integer_array = as_int64_array(&array)?; + Ok(ColumnarValue::Array(chr_array(integer_array)?)) } + other => internal_err!( + "Unexpected data type {:?} for function chr", + other.data_type() +
), } } @@ -170,13 +148,27 @@ impl ScalarUDFImpl for ChrFunc { #[cfg(test)] mod tests { use super::*; + use arrow::array::{Array, Int64Array, StringArray}; + use arrow::datatypes::Field; use datafusion_common::assert_contains; + use datafusion_common::config::ConfigOptions; + use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl}; + + fn invoke_chr(arg: ColumnarValue, number_rows: usize) -> Result<ColumnarValue> { + ChrFunc::new().invoke_with_args(ScalarFunctionArgs { + args: vec![arg], + arg_fields: vec![Field::new("a", Int64, true).into()], + number_rows, + return_field: Field::new("f", Utf8, true).into(), + config_options: Arc::new(ConfigOptions::default()), + }) + } #[test] fn test_chr_normal() { let input = Arc::new(Int64Array::from(vec![ - Some(0), // null + Some(0), // \u{0000} Some(65), // A Some(66), // B Some(67), // C @@ -189,8 +181,13 @@ mod tests { Some(9), // tab Some(0x10FFFF), // 0x10FFFF, the largest Unicode code point ])); - let result = chr(&[input]).unwrap(); - let string_array = result.as_any().downcast_ref::<StringArray>().unwrap(); + + let result = invoke_chr(ColumnarValue::Array(input), 12).unwrap(); + let ColumnarValue::Array(arr) = result else { + panic!("Expected array"); + }; + let string_array = arr.as_any().downcast_ref::<StringArray>().unwrap(); + let expected = [ "\u{0000}", "A", @@ -214,55 +211,48 @@ mod tests { #[test] fn test_chr_error() { - // invalid Unicode code points (too large) let input = Arc::new(Int64Array::from(vec![i64::MAX])); - let result = chr(&[input]); + let result = invoke_chr(ColumnarValue::Array(input), 1); assert!(result.is_err()); assert_contains!( result.err().unwrap().to_string(), "invalid Unicode scalar value: 9223372036854775807" ); - // invalid Unicode code points (too large) case 2 let input = Arc::new(Int64Array::from(vec![0x10FFFF + 1])); - let result = chr(&[input]); + let result = invoke_chr(ColumnarValue::Array(input), 1); assert!(result.is_err()); assert_contains!( result.err().unwrap().to_string(), "invalid Unicode scalar
value: 1114112" ); - // invalid Unicode code points (surrogate code point) - // link: let input = Arc::new(Int64Array::from(vec![0xD800 + 1])); - let result = chr(&[input]); + let result = invoke_chr(ColumnarValue::Array(input), 1); assert!(result.is_err()); assert_contains!( result.err().unwrap().to_string(), "invalid Unicode scalar value: 55297" ); - // negative input - let input = Arc::new(Int64Array::from(vec![i64::MIN + 2i64])); // will be 2 if cast to u32 - let result = chr(&[input]); + let input = Arc::new(Int64Array::from(vec![i64::MIN + 2i64])); + let result = invoke_chr(ColumnarValue::Array(input), 1); assert!(result.is_err()); assert_contains!( result.err().unwrap().to_string(), "invalid Unicode scalar value: -9223372036854775806" ); - // negative input case 2 let input = Arc::new(Int64Array::from(vec![-1])); - let result = chr(&[input]); + let result = invoke_chr(ColumnarValue::Array(input), 1); assert!(result.is_err()); assert_contains!( result.err().unwrap().to_string(), "invalid Unicode scalar value: -1" ); - // one error with valid values after - let input = Arc::new(Int64Array::from(vec![65, -1, 66])); // A, -1, B - let result = chr(&[input]); + let input = Arc::new(Int64Array::from(vec![65, -1, 66])); + let result = invoke_chr(ColumnarValue::Array(input), 3); assert!(result.is_err()); assert_contains!( result.err().unwrap().to_string(), @@ -272,10 +262,36 @@ mod tests { #[test] fn test_chr_empty() { - // empty input array let input = Arc::new(Int64Array::from(Vec::<i64>::new())); - let result = chr(&[input]).unwrap(); - let string_array = result.as_any().downcast_ref::<StringArray>().unwrap(); + let result = invoke_chr(ColumnarValue::Array(input), 0).unwrap(); + let ColumnarValue::Array(arr) = result else { + panic!("Expected array"); + }; + let string_array = arr.as_any().downcast_ref::<StringArray>().unwrap(); assert_eq!(string_array.len(), 0); } + + #[test] + fn test_chr_scalar() { + let result = + invoke_chr(ColumnarValue::Scalar(ScalarValue::Int64(Some(65))), 1).unwrap();
+ + match result { + ColumnarValue::Scalar(ScalarValue::Utf8(Some(s))) => { + assert_eq!(s, "A"); + } + other => panic!("Unexpected result: {other:?}"), + } + } + + #[test] + fn test_chr_scalar_null() { + let result = + invoke_chr(ColumnarValue::Scalar(ScalarValue::Int64(None)), 1).unwrap(); + + match result { + ColumnarValue::Scalar(ScalarValue::Utf8(None)) => {} + other => panic!("Unexpected result: {other:?}"), + } + } } diff --git a/datafusion/sqllogictest/test_files/expr.slt b/datafusion/sqllogictest/test_files/expr.slt index 00b581632748c..90fe05815fbff 100644 --- a/datafusion/sqllogictest/test_files/expr.slt +++ b/datafusion/sqllogictest/test_files/expr.slt @@ -432,6 +432,16 @@ SELECT chr(CAST(0 AS int)) statement error DataFusion error: Execution error: invalid Unicode scalar value: 9223372036854775807 SELECT chr(CAST(9223372036854775807 AS bigint)) +statement error DataFusion error: Execution error: invalid Unicode scalar value: 1114112 +SELECT chr(CAST(1114112 AS bigint)) + +statement error DataFusion error: Execution error: invalid Unicode scalar value: -1 +SELECT chr(CAST(-1 AS bigint)) + +# surrogate code point (invalid scalar value) +statement error DataFusion error: Execution error: invalid Unicode scalar value: 55297 +SELECT chr(CAST(55297 AS bigint)) + query T SELECT concat('a','b','c') ----