
Commit 9fe9ec7

fix: spark crc32 custom nullability (#19271)
## Which issue does this PR close?

- Closes #19157

## Rationale for this change

The `crc32` UDF was using the default `return_type` implementation, which does not preserve nullability information the way [Spark CRC32](https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala#L213-L240) does. The default implementation:

* only returns the data type (Int64),
* doesn't consider the nullability of the inputs, and
* would always mark the output as non-nullable.

## What changes are included in this PR?

* Implemented `return_field_from_args`: creates a field with Int64 type and correctly propagates nullability from input fields and scalar arguments.
* Updated `return_type`: now returns an error directing callers to use `return_field_from_args` instead.
* Added the necessary imports (`Field`, `FieldRef`, and `ReturnFieldArgs`) to support the new implementation.
* Added comprehensive nullability tests verifying that nullable inputs, non-nullable inputs, and null scalar literals are all handled correctly.

## Are these changes tested?

Yes, the new unit tests verify that:

* a non-nullable Binary input produces non-nullable Int64 output,
* a nullable Binary input produces nullable Int64 output,
* a null scalar literal (e.g., `crc32(NULL)`) produces nullable Int64 output, and
* the data type is correctly set to Int64 in all cases.

## Are there any user-facing changes?

This is a bug fix that corrects schema metadata only; it does not change the actual computation or introduce any breaking changes to the API.
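To make the rule concrete, here is a minimal standalone sketch of the nullability propagation described above; the function and parameter names are invented for illustration and are not part of the DataFusion API:

```rust
/// Sketch of the rule implemented in this PR: the crc32 output is nullable
/// iff any input field is nullable, or any plan-time constant argument is a
/// literal NULL (`None` here means the argument value is unknown at plan time).
fn crc32_output_nullable(input_nullable: &[bool], scalar_is_null: &[Option<bool>]) -> bool {
    input_nullable.iter().any(|&n| n)
        || scalar_is_null.iter().any(|s| s.unwrap_or(false))
}

fn main() {
    assert!(!crc32_output_nullable(&[false], &[None]));      // non-nullable column
    assert!(crc32_output_nullable(&[true], &[None]));        // nullable column
    assert!(crc32_output_nullable(&[false], &[Some(true)])); // crc32(NULL)
}
```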
1 parent 1acaf7a commit 9fe9ec7

File tree: 1 file changed, +53 −4 lines

  • datafusion/spark/src/function/hash/crc32.rs

datafusion/spark/src/function/hash/crc32.rs

Lines changed: 53 additions & 4 deletions
```diff
@@ -19,7 +19,7 @@ use std::any::Any;
 use std::sync::Arc;
 
 use arrow::array::{ArrayRef, Int64Array};
-use arrow::datatypes::DataType;
+use arrow::datatypes::{DataType, Field, FieldRef};
 use crc32fast::Hasher;
 use datafusion_common::cast::{
     as_binary_array, as_binary_view_array, as_fixed_size_binary_array,
@@ -29,8 +29,8 @@ use datafusion_common::types::{NativeType, logical_string};
 use datafusion_common::utils::take_function_args;
 use datafusion_common::{Result, internal_err};
 use datafusion_expr::{
-    Coercion, ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature,
-    TypeSignatureClass, Volatility,
+    Coercion, ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl,
+    Signature, TypeSignatureClass, Volatility,
 };
 use datafusion_functions::utils::make_scalar_function;
 
@@ -75,7 +75,16 @@ impl ScalarUDFImpl for SparkCrc32 {
     }
 
     fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
-        Ok(DataType::Int64)
+        internal_err!("return_field_from_args should be used instead")
+    }
+
+    fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
+        let nullable = args.arg_fields.iter().any(|f| f.is_nullable())
+            || args
+                .scalar_arguments
+                .iter()
+                .any(|scalar| scalar.is_some_and(|s| s.is_null()));
+        Ok(Arc::new(Field::new(self.name(), DataType::Int64, nullable)))
     }
 
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
@@ -122,3 +131,43 @@ fn spark_crc32(args: &[ArrayRef]) -> Result<ArrayRef> {
         }
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use datafusion_common::ScalarValue;
+
+    #[test]
+    fn test_crc32_nullability() -> Result<()> {
+        let crc32_func = SparkCrc32::new();
+
+        // non-nullable field should produce non-nullable output
+        let field_not_null = Arc::new(Field::new("data", DataType::Binary, false));
+        let result = crc32_func.return_field_from_args(ReturnFieldArgs {
+            arg_fields: std::slice::from_ref(&field_not_null),
+            scalar_arguments: &[None],
+        })?;
+        assert!(!result.is_nullable());
+        assert_eq!(result.data_type(), &DataType::Int64);
+
+        // nullable field should produce nullable output
+        let field_nullable = Arc::new(Field::new("data", DataType::Binary, true));
+        let result = crc32_func.return_field_from_args(ReturnFieldArgs {
+            arg_fields: &[field_nullable],
+            scalar_arguments: &[None],
+        })?;
+        assert!(result.is_nullable());
+        assert_eq!(result.data_type(), &DataType::Int64);
+
+        // null scalar value - user input literal NULL
+        let scalar_null = ScalarValue::Binary(None);
+        let result = crc32_func.return_field_from_args(ReturnFieldArgs {
+            arg_fields: &[field_not_null],
+            scalar_arguments: &[Some(&scalar_null)],
+        })?;
+        assert!(result.is_nullable());
+        assert_eq!(result.data_type(), &DataType::Int64);
+
+        Ok(())
+    }
+}
```
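For context, here is a hedged end-to-end sketch of where the corrected metadata surfaces. This is not part of the PR: the `datafusion_spark` import path and the registered function name `crc32` are assumptions, and the snippet needs `tokio` plus the `datafusion` and `datafusion-spark` crates on the dependency list.

```rust
use std::sync::Arc;

use arrow::array::BinaryArray;
use arrow::datatypes::{DataType, Field, Schema};
use arrow::record_batch::RecordBatch;
use datafusion::prelude::SessionContext;
use datafusion_expr::ScalarUDF;
// Assumed path; the struct lives in datafusion/spark/src/function/hash/crc32.rs.
use datafusion_spark::function::hash::crc32::SparkCrc32;

#[tokio::main]
async fn main() -> datafusion::error::Result<()> {
    let ctx = SessionContext::new();
    ctx.register_udf(ScalarUDF::new_from_impl(SparkCrc32::new()));

    // A table with a single non-nullable Binary column.
    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Binary, false)]));
    let batch = RecordBatch::try_new(
        Arc::clone(&schema),
        vec![Arc::new(BinaryArray::from(vec![b"abc".as_ref()]))],
    )?;
    ctx.register_batch("t", batch)?;

    // With this fix, the planned output field for `crc32(a)` should be a
    // non-nullable Int64, matching the input column's nullability.
    let df = ctx.sql("SELECT crc32(a) AS c FROM t").await?;
    assert!(!df.schema().field(0).is_nullable());
    Ok(())
}
```

The only observable change is the `is_nullable` flag on the planned schema; the values produced at execution time are unchanged.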
