From 98f0d9f32322074b01285f405c86df29997634a3 Mon Sep 17 00:00:00 2001 From: panbingkun Date: Thu, 12 Sep 2024 18:52:43 +0200 Subject: [PATCH] [SPARK-49605][SQL] Fix the prompt when `ascendingOrder` is `DataTypeMismatch` in `SortArray` ### What changes were proposed in this pull request? This PR fixes the error message reported when the type check of the `ascendingOrder` argument of `SortArray` fails with `DataTypeMismatch`. ### Why are the changes needed? - Consider the following code: ```scala val df = Seq((Array[Int](2, 1, 3), true), (Array.empty[Int], false)).toDF("a", "b") df.selectExpr("sort_array(a, b)").collect() ``` - Before: ```scala scala> val df = Seq((Array[Int](2, 1, 3), true), (Array.empty[Int], false)).toDF("a", "b") val df: org.apache.spark.sql.DataFrame = [a: array<int>, b: boolean] scala> df.selectExpr("sort_array(a, b)").collect() org.apache.spark.sql.catalyst.ExtendedAnalysisException: [DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE] Cannot resolve "sort_array(a, b)" due to data type mismatch: The second parameter requires the "BOOLEAN" type, however "b" has the type "BOOLEAN". SQLSTATE: 42K09; line 1 pos 0; 'Project [unresolvedalias(sort_array(a#7, b#8))] +- Project [_1#2 AS a#7, _2#3 AS b#8] +- LocalRelation [_1#2, _2#3] at org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.dataTypeMismatch(package.scala:73) at org.apache.spark.sql.catalyst.analysis.CheckAnalysis.$anonfun$checkAnalysis0$7(CheckAnalysis.scala:331) at org.apache.spark.sql.catalyst.analysis.CheckAnalysis.$anonfun$checkAnalysis0$7$adapted(CheckAnalysis.scala:313) ``` image This error message is incorrect and confusing: it states that the second parameter requires the "BOOLEAN" type while also reporting that "b" already has the type "BOOLEAN". The following code: https://github.com/apache/spark/blob/8023504e69fdd037dea002e961b960fd9fa662ba/sql/api/src/main/scala/org/apache/spark/sql/functions.scala#L7176-L7195 shows that `ascendingOrder` is actually required to be `foldable` and to have the data type `BooleanType`. 
- After: ``` scala> val df = Seq((Array[Int](2, 1, 3), true), (Array.empty[Int], false)).toDF("a", "b") val df: org.apache.spark.sql.DataFrame = [a: array<int>, b: boolean] scala> df.selectExpr("sort_array(a, b)").collect() org.apache.spark.sql.catalyst.ExtendedAnalysisException: [DATATYPE_MISMATCH.NON_FOLDABLE_INPUT] Cannot resolve "sort_array(a, b)" due to data type mismatch: the input `ascendingOrder` should be a foldable "BOOLEAN" expression; however, got "b". SQLSTATE: 42K09; line 1 pos 0; 'Project [unresolvedalias(sort_array(a#7, b#8))] +- Project [_1#2 AS a#7, _2#3 AS b#8] +- LocalRelation [_1#2, _2#3] at org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.dataTypeMismatch(package.scala:73) at org.apache.spark.sql.catalyst.analysis.CheckAnalysis.$anonfun$checkAnalysis0$7(CheckAnalysis.scala:331) at org.apache.spark.sql.catalyst.analysis.CheckAnalysis.$anonfun$checkAnalysis0$7$adapted(CheckAnalysis.scala:313) ``` image ### Does this PR introduce _any_ user-facing change? Yes. When the type check of the `ascendingOrder` argument of `SortArray` fails, the error message is now more accurate. ### How was this patch tested? - Added a new unit test. - Passed GitHub Actions. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #48082 from panbingkun/SPARK-49605. 
Authored-by: panbingkun Signed-off-by: Max Gekk --- .../expressions/collectionOperations.scala | 32 +++++++++++-------- .../analyzer-results/ansi/array.sql.out | 21 ++---------- .../sql-tests/analyzer-results/array.sql.out | 21 ++---------- .../sql-tests/results/ansi/array.sql.out | 22 ++----------- .../resources/sql-tests/results/array.sql.out | 22 ++----------- .../spark/sql/DataFrameFunctionsSuite.scala | 30 +++++++++++++++++ 6 files changed, 57 insertions(+), 91 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala index 5d5aece35383e..5cdd3c7eb62d1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala @@ -1058,20 +1058,26 @@ case class SortArray(base: Expression, ascendingOrder: Expression) override def checkInputDataTypes(): TypeCheckResult = base.dataType match { case ArrayType(dt, _) if RowOrdering.isOrderable(dt) => - ascendingOrder match { - case Literal(_: Boolean, BooleanType) => - TypeCheckResult.TypeCheckSuccess - case _ => - DataTypeMismatch( - errorSubClass = "UNEXPECTED_INPUT_TYPE", - messageParameters = Map( - "paramIndex" -> ordinalNumber(1), - "requiredType" -> toSQLType(BooleanType), - "inputSql" -> toSQLExpr(ascendingOrder), - "inputType" -> toSQLType(ascendingOrder.dataType)) - ) + if (!ascendingOrder.foldable) { + DataTypeMismatch( + errorSubClass = "NON_FOLDABLE_INPUT", + messageParameters = Map( + "inputName" -> toSQLId("ascendingOrder"), + "inputType" -> toSQLType(ascendingOrder.dataType), + "inputExpr" -> toSQLExpr(ascendingOrder))) + } else if (ascendingOrder.dataType != BooleanType) { + DataTypeMismatch( + errorSubClass = "UNEXPECTED_INPUT_TYPE", + messageParameters = Map( + "paramIndex" -> ordinalNumber(1), + 
"requiredType" -> toSQLType(BooleanType), + "inputSql" -> toSQLExpr(ascendingOrder), + "inputType" -> toSQLType(ascendingOrder.dataType)) + ) + } else { + TypeCheckResult.TypeCheckSuccess } - case ArrayType(dt, _) => + case ArrayType(_, _) => DataTypeMismatch( errorSubClass = "INVALID_ORDERING_TYPE", messageParameters = Map( diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/array.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/array.sql.out index 57108c4582f45..53595d1b8a3eb 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/array.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/array.sql.out @@ -194,25 +194,8 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException -- !query select sort_array(array('b', 'd'), cast(NULL as boolean)) -- !query analysis -org.apache.spark.sql.catalyst.ExtendedAnalysisException -{ - "errorClass" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", - "sqlState" : "42K09", - "messageParameters" : { - "inputSql" : "\"CAST(NULL AS BOOLEAN)\"", - "inputType" : "\"BOOLEAN\"", - "paramIndex" : "second", - "requiredType" : "\"BOOLEAN\"", - "sqlExpr" : "\"sort_array(array(b, d), CAST(NULL AS BOOLEAN))\"" - }, - "queryContext" : [ { - "objectType" : "", - "objectName" : "", - "startIndex" : 8, - "stopIndex" : 57, - "fragment" : "sort_array(array('b', 'd'), cast(NULL as boolean))" - } ] -} +Project [sort_array(array(b, d), cast(null as boolean)) AS sort_array(array(b, d), CAST(NULL AS BOOLEAN))#x] ++- OneRowRelation -- !query diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/array.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/array.sql.out index fb331089d7545..4db56d6c70561 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/array.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/array.sql.out @@ -194,25 +194,8 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException -- !query select 
sort_array(array('b', 'd'), cast(NULL as boolean)) -- !query analysis -org.apache.spark.sql.catalyst.ExtendedAnalysisException -{ - "errorClass" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", - "sqlState" : "42K09", - "messageParameters" : { - "inputSql" : "\"CAST(NULL AS BOOLEAN)\"", - "inputType" : "\"BOOLEAN\"", - "paramIndex" : "second", - "requiredType" : "\"BOOLEAN\"", - "sqlExpr" : "\"sort_array(array(b, d), CAST(NULL AS BOOLEAN))\"" - }, - "queryContext" : [ { - "objectType" : "", - "objectName" : "", - "startIndex" : 8, - "stopIndex" : 57, - "fragment" : "sort_array(array('b', 'd'), cast(NULL as boolean))" - } ] -} +Project [sort_array(array(b, d), cast(null as boolean)) AS sort_array(array(b, d), CAST(NULL AS BOOLEAN))#x] ++- OneRowRelation -- !query diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/array.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/array.sql.out index d17d87900fc71..7394e428091c7 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/array.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/array.sql.out @@ -151,27 +151,9 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException -- !query select sort_array(array('b', 'd'), cast(NULL as boolean)) -- !query schema -struct<> +struct<sort_array(array(b, d), CAST(NULL AS BOOLEAN)):array<string>> -- !query output -org.apache.spark.sql.catalyst.ExtendedAnalysisException -{ - "errorClass" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", - "sqlState" : "42K09", - "messageParameters" : { - "inputSql" : "\"CAST(NULL AS BOOLEAN)\"", - "inputType" : "\"BOOLEAN\"", - "paramIndex" : "second", - "requiredType" : "\"BOOLEAN\"", - "sqlExpr" : "\"sort_array(array(b, d), CAST(NULL AS BOOLEAN))\"" - }, - "queryContext" : [ { - "objectType" : "", - "objectName" : "", - "startIndex" : 8, - "stopIndex" : 57, - "fragment" : "sort_array(array('b', 'd'), cast(NULL as boolean))" - } ] -} +NULL -- !query diff --git a/sql/core/src/test/resources/sql-tests/results/array.sql.out 
b/sql/core/src/test/resources/sql-tests/results/array.sql.out index 92da0a490ff81..c1330c620acfb 100644 --- a/sql/core/src/test/resources/sql-tests/results/array.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/array.sql.out @@ -151,27 +151,9 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException -- !query select sort_array(array('b', 'd'), cast(NULL as boolean)) -- !query schema -struct<> +struct<sort_array(array(b, d), CAST(NULL AS BOOLEAN)):array<string>> -- !query output -org.apache.spark.sql.catalyst.ExtendedAnalysisException -{ - "errorClass" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", - "sqlState" : "42K09", - "messageParameters" : { - "inputSql" : "\"CAST(NULL AS BOOLEAN)\"", - "inputType" : "\"BOOLEAN\"", - "paramIndex" : "second", - "requiredType" : "\"BOOLEAN\"", - "sqlExpr" : "\"sort_array(array(b, d), CAST(NULL AS BOOLEAN))\"" - }, - "queryContext" : [ { - "objectType" : "", - "objectName" : "", - "startIndex" : 8, - "stopIndex" : 57, - "fragment" : "sort_array(array('b', 'd'), cast(NULL as boolean))" - } ] -} +NULL -- !query diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala index d488adc5ac3d1..f16171940df21 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala @@ -964,6 +964,36 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { queryContext = Array(ExpectedContext("", "", 0, 12, "sort_array(a)")) ) + val df4 = Seq((Array[Int](2, 1, 3), true), (Array.empty[Int], false)).toDF("a", "b") + checkError( + exception = intercept[AnalysisException] { + df4.selectExpr("sort_array(a, b)").collect() + }, + condition = "DATATYPE_MISMATCH.NON_FOLDABLE_INPUT", + sqlState = "42K09", + parameters = Map( + "inputName" -> "`ascendingOrder`", + "inputType" -> "\"BOOLEAN\"", + "inputExpr" -> "\"b\"", + "sqlExpr" -> "\"sort_array(a, b)\""), + context = 
ExpectedContext(fragment = "sort_array(a, b)", start = 0, stop = 15) + ) + + checkError( + exception = intercept[AnalysisException] { + df4.selectExpr("sort_array(a, 'A')").collect() + }, + condition = "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE", + sqlState = "42K09", + parameters = Map( + "sqlExpr" -> "\"sort_array(a, A)\"", + "paramIndex" -> "second", + "inputSql" -> "\"A\"", + "inputType" -> "\"STRING\"", + "requiredType" -> "\"BOOLEAN\""), + context = ExpectedContext(fragment = "sort_array(a, 'A')", start = 0, stop = 17) + ) + checkAnswer( df.select(array_sort($"a"), array_sort($"b")), Seq(