apache · sumitsingh-in · Jun 10, 2024 · Mar 6, 2025 · Mar 6, 2025
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ExtractPythonUDFs.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ExtractPythonUDFs.scala
@@ -124,7 +124,7 @@ object ExtractGroupingPythonUDFFromAggregate extends Rule[LogicalPlan] {
       }
     }
     val aggExpr = agg.aggregateExpressions.map { expr =>
-      expr.transformUp {
+      expr.transformDown {
         // PythonUDF over aggregate was pull out by ExtractPythonUDFFromAggregate.
         // PythonUDF here should be either
         // 1. Argument of an aggregate function.

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonUDFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonUDFSuite.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.sql.execution.python
 
 import org.apache.spark.sql.{AnalysisException, IntegratedUDFTestUtils, QueryTest, Row}
-import org.apache.spark.sql.functions.{array, col, count, transform}
+import org.apache.spark.sql.functions.{array, col, count, countDistinct, transform}
 import org.apache.spark.sql.test.SharedSparkSession
 import org.apache.spark.sql.types.LongType
 
@@ -139,4 +139,34 @@ class PythonUDFSuite extends QueryTest with SharedSparkSession {
       checkAnswer(df, Row(0, 1, 1, 0, 1, 1))
     }
   }
+
+  test("SPARK-48311: Nested pythonUDF in groupBy and aggregate") {
+    assume(shouldTestPythonUDFs)
+    withTempView("testCacheTable") {
+      // One-row input, registered as a temp view and read back through SELECT DISTINCT.
+      Seq(Some("1")).toDF("col3").createOrReplaceTempView("testCacheTable")
+      val source = spark.sql("SELECT DISTINCT col3 FROM testCacheTable")
+
+      // Python pipeline: col4 = pyUDF1(col3), col5 = pyUDF2(col4), i.e. a UDF fed by a UDF.
+      val innerPyUdf = TestPythonUDF(name = "pyUDF1")
+      val outerPyUdf = TestPythonUDF(name = "pyUDF2")
+      val pyStage = source.withColumn("col4", innerPyUdf(source("col3")))
+      val pythonCounts = pyStage
+        .withColumn("col5", outerPyUdf(pyStage("col4")))
+        .groupBy("col4", "col5", "col3")
+        .agg(countDistinct("col5").alias("col6"))
+
+      // Scala pipeline of the same shape, built for comparison.
+      val innerScalaUdf = TestScalaUDF(name = "scalaUDF")
+      val outerScalaUdf = TestScalaUDF(name = "scalaUDF1")
+      val scalaStage = source.withColumn("col4", innerScalaUdf(source("col3")))
+      val scalaCounts = scalaStage
+        .withColumn("col5", outerScalaUdf(scalaStage("col4")))
+        .groupBy("col4", "col5", "col3")
+        .agg(countDistinct("col5").alias("col6"))
+
+      // The grouped distinct counts of the two pipelines are compared row for row.
+      checkAnswer(scalaCounts, pythonCounts)
+    }
+  }
 }