Commit fe5a9eb

returning pdf for initial state processing
jingz-db committed Sep 10, 2024
1 parent a6f48f3 commit fe5a9eb
Showing 7 changed files with 21 additions and 24 deletions.
4 changes: 1 addition & 3 deletions python/pyspark/sql/pandas/group_ops.py
@@ -509,11 +509,9 @@ def transformWithStateUDF(

            # only process initial state if first batch
            batch_id = statefulProcessorApiClient.get_batch_id()
-           """
-           if batch_id == 0 and initialState is not None:
+           if batch_id == 0:
                initial_state = statefulProcessorApiClient.get_initial_state(key)
                statefulProcessor.handleInitialState(key, initial_state)
-           """

            statefulProcessorApiClient.set_implicit_key(key)
            result = statefulProcessor.handleInputRows(key, inputRows)
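For orientation, a minimal sketch of the flow this hunk enables, with invented stand-ins for the PySpark-provided API client and the user's StatefulProcessor (illustrative only, not the actual internals): on the first micro-batch (batch_id == 0) the UDF fetches the key's initial state as a pandas DataFrame and hands it to handleInitialState before processing input rows.

    import pandas as pd

    def transform_with_state_udf(api_client, processor, key, input_rows):
        # Initial state is only consulted on the very first micro-batch.
        if api_client.get_batch_id() == 0:
            initial_state = api_client.get_initial_state(key)  # a pandas DataFrame
            processor.handleInitialState(key, initial_state)
        api_client.set_implicit_key(key)
        return processor.handleInputRows(key, input_rows)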
8 changes: 4 additions & 4 deletions python/pyspark/sql/streaming/stateful_processor_api_client.py
@@ -147,7 +147,7 @@ def get_initial_state(self, key: Tuple) -> "PandasDataFrameLike":
        from pandas import DataFrame
        import pyspark.sql.streaming.StateMessage_pb2 as stateMessage

-       bytes = self._stateful_processor_api_client._serialize_to_bytes(self.key_schema, key)
+       bytes = self._serialize_to_bytes(self.key_schema, key)

        get_initial_state = stateMessage.GetInitialState(value=bytes)
        request = stateMessage.UtilsCallCommand(getInitialState=get_initial_state)
@@ -159,12 +159,12 @@ def get_initial_state(self, key: Tuple) -> "PandasDataFrameLike":
        status = response_message[0]
        if status == 1:
            DataFrame()
-       if status == 0:
-           iterator = self._stateful_processor_api_client._read_arrow_state()
+       elif status == 0:
+           iterator = self._read_arrow_state()
            batch = next(iterator)
            return batch.to_pandas()
        else:
-           raise StopIteration()
+           raise PySparkRuntimeError(f"Error getting initial state: " f"{response_message[1]}")

    def _send_proto_message(self, message: bytes) -> None:
        # Writing zero here to indicate message version. This allows us to evolve the message
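The success path above ends with _read_arrow_state() followed by batch.to_pandas(). As a rough illustration of that pattern (not PySpark's actual transport code), the sketch below round-trips a single Arrow record batch through an IPC stream with pyarrow and converts it back to pandas; the data and function name are invented for the example.

    import io

    import pandas as pd
    import pyarrow as pa

    def read_initial_state(stream_bytes: bytes) -> pd.DataFrame:
        # Read the first record batch off an Arrow IPC stream and return it as pandas.
        reader = pa.ipc.open_stream(io.BytesIO(stream_bytes))
        batch = next(iter(reader))
        return batch.to_pandas()

    # Quick round trip to show the shape of the data.
    table = pa.table({"value": ["init"], "count": [3]})
    sink = pa.BufferOutputStream()
    writer = pa.ipc.new_stream(sink, table.schema)
    writer.write_table(table)
    writer.close()
    print(read_initial_state(sink.getvalue().to_pybytes()))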
@@ -308,7 +308,7 @@ def handleInputRows(self, key, rows) -> Iterator[pd.DataFrame]:
        yield pd.DataFrame({"id": key, "countAsString": str(count)})

    def handleInitialState(self, key, initialState) -> None:
-       pass
+       raise Exception(f"I am inside handleInitialState, init state: {initialState.get('sth')}")

    def close(self) -> None:
        pass
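The raise above reads like a debugging probe to confirm the initial-state DataFrame reaches the Python side. For contrast, a hypothetical user-facing processor (class and column names invented, assuming rows arrive as an iterator of pandas DataFrames) would typically use handleInitialState to seed per-key state:

    import pandas as pd

    class InitialStateCounter:
        """Seeds a per-key count from the initial-state DataFrame, then keeps counting."""

        def __init__(self):
            self.count = 0

        def handleInitialState(self, key, initialState: pd.DataFrame) -> None:
            # initialState is the pandas DataFrame delivered for this grouping key.
            if initialState is not None and "count" in initialState.columns:
                self.count = int(initialState["count"].iloc[0])

        def handleInputRows(self, key, rows):
            for batch in rows:  # rows is an iterator of pandas DataFrames
                self.count += len(batch)
            yield pd.DataFrame({"id": [str(key)], "countAsString": [str(self.count)]})

        def close(self) -> None:
            pass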
@@ -511,10 +511,17 @@ class RelationalGroupedDataset protected[sql](
    val leftChild = df.logicalPlan
    val rightChild = initialState.df.logicalPlan

+   /*
    val left = df.sparkSession.sessionState.executePlan(
      Project(groupingAttrs ++ leftChild.output, leftChild)).analyzed
    val right = initialState.df.sparkSession.sessionState.executePlan(
-     Project(initGroupingAttrs ++ rightChild.output, rightChild)).analyzed
+     Project(initGroupingAttrs ++ rightChild.output, rightChild)).analyzed */
+
+   val left = df.sparkSession.sessionState.executePlan(
+     Project(leftChild.output, leftChild)).analyzed
+   val right = initialState.df.sparkSession.sessionState.executePlan(
+     Project(rightChild.output, rightChild)).analyzed
+

    TransformWithStateInPandas(
      func.expr,
@@ -209,7 +209,6 @@ case class TransformWithStateInPandasExec(
jobArtifactUUID,
groupingKeySchema,
hasInitialState,
-initialStateGroupingAttrs,
initialStateSchema,
initStateIterator
)
@@ -29,7 +29,6 @@ import org.apache.spark.TaskContext
import org.apache.spark.api.python.{BasePythonRunner, ChainedPythonFunctions, PythonRDD}
import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.execution.metric.SQLMetric
import org.apache.spark.sql.execution.python.TransformWithStateInPandasPythonRunner.{InType, OutType}
import org.apache.spark.sql.execution.streaming.StatefulProcessorHandleImpl
@@ -53,7 +52,6 @@ class TransformWithStateInPandasPythonRunner(
jobArtifactUUID: Option[String],
groupingKeySchema: StructType,
hasInitialState: Boolean,
-initialStateGroupingAttrs: Seq[Attribute],
initialStateSchema: StructType,
initialStateDataIterator: Iterator[(InternalRow, Iterator[InternalRow])])
extends BasePythonRunner[InType, OutType](funcs.map(_._1), evalType, argOffsets, jobArtifactUUID)
@@ -111,7 +109,6 @@ class TransformWithStateInPandasPythonRunner(
groupingKeySchema, timeZoneId, errorOnDuplicatedFieldNames, largeVarTypes,
arrowMaxRecordsPerBatch,
hasInitialState = hasInitialState,
-initialStateGroupingAttrs = initialStateGroupingAttrs,
initialStateSchema = initialStateSchema,
initialStateDataIterator = initialStateDataIterator))

@@ -31,7 +31,6 @@ import org.apache.spark.sql.{Encoders, Row}
import org.apache.spark.sql.api.python.PythonSQLUtils
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
-import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.execution.streaming.{ImplicitGroupingKeyTracker, StatefulProcessorHandleImpl, StatefulProcessorHandleState}
import org.apache.spark.sql.execution.streaming.state.StateMessage.{HandleState, ImplicitGroupingKeyRequest, StatefulProcessorCall, StateRequest, StateResponse, StateVariableRequest, UtilsCallCommand, ValueStateCall}
import org.apache.spark.sql.execution.streaming.state.StateStoreErrors
@@ -61,7 +60,6 @@ class TransformWithStateInPandasStateServer(
valueStateMapForTest: mutable.HashMap[String,
(ValueState[Row], StructType, ExpressionEncoder.Deserializer[Row])] = null,
hasInitialState: Boolean,
-initialStateGroupingAttrs: Seq[Attribute],
initialStateSchema: StructType,
initialStateDataIterator: Iterator[(InternalRow, Iterator[InternalRow])])
extends Runnable with Logging {
@@ -187,14 +185,11 @@ class TransformWithStateInPandasStateServer(
sendResponse(0, null, ByteString.copyFromUtf8(valueStr))

case UtilsCallCommand.MethodCase.GETINITIALSTATE =>
val keyBytes = message.getGetInitialState.getValue.toByteArray
// The key row is serialized as a byte array, we need to convert it back to a Row
val keyRow = PythonSQLUtils.toJVMRow(keyBytes, groupingKeySchema, keyRowDeserializer)
if (!initialStateDataIterator.isEmpty || !initialStateDataIterator.hasNext) {
if (!hasInitialState || initialStateKeyToRowMap.isEmpty) {
sendResponse(1)
} else {
sendResponse(0)
// TODO check if has initial state

outputStream.flush()
val arrowStreamWriter = {
val outputSchema = initialStateSchema
@@ -207,21 +202,22 @@
arrowTransformWithStateInPandasMaxRecordsPerBatch)
}

val keyBytes = message.getGetInitialState.getValue.toByteArray
// The key row is serialized as a byte array, we need to convert it back to a Row
val keyRow = PythonSQLUtils.toJVMRow(keyBytes, groupingKeySchema, keyRowDeserializer)
val groupingKeyToInternalRow =
ExpressionEncoder(groupingKeySchema).createSerializer().apply(keyRow)

throw new Exception(s"I am inside initial state processing, grouping key row" +
s"received: ${keyRow}")

val iter = initialStateKeyToRowMap
.get(groupingKeyToInternalRow).getOrElse(Iterator.empty)

var seenInitStateOnKey = false
while (iter.hasNext) {
if (seenInitStateOnKey) {
throw StateStoreErrors.cannotReInitializeStateOnKey(
keyRowDeserializer.apply(groupingKeyToInternalRow).toString)
} else {
val initialStateRow = iter.next()
seenInitStateOnKey = true
arrowStreamWriter.writeRow(initialStateRow)
}
}
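On the server side, the GETINITIALSTATE handler above looks the incoming grouping key up in initialStateKeyToRowMap, writes the key's initial-state row over the Arrow stream, and errors out if more than one initial-state row exists for the same key. A plain-Python analogue of that bookkeeping, simplified and with invented names, looks roughly like this:

    from collections import defaultdict

    def build_initial_state_map(key_row_pairs):
        # Group initial-state rows by grouping key, one list of rows per key.
        key_to_rows = defaultdict(list)
        for key, row in key_row_pairs:
            key_to_rows[key].append(row)
        return key_to_rows

    def initial_state_for_key(key_to_rows, key):
        # At most one initial-state row may exist per key; more than one is an error.
        rows = key_to_rows.get(key, [])
        if len(rows) > 1:
            raise RuntimeError(f"Cannot re-initialize state on key {key}")
        return rows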
