
Commit

Merge remote-tracking branch 'apache/master' into size_in_bytes_api
SemyonSinchenko committed Aug 20, 2024
2 parents 0f2b91b + ba208b9 commit 664edc4
Showing 534 changed files with 31,058 additions and 13,394 deletions.
10 changes: 8 additions & 2 deletions .github/workflows/build_and_test.yml
@@ -867,20 +867,26 @@ jobs:
# - docs/README.md
gem install bundler -v 2.4.22
cd docs
bundle install
bundle install --retry=100
- name: Run documentation build
run: |
# We need this link to make sure `python3` points to `python3.9` which contains the prerequisite packages.
ln -s "$(which python3.9)" "/usr/local/bin/python3"
# Build docs first with SKIP_API to ensure they are buildable without requiring any
# language docs to be built beforehand.
cd docs; SKIP_API=1 bundle exec jekyll build; cd ..
cd docs; SKIP_ERRORDOC=1 SKIP_API=1 bundle exec jekyll build; cd ..
if [ -f "./dev/is-changed.py" ]; then
# Skip PySpark and SparkR docs while keeping Scala/Java/SQL docs
pyspark_modules=`cd dev && python3.9 -c "import sparktestsupport.modules as m; print(','.join(m.name for m in m.all_modules if m.name.startswith('pyspark')))"`
if [ `./dev/is-changed.py -m $pyspark_modules` = false ]; then export SKIP_PYTHONDOC=1; fi
if [ `./dev/is-changed.py -m sparkr` = false ]; then export SKIP_RDOC=1; fi
fi
# Print the values of environment variables `SKIP_ERRORDOC`, `SKIP_SCALADOC`, `SKIP_PYTHONDOC`, `SKIP_RDOC` and `SKIP_SQLDOC`
echo "SKIP_ERRORDOC: $SKIP_ERRORDOC"
echo "SKIP_SCALADOC: $SKIP_SCALADOC"
echo "SKIP_PYTHONDOC: $SKIP_PYTHONDOC"
echo "SKIP_RDOC: $SKIP_RDOC"
echo "SKIP_SQLDOC: $SKIP_SQLDOC"
cd docs
bundle exec jekyll build
- name: Tar documentation
5 changes: 0 additions & 5 deletions .github/workflows/build_maven.yml
@@ -30,8 +30,3 @@ jobs:
name: Run
uses: ./.github/workflows/maven_test.yml
if: github.repository == 'apache/spark'
with:
envs: >-
{
"SKIP_SPARK_RELEASE_VERSIONS": "3.4.2"
}
12 changes: 5 additions & 7 deletions LICENSE-binary
@@ -422,6 +422,11 @@ Python Software Foundation License
python/pyspark/loose_version.py


BSD 0-Clause
------------
org.tukaani:xz


BSD 2-Clause
------------
com.github.luben:zstd-jni
@@ -508,7 +513,6 @@ Eclipse Distribution License (EDL) 1.0
com.sun.istack:istack-commons-runtime
jakarta.xml.bind:jakarta.xml.bind-api
org.glassfish.jaxb:jaxb-runtime
org.glassfish.jaxb:txw2

Eclipse Public License (EPL) 2.0
--------------------------------
@@ -521,12 +525,6 @@ org.glassfish.hk2:hk2-locator
org.glassfish.hk2:hk2-utils
org.glassfish.hk2:osgi-resource-locator


Public Domain
-------------
org.tukaani:xz


Creative Commons CC0 1.0 Universal Public Domain Dedication
-----------------------------------------------------------
(see LICENSE-CC0.txt)
75 changes: 30 additions & 45 deletions R/pkg/R/functions.R
@@ -3965,19 +3965,11 @@ setMethod("row_number",
#' yields unresolved \code{a.b.c}
#' @return Column object wrapping JVM UnresolvedNamedLambdaVariable
#' @keywords internal
unresolved_named_lambda_var <- function(...) {
jc <- newJObject(
"org.apache.spark.sql.Column",
newJObject(
"org.apache.spark.sql.catalyst.expressions.UnresolvedNamedLambdaVariable",
lapply(list(...), function(x) {
handledCallJStatic(
"org.apache.spark.sql.catalyst.expressions.UnresolvedNamedLambdaVariable",
"freshVarName",
x)
})
)
)
unresolved_named_lambda_var <- function(name) {
jc <- handledCallJStatic(
"org.apache.spark.sql.api.python.PythonSQLUtils",
"unresolvedNamedLambdaVariable",
name)
column(jc)
}

@@ -3990,7 +3982,6 @@ unresolved_named_lambda_var <- function(...) {
#' @return JVM \code{LambdaFunction} object
#' @keywords internal
create_lambda <- function(fun) {
as_jexpr <- function(x) callJMethod(x@jc, "expr")

# Process function arguments
parameters <- formals(fun)
@@ -4011,22 +4002,18 @@ create_lambda <- function(fun) {
stopifnot(class(result) == "Column")

# Convert both Columns to Scala expressions
jexpr <- as_jexpr(result)

jargs <- handledCallJStatic(
"org.apache.spark.api.python.PythonUtils",
"toSeq",
handledCallJStatic(
"java.util.Arrays", "asList", lapply(args, as_jexpr)
)
handledCallJStatic("java.util.Arrays", "asList", lapply(args, function(x) { x@jc }))
)

# Create Scala LambdaFunction
newJObject(
"org.apache.spark.sql.catalyst.expressions.LambdaFunction",
jexpr,
jargs,
FALSE
handledCallJStatic(
"org.apache.spark.sql.api.python.PythonSQLUtils",
"lambdaFunction",
result@jc,
jargs
)
}

@@ -4039,20 +4026,18 @@ create_lambda <- function(fun) {
#' @return a \code{Column} representing name applied to cols with funs
#' @keywords internal
invoke_higher_order_function <- function(name, cols, funs) {
as_jexpr <- function(x) {
as_col <- function(x) {
if (class(x) == "character") {
x <- column(x)
}
callJMethod(x@jc, "expr")
x@jc
}

jexpr <- do.call(newJObject, c(
paste("org.apache.spark.sql.catalyst.expressions", name, sep = "."),
lapply(cols, as_jexpr),
lapply(funs, create_lambda)
))

column(newJObject("org.apache.spark.sql.Column", jexpr))
jcol <- handledCallJStatic(
"org.apache.spark.sql.api.python.PythonSQLUtils",
"fn",
name,
c(lapply(cols, as_col), lapply(funs, create_lambda))) # check varargs invocation
column(jcol)
}

#' @details
@@ -4068,7 +4053,7 @@ setMethod("array_aggregate",
signature(x = "characterOrColumn", initialValue = "Column", merge = "function"),
function(x, initialValue, merge, finish = NULL) {
invoke_higher_order_function(
"ArrayAggregate",
"aggregate",
cols = list(x, initialValue),
funs = if (is.null(finish)) {
list(merge)
@@ -4129,7 +4114,7 @@ setMethod("array_exists",
signature(x = "characterOrColumn", f = "function"),
function(x, f) {
invoke_higher_order_function(
"ArrayExists",
"exists",
cols = list(x),
funs = list(f)
)
@@ -4145,7 +4130,7 @@ setMethod("array_filter",
signature(x = "characterOrColumn", f = "function"),
function(x, f) {
invoke_higher_order_function(
"ArrayFilter",
"filter",
cols = list(x),
funs = list(f)
)
@@ -4161,7 +4146,7 @@ setMethod("array_forall",
signature(x = "characterOrColumn", f = "function"),
function(x, f) {
invoke_higher_order_function(
"ArrayForAll",
"forall",
cols = list(x),
funs = list(f)
)
@@ -4291,7 +4276,7 @@ setMethod("array_sort",
column(callJStatic("org.apache.spark.sql.functions", "array_sort", x@jc))
} else {
invoke_higher_order_function(
"ArraySort",
"array_sort",
cols = list(x),
funs = list(comparator)
)
@@ -4309,7 +4294,7 @@ setMethod("array_transform",
signature(x = "characterOrColumn", f = "function"),
function(x, f) {
invoke_higher_order_function(
"ArrayTransform",
"transform",
cols = list(x),
funs = list(f)
)
@@ -4374,7 +4359,7 @@ setMethod("arrays_zip_with",
signature(x = "characterOrColumn", y = "characterOrColumn", f = "function"),
function(x, y, f) {
invoke_higher_order_function(
"ZipWith",
"zip_with",
cols = list(x, y),
funs = list(f)
)
@@ -4447,7 +4432,7 @@ setMethod("map_filter",
signature(x = "characterOrColumn", f = "function"),
function(x, f) {
invoke_higher_order_function(
"MapFilter",
"map_filter",
cols = list(x),
funs = list(f))
})
@@ -4504,7 +4489,7 @@ setMethod("transform_keys",
signature(x = "characterOrColumn", f = "function"),
function(x, f) {
invoke_higher_order_function(
"TransformKeys",
"transform_keys",
cols = list(x),
funs = list(f)
)
@@ -4521,7 +4506,7 @@ setMethod("transform_values",
signature(x = "characterOrColumn", f = "function"),
function(x, f) {
invoke_higher_order_function(
"TransformValues",
"transform_values",
cols = list(x),
funs = list(f)
)
@@ -4552,7 +4537,7 @@ setMethod("map_zip_with",
signature(x = "characterOrColumn", y = "characterOrColumn", f = "function"),
function(x, y, f) {
invoke_higher_order_function(
"MapZipWith",
"map_zip_with",
cols = list(x, y),
funs = list(f)
)
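The SparkR refactor above stops building Catalyst expression objects in R and instead routes higher-order functions by name through org.apache.spark.sql.api.python.PythonSQLUtils. A minimal sketch of how these internal helpers are exercised through the public SparkR API (assuming a running SparkSession with the SparkR package attached; the DataFrame and column names are illustrative):

```r
library(SparkR)
sparkR.session()

# Build a small DataFrame with an array column.
df <- createDataFrame(data.frame(id = 1L))
df <- withColumn(df, "xs", create_array(lit(1L), lit(2L), lit(3L)))

# array_transform() goes through invoke_higher_order_function("transform", ...),
# which now resolves the function by name via PythonSQLUtils.
head(select(df, array_transform("xs", function(x) x + 1L)))

# arrays_zip_with() follows the same path with two input columns.
df <- withColumn(df, "ys", create_array(lit(10L), lit(20L), lit(30L)))
head(select(df, arrays_zip_with(df$xs, df$ys, function(x, y) x + y)))
```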
3 changes: 1 addition & 2 deletions R/pkg/tests/fulltests/test_streaming.R
@@ -147,8 +147,7 @@ test_that("Unsupported operation", {
# memory sink without aggregation
df <- read.stream("json", path = jsonDir, schema = schema, maxFilesPerTrigger = 1)
expect_error(write.stream(df, "memory", queryName = "people", outputMode = "complete"),
paste0(".*(start : analysis error - Complete output mode not supported when there ",
"are no streaming aggregations on streaming DataFrames/Datasets).*"))
".*analysis error.*complete.*not supported.*no streaming aggregations*")
})

test_that("Terminated by error", {
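The relaxed pattern above works because expect_error() treats its second argument as a regular expression, so a loose pattern with ".*" wildcards survives rewording of the engine's error message. A small self-contained sketch of that matching behavior (the error text here is made up for illustration, not the actual Spark message):

```r
library(testthat)

# expect_error() matches the condition message against a regular expression,
# so the test only pins down the fragments that matter.
expect_error(
  stop("start : analysis error - complete output mode not supported when there are no streaming aggregations"),
  ".*analysis error.*complete.*not supported.*no streaming aggregations.*"
)
```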
48 changes: 48 additions & 0 deletions assembly/pom.xml
@@ -109,6 +109,16 @@
<version>${project.version}</version>
<scope>provided</scope>
</dependency>
<!--
Included so the Spark Connect client is compiled before triggering assembly.
See 'get-connect-client-jar' below. This will not be included in the packaging output.
-->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-connect-client-jvm_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<scope>provided</scope>
</dependency>

<!--
Because we don't shade dependencies anymore, we need to restore Guava to compile scope so
@@ -159,6 +169,44 @@
</target>
</configuration>
</plugin>
<plugin>
<!--
Copy Spark Connect client jar and its dependencies for Spark Connect REPL.
-->
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<executions>
<execution>
<id>copy-connect-client-repl-jars</id>
<phase>package</phase>
<goals>
<goal>exec</goal>
</goals>
<configuration>
<executable>cp</executable>
<arguments>
<argument>-r</argument>
<argument>${basedir}/../connector/connect/client/jvm/target/connect-repl</argument>
<argument>${basedir}/target/scala-${scala.binary.version}/jars/</argument>
</arguments>
</configuration>
</execution>
<execution>
<id>copy-connect-client-jar</id>
<phase>package</phase>
<goals>
<goal>exec</goal>
</goals>
<configuration>
<executable>cp</executable>
<arguments>
<argument>${basedir}/../connector/connect/client/jvm/target/spark-connect-client-jvm_${scala.binary.version}-${version}.jar</argument>
<argument>${basedir}/target/scala-${scala.binary.version}/jars/connect-repl</argument>
</arguments>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>

5 changes: 4 additions & 1 deletion bin/spark-shell
@@ -34,7 +34,7 @@ fi

export _SPARK_CMD_USAGE="Usage: ./bin/spark-shell [options]
Scala REPL options:
Scala REPL options, Spark Classic only:
-I <file> preload <file>, enforcing line-by-line interpretation"

# SPARK-4161: scala does not assume use of the java classpath,
@@ -45,6 +45,9 @@ Scala REPL options:
SPARK_SUBMIT_OPTS="$SPARK_SUBMIT_OPTS -Dscala.usejavacp=true"

function main() {
export SPARK_SCALA_SHELL=1
# In case of Spark Connect shell, the main class (and resource) is replaced in
# SparkSubmitCommandBuilder.
if $cygwin; then
# Workaround for issue involving JLine and Cygwin
# (see http://sourceforge.net/p/jline/bugs/40/).
4 changes: 4 additions & 0 deletions build/mvn
@@ -115,6 +115,10 @@ function version { echo "$@" | awk -F. '{ printf("%03d%03d%03d\n", $1,$2,$3); }'
# install maven under the build/ folder if needed.
install_mvn() {
local MVN_VERSION=`grep "<maven.version>" "${_DIR}/../pom.xml" | head -n1 | awk -F '[<>]' '{print $3}'`
MVN_BIN="${_DIR}/apache-maven-${MVN_VERSION}/bin/mvn"
if [ -f "$MVN_BIN" ]; then
return
fi
MVN_BIN="$(command -v mvn)"
if [ "$MVN_BIN" ]; then
local MVN_DETECTED_VERSION="$(mvn --version | head -n1 | awk '{print $3}')"