Skip to content

Commit

Permalink
Start adding SQL examples to illustrate partitioned/non-partitioned joins (
Browse files Browse the repository at this point in the history
…#110)

* Start working on adding SQL examples + thing to run them for CI

Work on script to run the sql examples.

Add missing steps

Add missing runson

* Add partitioned/non-partitioned join examples.

* Fix cache etc.

Fix shellcheck

Fix cache etc.

* Rename run ex

* Use java 17 for style.

* Checkout needed.

* Fix shellcheck

* Try and fix cache of spark DL

* Fix style with run_sql_examples
  • Loading branch information
holdenk authored Aug 29, 2023
1 parent 3c48c54 commit 6442015
Show file tree
Hide file tree
Showing 8 changed files with 143 additions and 1 deletion.
35 changes: 34 additions & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v2
- name: Sync the current branch with the latest in spark-testing-base
- name: Sync the current branch with the latest
if: github.repository != 'high-performance-spark/high-performance-spark-examples'
id: sync-branch
run: |
Expand Down Expand Up @@ -43,3 +43,36 @@ jobs:
- name: Run tox
run: |
cd python; tox
run-sql-examples:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v2
- name: Cache Spark and friends
uses: actions/cache@v3
with:
path: |
spark*.tgz
iceberg*.jar
key: spark-artifacts
- name: Run sql examples
run:
./run_sql_examples.sh
style:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v2
- name: Shellcheck
run: |
sudo apt-get install -y shellcheck
shellcheck $(find -name "*.sh")
- name: Setup JDK
uses: actions/setup-java@v3
with:
distribution: temurin
java-version: 17
cache: sbt
- name: scala
run:
sbt scalastyle
15 changes: 15 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,24 @@ share/python-wheels/
.installed.cfg
*.egg
MANIFEST
# scala stuff
.metals

# native
*.o
*.so
*.so.0.0.0
*.so.0

# Spark files
*.tgz
iceberg-spark-runtime-*.jar
spark-*-bin-hadoop*/

# Warehouse
spark-warehouse/
warehouse/
metastore_db/

# Misc internal stuff
sql/*.sql.out
2 changes: 2 additions & 0 deletions migration/sql.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#!/bin/bash
# Install sqlfluff plus the spark-upgrade sparksql plugin used by the SQL
# migration tooling. Both installs go through `python -m pip` so they are
# guaranteed to target the same interpreter (bare `pip` may resolve to a
# different Python on some runners).

python -m pip install sqlfluff
python -m pip install 'sqlfluff-plugin-sparksql-upgrade @ git+https://github.com/holdenk/spark-upgrade#subdirectory=sql'

Expand Down
52 changes: 52 additions & 0 deletions run_sql_examples.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#!/bin/bash
# Download Spark and the Iceberg runtime jar (if missing), then run every
# sql/*.sql example through spark-sql with an Iceberg-enabled catalog.
# Output is appended to <example>.sql.out; Spark event logs land in
# /tmp/spark-events for inspection with the history server.
#
# pipefail is required: without it, `spark-sql ... | tee` reports tee's exit
# status, so a failing example would not fail CI under plain `set -e`.
set -exo pipefail

# Versions for Spark and the Iceberg runtime jar.
SPARK_MAJOR="3.4"
SPARK_VERSION=3.4.1
HADOOP_VERSION="3"
SPARK_PATH="spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}"
# Derive the tarball name from HADOOP_VERSION (was hard-coded "hadoop3",
# which would silently diverge from SPARK_PATH on a version bump).
SPARK_FILE="spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz"
ICEBERG_VERSION="1.3.1"

# Download Spark if not present (backgrounded so both fetches overlap).
if [ ! -f "${SPARK_FILE}" ]; then
  wget "https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/${SPARK_FILE}" &
fi
# Download Iceberg if not present.
ICEBERG_FILE="iceberg-spark-runtime-${SPARK_MAJOR}_2.13-${ICEBERG_VERSION}.jar"
if [ ! -f "${ICEBERG_FILE}" ]; then
  wget "https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-${SPARK_MAJOR}_2.13/${ICEBERG_VERSION}/${ICEBERG_FILE}" -O "${ICEBERG_FILE}" &
fi
wait

# Set up the env: unpack Spark and drop the Iceberg jar on its classpath.
if [ ! -d "${SPARK_PATH}" ]; then
  tar -xf "${SPARK_FILE}"
fi
if [ ! -f "${SPARK_PATH}/jars/${ICEBERG_FILE}" ]; then
  cp "${ICEBERG_FILE}" "${SPARK_PATH}/jars/${ICEBERG_FILE}"
fi

# Set up for running pyspark and friends
export PATH=${SPARK_PATH}:${SPARK_PATH}/python:${SPARK_PATH}/bin:${SPARK_PATH}/sbin:${PATH}

# Make sure we have a history directory
mkdir -p /tmp/spark-events

# We use `` for mid multi-line command comments. (see https://stackoverflow.com/questions/9522631/how-to-put-a-line-comment-for-a-multi-line-command).
# For each SQL example: run it with the local Iceberg catalogs configured,
# plus any per-example extra flags from an optional <example>.sql.conf file.
for sql_file in sql/*.sql; do
  echo "Processing ${sql_file}"
  # shellcheck disable=SC2046
  spark-sql --master local[5] \
	    --conf spark.eventLog.enabled=true \
	    --conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog \
	    --conf spark.sql.catalog.spark_catalog.type=hive \
	    --conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \
	    --conf spark.sql.catalog.local.type=hadoop \
	    --conf "spark.sql.catalog.local.warehouse=$PWD/warehouse" \
	    $(cat "${sql_file}.conf" || echo "") `# per-example conf is optional` \
	    --name "${sql_file}" \
	    -f "${sql_file}" | tee -a "${sql_file}.out"
done

# If you want to look at them
# ${SPARK_PATH}/sbin/start-history-server.sh
12 changes: 12 additions & 0 deletions sql/nonpartitioned_table_join.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
-- Non-partitioned variant of the join example: neither table declares a
-- partitioning, so the INNER JOIN below requires a full shuffle of both sides
-- (contrast with partioned_table_join.sql).
CREATE TABLE IF NOT EXISTS local.udevelopers (
  username string,
  firstname string,
  lastname string)
USING iceberg;
CREATE TABLE IF NOT EXISTS local.uprojects (
  creator string,
  uprojectname string)
USING iceberg;
-- Single-quoted literals: in ANSI SQL (and Spark's double-quoted-identifier
-- mode) "..." means an identifier, so 'string' quoting is the safe form.
-- Explicit column lists keep the inserts valid if the schemas ever grow.
INSERT INTO local.udevelopers (username, firstname, lastname)
VALUES ('krisnova', 'Kris', 'Nova');
INSERT INTO local.uprojects (creator, uprojectname)
VALUES ('krisnova', 'aurae');
-- SELECT * is deliberate here: the example is about the join plan, not the
-- projection.
SELECT * FROM local.udevelopers INNER JOIN local.uprojects ON local.uprojects.creator = local.udevelopers.username;
7 changes: 7 additions & 0 deletions sql/nonpartitioned_table_join.sql.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
--conf spark.sql.sources.v2.bucketing.enabled=true
--conf spark.sql.iceberg.planning.preserve-data-grouping=true
--conf spark.sql.requireAllClusterKeysForCoPartition=false

--conf spark.sql.adaptive.enabled=false
--conf spark.sql.autoBroadcastJoinThreshold=-1
--conf spark.sql.shuffle.partitions=4
14 changes: 14 additions & 0 deletions sql/partioned_table_join.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
-- Partitioned variant of the join example: both tables are partitioned on the
-- join key, which (with storage-partitioned-join confs, see the matching
-- .sql.conf) lets Spark join them without a shuffle.
CREATE TABLE IF NOT EXISTS local.developers (
  username string,
  firstname string,
  lastname string)
USING iceberg
PARTITIONED BY (username);
CREATE TABLE IF NOT EXISTS local.projects (
  creator string,
  projectname string)
USING iceberg
PARTITIONED BY (creator);
-- Single-quoted literals: in ANSI SQL (and Spark's double-quoted-identifier
-- mode) "..." means an identifier, so 'string' quoting is the safe form.
-- Explicit column lists keep the inserts valid if the schemas ever grow.
INSERT INTO local.developers (username, firstname, lastname)
VALUES ('krisnova', 'Kris', 'Nova');
INSERT INTO local.projects (creator, projectname)
VALUES ('krisnova', 'aurae');
-- SELECT * is deliberate here: the example is about the join plan, not the
-- projection.
SELECT * FROM local.developers INNER JOIN local.projects ON local.projects.creator = local.developers.username;
7 changes: 7 additions & 0 deletions sql/partioned_table_join.sql.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
--conf spark.sql.sources.v2.bucketing.enabled=true
--conf spark.sql.iceberg.planning.preserve-data-grouping=true
--conf spark.sql.requireAllClusterKeysForCoPartition=false

--conf spark.sql.adaptive.enabled=false
--conf spark.sql.autoBroadcastJoinThreshold=-1
--conf spark.sql.shuffle.partitions=4

0 comments on commit 6442015

Please sign in to comment.