Skip to content

Commit

Permalink
Start adding SQL examples to illustrate partitioned/non-partitioned joins (
Browse files Browse the repository at this point in the history
…#110)

* Start working on adding SQL examples + thing to run them for CI

Work on script to run the sql examples.

Add missing steps

Add missing runson

* Add partitioned/non-partitioned join examples.

* Fix cache etc.

Fix shellcheck

Fix cache etc.

* Rename run ex

* Use java 17 for style.

* Checkout needed.

* Fix shellcheck

* Try and fix cache of spark DL

* Fix style with run_sql_examples
  • Loading branch information
holdenk authored Aug 29, 2023
1 parent 3c48c54 commit 6442015
Show file tree
Hide file tree
Showing 8 changed files with 143 additions and 1 deletion.
35 changes: 34 additions & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v2
- name: Sync the current branch with the latest in spark-testing-base
- name: Sync the current branch with the latest
if: github.repository != 'high-performance-spark/high-performance-spark-examples'
id: sync-branch
run: |
Expand Down Expand Up @@ -43,3 +43,36 @@ jobs:
- name: Run tox
run: |
cd python; tox
run-sql-examples:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v2
- name: Cache Spark and friends
uses: actions/cache@v3
with:
path: |
spark*.tgz
iceberg*.jar
key: spark-artifacts
- name: Run sql examples
run:
./run_sql_examples.sh
style:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v2
- name: Shellcheck
run: |
sudo apt-get install -y shellcheck
shellcheck $(find -name "*.sh")
- name: Setup JDK
uses: actions/setup-java@v3
with:
distribution: temurin
java-version: 17
cache: sbt
- name: scala
run:
sbt scalastyle
15 changes: 15 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,24 @@ share/python-wheels/
.installed.cfg
*.egg
MANIFEST
# scala stuff
.metals

# native
*.o
*.so
*.so.0.0.0
*.so.0

# Spark files
*.tgz
iceberg-spark-runtime-*.jar
spark-*-bin-hadoop*/

# Warehouse
spark-warehouse/
warehouse/
metastore_db/

# Misc internal stuff
sql/*.sql.out
2 changes: 2 additions & 0 deletions migration/sql.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#!/bin/bash
# Install sqlfluff plus the spark-upgrade sparksql plugin used by the SQL
# migration tooling. Both installs go through `python -m pip` so they are
# guaranteed to target the same interpreter (bare `pip` may resolve to a
# different Python on some runners).

python -m pip install sqlfluff
python -m pip install 'sqlfluff-plugin-sparksql-upgrade @ git+https://github.com/holdenk/spark-upgrade#subdirectory=sql'

Expand Down
52 changes: 52 additions & 0 deletions run_sql_examples.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#!/bin/bash
# Download Spark and the Iceberg runtime jar (if missing), then run every
# sql/*.sql example through spark-sql with an Iceberg-enabled catalog.
# Output is appended to <example>.sql.out; Spark event logs land in
# /tmp/spark-events for inspection with the history server.
#
# pipefail is required: without it, `spark-sql ... | tee` reports tee's exit
# status, so a failing example would not fail CI under plain `set -e`.
set -exo pipefail

# Versions for Spark and the Iceberg runtime jar.
SPARK_MAJOR="3.4"
SPARK_VERSION=3.4.1
HADOOP_VERSION="3"
SPARK_PATH="spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}"
# Derive the tarball name from HADOOP_VERSION (was hard-coded "hadoop3",
# which would silently diverge from SPARK_PATH on a version bump).
SPARK_FILE="spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz"
ICEBERG_VERSION="1.3.1"

# Download Spark if not present (backgrounded so both fetches overlap).
if [ ! -f "${SPARK_FILE}" ]; then
  wget "https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/${SPARK_FILE}" &
fi
# Download Iceberg if not present.
ICEBERG_FILE="iceberg-spark-runtime-${SPARK_MAJOR}_2.13-${ICEBERG_VERSION}.jar"
if [ ! -f "${ICEBERG_FILE}" ]; then
  wget "https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-${SPARK_MAJOR}_2.13/${ICEBERG_VERSION}/${ICEBERG_FILE}" -O "${ICEBERG_FILE}" &
fi
wait

# Set up the env: unpack Spark and drop the Iceberg jar on its classpath.
if [ ! -d "${SPARK_PATH}" ]; then
  tar -xf "${SPARK_FILE}"
fi
if [ ! -f "${SPARK_PATH}/jars/${ICEBERG_FILE}" ]; then
  cp "${ICEBERG_FILE}" "${SPARK_PATH}/jars/${ICEBERG_FILE}"
fi

# Set up for running pyspark and friends
export PATH=${SPARK_PATH}:${SPARK_PATH}/python:${SPARK_PATH}/bin:${SPARK_PATH}/sbin:${PATH}

# Make sure we have a history directory
mkdir -p /tmp/spark-events

# We use `` for mid multi-line command comments. (see https://stackoverflow.com/questions/9522631/how-to-put-a-line-comment-for-a-multi-line-command).
# For each SQL example: run it with the local Iceberg catalogs configured,
# plus any per-example extra flags from an optional <example>.sql.conf file.
for sql_file in sql/*.sql; do
  echo "Processing ${sql_file}"
  # shellcheck disable=SC2046
  spark-sql --master local[5] \
	    --conf spark.eventLog.enabled=true \
	    --conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog \
	    --conf spark.sql.catalog.spark_catalog.type=hive \
	    --conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \
	    --conf spark.sql.catalog.local.type=hadoop \
	    --conf "spark.sql.catalog.local.warehouse=$PWD/warehouse" \
	    $(cat "${sql_file}.conf" || echo "") `# per-example conf is optional` \
	    --name "${sql_file}" \
	    -f "${sql_file}" | tee -a "${sql_file}.out"
done

# If you want to look at them
# ${SPARK_PATH}/sbin/start-history-server.sh
12 changes: 12 additions & 0 deletions sql/nonpartitioned_table_join.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
-- Non-partitioned variant of the join example: neither table declares a
-- partitioning, so the INNER JOIN below requires a full shuffle of both sides
-- (contrast with partioned_table_join.sql).
CREATE TABLE IF NOT EXISTS local.udevelopers (
  username string,
  firstname string,
  lastname string)
USING iceberg;
CREATE TABLE IF NOT EXISTS local.uprojects (
  creator string,
  uprojectname string)
USING iceberg;
-- Single-quoted literals: in ANSI SQL (and Spark's double-quoted-identifier
-- mode) "..." means an identifier, so 'string' quoting is the safe form.
-- Explicit column lists keep the inserts valid if the schemas ever grow.
INSERT INTO local.udevelopers (username, firstname, lastname)
VALUES ('krisnova', 'Kris', 'Nova');
INSERT INTO local.uprojects (creator, uprojectname)
VALUES ('krisnova', 'aurae');
-- SELECT * is deliberate here: the example is about the join plan, not the
-- projection.
SELECT * FROM local.udevelopers INNER JOIN local.uprojects ON local.uprojects.creator = local.udevelopers.username;
7 changes: 7 additions & 0 deletions sql/nonpartitioned_table_join.sql.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
--conf spark.sql.sources.v2.bucketing.enabled=true
--conf spark.sql.iceberg.planning.preserve-data-grouping=true
--conf spark.sql.requireAllClusterKeysForCoPartition=false

--conf spark.sql.adaptive.enabled=false
--conf spark.sql.autoBroadcastJoinThreshold=-1
--conf spark.sql.shuffle.partitions=4
14 changes: 14 additions & 0 deletions sql/partioned_table_join.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
-- Partitioned variant of the join example: both tables are partitioned on the
-- join key, which (with storage-partitioned-join confs, see the matching
-- .sql.conf) lets Spark join them without a shuffle.
CREATE TABLE IF NOT EXISTS local.developers (
  username string,
  firstname string,
  lastname string)
USING iceberg
PARTITIONED BY (username);
CREATE TABLE IF NOT EXISTS local.projects (
  creator string,
  projectname string)
USING iceberg
PARTITIONED BY (creator);
-- Single-quoted literals: in ANSI SQL (and Spark's double-quoted-identifier
-- mode) "..." means an identifier, so 'string' quoting is the safe form.
-- Explicit column lists keep the inserts valid if the schemas ever grow.
INSERT INTO local.developers (username, firstname, lastname)
VALUES ('krisnova', 'Kris', 'Nova');
INSERT INTO local.projects (creator, projectname)
VALUES ('krisnova', 'aurae');
-- SELECT * is deliberate here: the example is about the join plan, not the
-- projection.
SELECT * FROM local.developers INNER JOIN local.projects ON local.projects.creator = local.developers.username;
7 changes: 7 additions & 0 deletions sql/partioned_table_join.sql.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
--conf spark.sql.sources.v2.bucketing.enabled=true
--conf spark.sql.iceberg.planning.preserve-data-grouping=true
--conf spark.sql.requireAllClusterKeysForCoPartition=false

--conf spark.sql.adaptive.enabled=false
--conf spark.sql.autoBroadcastJoinThreshold=-1
--conf spark.sql.shuffle.partitions=4

0 comments on commit 6442015

Please sign in to comment.