-
Notifications
You must be signed in to change notification settings - Fork 234
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Start adding SQL examples to illustrate partioned/non-partioned joins (…
…#110) * Start working on adding SQL examples + thing to run them for CI Work on script to run the sql examples. Add missing steps Add missing runson * Add partitioned/non-partitioned join examples. * Fix cache etc. Fix shellcheck Fix cache etc. * Rename run ex * Use java 17 for style. * Checkout needed. * Fix shellcheck * Try and fix cache of spark DL * Fix style with run_sql_examples
- Loading branch information
Showing
8 changed files
with
143 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
#!/bin/bash | ||
set -ex | ||
|
||
# Download Spark and iceberg if not present | ||
SPARK_MAJOR="3.4" | ||
SPARK_VERSION=3.4.1 | ||
HADOOP_VERSION="3" | ||
SPARK_PATH="spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" | ||
SPARK_FILE="spark-${SPARK_VERSION}-bin-hadoop3.tgz" | ||
ICEBERG_VERSION="1.3.1" | ||
if [ ! -f "${SPARK_FILE}" ]; then | ||
wget "https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/${SPARK_FILE}" & | ||
fi | ||
# Download Icberg if not present | ||
ICEBERG_FILE="iceberg-spark-runtime-${SPARK_MAJOR}_2.13-${ICEBERG_VERSION}.jar" | ||
if [ ! -f "${ICEBERG_FILE}" ]; then | ||
wget "https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-${SPARK_MAJOR}_2.13/${ICEBERG_VERSION}/${ICEBERG_FILE}" -O "${ICEBERG_FILE}" & | ||
fi | ||
wait | ||
# Setup the env | ||
if [ ! -d "${SPARK_PATH}" ]; then | ||
tar -xf ${SPARK_FILE} | ||
fi | ||
if [ ! -f "${SPARK_PATH}/jars/${ICEBERG_FILE}" ]; then | ||
cp "${ICEBERG_FILE}" "${SPARK_PATH}/jars/${ICEBERG_FILE}" | ||
fi | ||
|
||
# Set up for running pyspark and friends | ||
export PATH=${SPARK_PATH}:${SPARK_PATH}/python:${SPARK_PATH}/bin:${SPARK_PATH}/sbin:${PATH} | ||
|
||
# Make sure we have a history directory | ||
mkdir -p /tmp/spark-events | ||
|
||
# We use `` for mid multi-line command comments. (see https://stackoverflow.com/questions/9522631/how-to-put-a-line-comment-for-a-multi-line-command). | ||
# For each SQL | ||
for sql_file in sql/*.sql; do | ||
echo "Processing ${sql_file}" | ||
# shellcheck disable=SC2046 | ||
spark-sql --master local[5] \ | ||
--conf spark.eventLog.enabled=true \ | ||
--conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog \ | ||
--conf spark.sql.catalog.spark_catalog.type=hive \ | ||
--conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \ | ||
--conf spark.sql.catalog.local.type=hadoop \ | ||
--conf "spark.sql.catalog.local.warehouse=$PWD/warehouse" \ | ||
$(cat "${sql_file}.conf" || echo "") \ | ||
--name "${sql_file}" \ | ||
-f "${sql_file}" | tee -a "${sql_file}.out" | ||
done | ||
|
||
# If you want to look at them | ||
# ${SPARK_PATH}/sbin/start-history-server.sh |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
CREATE TABLE IF NOT EXISTS local.udevelopers ( | ||
username string, | ||
firstname string, | ||
lastname string) | ||
USING iceberg; | ||
CREATE TABLE IF NOT EXISTS local.uprojects ( | ||
creator string, | ||
uprojectname string) | ||
USING iceberg; | ||
INSERT INTO local.udevelopers VALUES("krisnova", "Kris", "Nova"); | ||
INSERT INTO local.uprojects VALUES("krisnova", "aurae"); | ||
SELECT * FROM local.udevelopers INNER JOIN local.uprojects ON local.uprojects.creator = local.udevelopers.username; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
--conf spark.sql.sources.v2.bucketing.enabled=true | ||
--conf spark.sql.iceberg.planning.preserve-data-grouping=true | ||
--conf spark.sql.requireAllClusterKeysForCoPartition=false | ||
|
||
--conf spark.sql.adaptive.enabled=false | ||
--conf spark.sql.autoBroadcastJoinThreshold=-1 | ||
--conf spark.sql.shuffle.partitions=4 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
CREATE TABLE IF NOT EXISTS local.developers ( | ||
username string, | ||
firstname string, | ||
lastname string) | ||
USING iceberg | ||
PARTITIONED BY (username); | ||
CREATE TABLE IF NOT EXISTS local.projects ( | ||
creator string, | ||
projectname string) | ||
USING iceberg | ||
PARTITIONED BY (creator); | ||
INSERT INTO local.developers VALUES("krisnova", "Kris", "Nova"); | ||
INSERT INTO local.projects VALUES("krisnova", "aurae"); | ||
SELECT * FROM local.developers INNER JOIN local.projects ON local.projects.creator = local.developers.username; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
--conf spark.sql.sources.v2.bucketing.enabled=true | ||
--conf spark.sql.iceberg.planning.preserve-data-grouping=true | ||
--conf spark.sql.requireAllClusterKeysForCoPartition=false | ||
|
||
--conf spark.sql.adaptive.enabled=false | ||
--conf spark.sql.autoBroadcastJoinThreshold=-1 | ||
--conf spark.sql.shuffle.partitions=4 |