diff --git a/.gitignore b/.gitignore
index 30c8d69e..33b190cc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -80,3 +80,6 @@ scripts/armadactl
 e2e-test.log
 extraJars/*.jar
 scripts/.tmp/
+
+# Jupyter
+example/jupyter/workspace/
diff --git a/README.md b/README.md
index b5a2e469..dc7feef2 100644
--- a/README.md
+++ b/README.md
@@ -151,3 +151,21 @@ The project includes a ready-to-use Spark job to test your setup:
 This job leverages the same configuration parameters (`ARMADA_MASTER`, `ARMADA_QUEUE`, `ARMADA_LOOKOUT_URL`)
 as the `scripts/config.sh` script.
 Use the -h option to see what other options are available.
+
+### Jupyter Notebook
+
+The Docker image includes Jupyter support. Start Jupyter with the example notebooks:
+
+```bash
+./scripts/runJupyter.sh
+```
+
+**Note:** The Docker image must be built with `INCLUDE_PYTHON=true` for Jupyter to work.
+
+This starts a Jupyter notebook server at `http://localhost:8888` (or the port set via `JUPYTER_PORT` in `scripts/config.sh`).
+The example notebooks from `example/jupyter/notebooks` are copied into `example/jupyter/workspace`, which is mounted in the container at `/home/spark/workspace`.
+
+**Configuration:**
+- **Required:** `SPARK_DRIVER_HOST` must be set to an address the Armada executors can reach
+- Optional: override the Jupyter port by setting `JUPYTER_PORT` in `scripts/config.sh`
+- The script uses the same configuration (`ARMADA_MASTER`, `ARMADA_QUEUE`, `SPARK_DRIVER_HOST`, etc.) as the other scripts
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 1bca862b..50b33c76 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -20,10 +20,13 @@ ARG spark_base_image_tag=3.3.3-scala2.12-java11-ubuntu
 FROM ${spark_base_image_prefix}:${spark_base_image_tag}
 
 ARG scala_binary_version=2.13
+ARG spark_version=3.3.3
+ARG include_python=false
 
 COPY target/armada-cluster-manager_${scala_binary_version}-*-all.jar /opt/spark/jars/
 COPY extraFiles /opt/spark/extraFiles
 COPY extraJars/* /opt/spark/jars
+COPY docker/jupyter-entrypoint.sh /opt/spark/bin/jupyter-entrypoint.sh
 
 USER 0
 
@@ -34,5 +37,41 @@ RUN mkdir -p /opt/spark/coreJars && \
 
 ENV SPARK_DIST_CLASSPATH=/opt/spark/coreJars/*
 
+# Install Jupyter, PySpark, and Python dependencies (only if include_python is true)
+RUN if [ "$include_python" = "true" ]; then \
+        apt-get update && \
+        apt-get install -y python3-pip && \
+        pip3 install --no-cache-dir \
+            jupyter \
+            notebook \
+            ipykernel \
+            pyspark==${spark_version} && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/*; \
+    fi
+
+# Prepare the spark user's home directories and make the entrypoint executable
+RUN if [ "$include_python" = "true" ]; then \
+        mkdir -p /home/spark/workspace && \
+        mkdir -p /home/spark/.local/share/jupyter && \
+        mkdir -p /home/spark/.jupyter && \
+        chown -R 185:185 /home/spark/workspace && \
+        chown -R 185:185 /home/spark/.local && \
+        chown -R 185:185 /home/spark/.jupyter; \
+    fi && \
+    chmod +x /opt/spark/bin/jupyter-entrypoint.sh
+
 ARG spark_uid=185
 USER ${spark_uid}
+
+# Register the ipykernel kernelspec for the spark user (only if include_python is true)
+RUN if [ "$include_python" = "true" ]; then \
+        HOME=/home/spark python3 -m ipykernel install --user --name python3 --display-name "Python 3"; \
+    fi
+
+ENV HOME=/home/spark
+ENV SPARK_HOME=/opt/spark
+ENV PYSPARK_PYTHON=python3
+ENV PYSPARK_DRIVER_PYTHON=python3
+ENV PYTHONPATH=${SPARK_HOME}/python:${SPARK_HOME}/python/lib/py4j-*src.zip
+ENV JUPYTER_RUNTIME_DIR=/home/spark/.local/share/jupyter/runtime
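
The Python and Jupyter layers above are all gated on the `include_python` build arg, so images built without it are unchanged. For reference, a sketch of a direct Jupyter-enabled build (the tag and arg values here are illustrative; `scripts/createImage.sh` below passes them for you):

```bash
# Illustrative manual build; the --build-arg names mirror the ARGs declared
# in the Dockerfile above, and the image tag is an assumption.
docker build \
  --build-arg scala_binary_version=2.13 \
  --build-arg spark_version=3.3.3 \
  --build-arg include_python=true \
  -f docker/Dockerfile \
  -t spark:armada \
  .
```

Pinning `pyspark==${spark_version}` keeps the pip-installed PySpark in lock-step with the Spark distribution baked into the base image.
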
diff --git a/docker/jupyter-entrypoint.sh b/docker/jupyter-entrypoint.sh
new file mode 100644
index 00000000..92ffcef6
--- /dev/null
+++ b/docker/jupyter-entrypoint.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+cd /home/spark/workspace
+
+exec jupyter notebook \
+    --ip=0.0.0.0 \
+    --port=8888 \
+    --no-browser \
+    --NotebookApp.token='' \
+    --NotebookApp.password='' \
+    --NotebookApp.notebook_dir=/home/spark/workspace
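
Because the server starts with an empty token and password, it should only ever be exposed on localhost; anyone who can reach the port gets full kernel access. A quick way to confirm the server is up (the `/api` endpoint is part of Jupyter's standard REST API; the port assumes the default mapping):

```bash
# Returns a small JSON document such as {"version": "..."} once the server is ready.
curl -s http://localhost:8888/api
```
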
diff --git a/example/jupyter/notebooks/jupyter_armada_spark.ipynb b/example/jupyter/notebooks/jupyter_armada_spark.ipynb
new file mode 100644
index 00000000..0aa15007
--- /dev/null
+++ b/example/jupyter/notebooks/jupyter_armada_spark.ipynb
@@ -0,0 +1,236 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "introduction",
+   "metadata": {},
+   "source": [
+    "# Armada Spark Example\n",
+    "\n",
+    "This notebook demonstrates how to run Spark jobs on Armada using PySpark in client mode."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "imports",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import glob\n",
+    "import subprocess\n",
+    "import random\n",
+    "from pyspark.sql import SparkSession\n",
+    "from pyspark import SparkConf"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "setup-section",
+   "metadata": {},
+   "source": [
+    "## Setup\n",
+    "\n",
+    "Clean up any existing Spark context and configure the environment."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "stop-existing-context",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "try:\n",
+    "    from pyspark import SparkContext\n",
+    "    if SparkContext._active_spark_context:\n",
+    "        SparkContext._active_spark_context.stop()\n",
+    "except Exception:\n",
+    "    pass"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "config-section",
+   "metadata": {},
+   "source": [
+    "## Configuration\n",
+    "\n",
+    "Set up connection parameters and locate the Armada Spark JAR file."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "configuration",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Configuration\n",
+    "auth_token = os.environ.get('ARMADA_AUTH_TOKEN')\n",
+    "auth_script_path = os.environ.get('ARMADA_AUTH_SCRIPT_PATH')\n",
+    "driver_host = os.environ.get('SPARK_DRIVER_HOST')\n",
+    "driver_port = os.environ.get('SPARK_DRIVER_PORT', '7078')\n",
+    "block_manager_port = os.environ.get('SPARK_BLOCK_MANAGER_PORT', '10061')\n",
+    "armada_master = os.environ.get('ARMADA_MASTER', 'local://armada://host.docker.internal:30002')\n",
+    "armada_queue = os.environ.get('ARMADA_QUEUE', 'default')\n",
+    "armada_namespace = os.environ.get('ARMADA_NAMESPACE', 'default')\n",
+    "image_name = os.environ.get('IMAGE_NAME', 'spark:armada')\n",
+    "event_watcher_use_tls = os.environ.get('ARMADA_EVENT_WATCHER_USE_TLS', 'false')\n",
+    "\n",
+    "# Find JAR - try common Scala versions (2.12, 2.13)\n",
+    "jar_paths = glob.glob('/opt/spark/jars/armada-cluster-manager_2.1*-*-all.jar')\n",
+    "if not jar_paths:\n",
+    "    raise FileNotFoundError(\"Armada Spark JAR not found!\")\n",
+    "armada_jar = jar_paths[0]\n",
+    "\n",
+    "# Generate a unique app ID (required in client mode)\n",
+    "app_id = f\"jupyter-spark-{subprocess.check_output(['openssl', 'rand', '-hex', '3']).decode().strip()}\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "spark-config-section",
+   "metadata": {},
+   "source": [
+    "## Spark Configuration\n",
+    "\n",
+    "Configure Spark to use Armada as the cluster manager in client mode."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "spark-config",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Spark Configuration\n",
+    "conf = SparkConf()\n",
+    "if auth_token:\n",
+    "    conf.set(\"spark.armada.auth.token\", auth_token)\n",
+    "if auth_script_path:\n",
+    "    conf.set(\"spark.armada.auth.script.path\", auth_script_path)\n",
+    "if not driver_host:\n",
+    "    raise ValueError(\n",
+    "        \"SPARK_DRIVER_HOST environment variable is required (set it to an address the executors can reach).\"\n",
+    "    )\n",
+    "conf.set(\"spark.master\", armada_master)\n",
+    "conf.set(\"spark.submit.deployMode\", \"client\")\n",
+    "conf.set(\"spark.app.id\", app_id)\n",
+    "conf.set(\"spark.app.name\", \"jupyter-spark-pi\")\n",
+    "conf.set(\"spark.driver.bindAddress\", \"0.0.0.0\")\n",
+    "conf.set(\"spark.driver.host\", driver_host)\n",
+    "conf.set(\"spark.driver.port\", driver_port)\n",
+    "conf.set(\"spark.driver.blockManager.port\", block_manager_port)\n",
+    "conf.set(\"spark.home\", \"/opt/spark\")\n",
+    "conf.set(\"spark.armada.container.image\", image_name)\n",
+    "conf.set(\"spark.armada.queue\", armada_queue)\n",
+    "conf.set(\"spark.armada.scheduling.namespace\", armada_namespace)\n",
+    "conf.set(\"spark.armada.eventWatcher.useTls\", event_watcher_use_tls)\n",
+    "conf.set(\"spark.kubernetes.file.upload.path\", \"/tmp\")\n",
+    "conf.set(\"spark.kubernetes.executor.disableConfigMap\", \"true\")\n",
+    "conf.set(\"spark.local.dir\", \"/tmp\")\n",
+    "conf.set(\"spark.jars\", armada_jar)\n",
+    "\n",
+    "# Network timeouts\n",
+    "conf.set(\"spark.network.timeout\", \"800s\")\n",
+    "conf.set(\"spark.executor.heartbeatInterval\", \"60s\")\n",
+    "\n",
+    "# Static allocation - tune these values for your environment\n",
+    "conf.set(\"spark.executor.instances\", \"2\")\n",
+    "conf.set(\"spark.armada.driver.limit.memory\", \"1Gi\")\n",
+    "conf.set(\"spark.armada.driver.request.memory\", \"1Gi\")\n",
+    "conf.set(\"spark.armada.executor.limit.memory\", \"1Gi\")\n",
+    "conf.set(\"spark.armada.executor.request.memory\", \"1Gi\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "create-spark-session",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create SparkSession\n",
+    "spark = SparkSession.builder.config(conf=conf).getOrCreate()\n",
+    "print(\"SparkSession created\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "examples-section",
+   "metadata": {},
+   "source": [
+    "## Examples\n",
+    "\n",
+    "Run Spark computations on the Armada cluster."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "spark-pi-calculation",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Spark Pi calculation\n",
+    "print(\"Running Spark Pi calculation...\")\n",
+    "n = 10000\n",
+    "\n",
+    "def inside(p):\n",
+    "    x, y = random.random(), random.random()\n",
+    "    return x*x + y*y < 1\n",
+    "\n",
+    "count = spark.sparkContext.parallelize(range(0, n)).filter(inside).count()\n",
+    "pi = 4.0 * count / n\n",
+    "print(f\"  Pi is approximately: {pi}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cleanup-section",
+   "metadata": {},
+   "source": [
+    "## Cleanup\n",
+    "\n",
+    "Stop the Spark context to release resources. This also stops the executors in Armada."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "stop-spark-context",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Stop Spark context\n",
+    "print(\"Stopping Spark context...\")\n",
+    "spark.stop()\n",
+    "print(\"Spark context stopped successfully\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
\ No newline at end of file
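
The notebook fails fast when `SPARK_DRIVER_HOST` is unset because, in client mode, the executors that Armada launches must dial back to the driver running inside the Jupyter container. A sketch of exporting a value before running `scripts/runJupyter.sh` (both options are assumptions about your network layout; pick whichever address is routable from the executors):

```bash
# Option 1: the host's primary IP address (Linux).
export SPARK_DRIVER_HOST="$(hostname -I | awk '{print $1}')"

# Option 2: Docker Desktop's host alias, e.g. when Armada runs in kind.
# export SPARK_DRIVER_HOST=host.docker.internal
```
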
diff --git a/scripts/createImage.sh b/scripts/createImage.sh
index 8f386145..635d2927 100755
--- a/scripts/createImage.sh
+++ b/scripts/createImage.sh
@@ -59,6 +59,7 @@ docker build \
   --build-arg spark_base_image_prefix=$image_prefix \
   --build-arg spark_base_image_tag=$image_tag \
   --build-arg scala_binary_version=$SCALA_BIN_VERSION \
+  --build-arg spark_version=$SPARK_VERSION \
   --build-arg include_python=$INCLUDE_PYTHON \
   -f "$root/docker/Dockerfile" \
   "$root"
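
The scripts read their settings from `scripts/config.sh`. A hypothetical fragment covering the Jupyter-related variables used in this change (the variable names come from the scripts here; the values are examples only):

```bash
# scripts/config.sh (illustrative values)
export INCLUDE_PYTHON=true             # bake Python/Jupyter into the image
export JUPYTER_PORT=8888               # host port for the notebook server
export SPARK_DRIVER_HOST=192.168.1.10  # must be reachable from the executors
```
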
diff --git a/scripts/runJupyter.sh b/scripts/runJupyter.sh
new file mode 100755
index 00000000..d3a27e3b
--- /dev/null
+++ b/scripts/runJupyter.sh
@@ -0,0 +1,94 @@
+#!/bin/bash
+set -euo pipefail
+
+# init environment variables
+scripts="$(cd "$(dirname "$0")"; pwd)"
+source "$scripts/init.sh"
+
+# Jupyter-specific defaults
+JUPYTER_PORT="${JUPYTER_PORT:-8888}"
+SPARK_BLOCK_MANAGER_PORT="${SPARK_BLOCK_MANAGER_PORT:-10061}"
+SPARK_DRIVER_PORT="${SPARK_DRIVER_PORT:-7078}"
+
+# SPARK_DRIVER_HOST is required - must be reachable from Kubernetes executors
+if [ -z "${SPARK_DRIVER_HOST:-}" ]; then
+    echo "Error: SPARK_DRIVER_HOST must be set."
+    echo "It must be an address the Armada executors can use to reach this machine."
+    exit 1
+fi
+
+if [ "${USE_KIND}" == "true" ]; then
+    # Ensure queue exists on Armada
+    if ! armadactl get queue "$ARMADA_QUEUE" >& /dev/null; then
+        armadactl create queue "$ARMADA_QUEUE"
+    fi
+
+    # needed by kind load docker-image (if docker is installed via snap)
+    # https://github.com/kubernetes-sigs/kind/issues/2535
+    export TMPDIR="$scripts/.tmp"
+    mkdir -p "$TMPDIR"
+    kind load docker-image "$IMAGE_NAME" --name armada
+fi
+
+# Setup workspace directory
+root="$(cd "$scripts/.."; pwd)"
+notebooks_dir="$root/example/jupyter/notebooks"
+workspace_dir="$root/example/jupyter/workspace"
+
+# Create workspace directory if it doesn't exist
+mkdir -p "$workspace_dir"
+
+# Copy example notebooks to workspace only if they don't already exist
+if [ -d "$notebooks_dir" ]; then
+    for notebook in "$notebooks_dir"/*.ipynb; do
+        [ -f "$notebook" ] || break
+        notebook_name=$(basename "$notebook")
+        if [ ! -f "$workspace_dir/$notebook_name" ]; then
+            echo "Copying $notebook_name to workspace..."
+            cp "$notebook" "$workspace_dir/"
+        fi
+    done
+fi
+
+# Remove existing container if it exists
+if docker ps -a --format '{{.Names}}' | grep -q "^armada-jupyter$"; then
+    echo "Removing existing armada-jupyter container..."
+    docker rm -f armada-jupyter >/dev/null 2>&1 || true
+fi
+
+# Run Jupyter container
+docker run -d \
+    --name armada-jupyter \
+    -p ${JUPYTER_PORT}:8888 \
+    -p ${SPARK_BLOCK_MANAGER_PORT}:${SPARK_BLOCK_MANAGER_PORT} \
+    -p ${SPARK_DRIVER_PORT}:${SPARK_DRIVER_PORT} \
+    -e SPARK_DRIVER_HOST=${SPARK_DRIVER_HOST} \
+    -e SPARK_DRIVER_PORT=${SPARK_DRIVER_PORT} \
+    -e SPARK_BLOCK_MANAGER_PORT=${SPARK_BLOCK_MANAGER_PORT} \
+    -e ARMADA_MASTER=${ARMADA_MASTER} \
+    -e ARMADA_QUEUE=${ARMADA_QUEUE} \
+    -e ARMADA_NAMESPACE=${ARMADA_NAMESPACE:-default} \
+    -e IMAGE_NAME=${IMAGE_NAME} \
+    -e ARMADA_AUTH_TOKEN=${ARMADA_AUTH_TOKEN:-} \
+    -e ARMADA_AUTH_SCRIPT_PATH=${ARMADA_AUTH_SCRIPT_PATH:-} \
+    -e ARMADA_EVENT_WATCHER_USE_TLS=${ARMADA_EVENT_WATCHER_USE_TLS:-false} \
+    -v "$workspace_dir:/home/spark/workspace" \
+    -v "$root/conf:/opt/spark/conf:ro" \
+    --rm \
+    ${IMAGE_NAME} \
+    /opt/spark/bin/jupyter-entrypoint.sh
+
+# Wait for the Jupyter server to become reachable
+for i in {1..10}; do
+    if curl -s -f -o /dev/null "http://localhost:${JUPYTER_PORT}" 2>/dev/null; then
+        echo "Jupyter notebook is running at http://localhost:${JUPYTER_PORT}"
+        echo "Workspace is available in the container at /home/spark/workspace"
+        echo "Notebooks are persisted in $workspace_dir"
+        exit 0
+    fi
+    sleep 1
+done
+
+echo "Error: Jupyter server is not reachable. The container may have exited."
+echo "This likely means Python/Jupyter is not installed in the image (INCLUDE_PYTHON=false)."
+exit 1
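
Once the script prints the server URL, the container can be observed and stopped with plain Docker commands (the container name is fixed by the script above):

```bash
# Follow the server logs, e.g. to watch a kernel start when a notebook runs.
docker logs -f armada-jupyter

# Stop the server; the --rm flag in the run command removes the container on exit.
docker stop armada-jupyter
```
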