2 changes: 1 addition & 1 deletion .ci/docker/ci_commit_pins/optimum-executorch.txt
@@ -1 +1 @@
467660923a5a25e4718e1d6697b93ff1bab4e807
4361747abfc55e40e929396ed986efe775d745f9
150 changes: 150 additions & 0 deletions .ci/scripts/export_model_cuda_artifact.sh
@@ -0,0 +1,150 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Export a HuggingFace model to CUDA format with optional quantization

show_help() {
cat << EOF
Usage: export_model_cuda_artifact.sh <hf_model> [quant_name] [output_dir]

Export a HuggingFace model to CUDA format with optional quantization.

Arguments:
hf_model HuggingFace model ID (required)
Supported models:
- mistralai/Voxtral-Mini-3B-2507
- openai/whisper-small
- google/gemma-3-4b-it

quant_name Quantization type (optional, default: non-quantized)
Options:
- non-quantized
- quantized-int4-tile-packed
- quantized-int4-weight-only

output_dir Output directory for artifacts (optional, default: current directory)

Examples:
export_model_cuda_artifact.sh "openai/whisper-small"
export_model_cuda_artifact.sh "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed"
export_model_cuda_artifact.sh "google/gemma-3-4b-it" "non-quantized" "./output"
EOF
}

if [ "${1:-}" = "-h" ] || [ "${1:-}" = "--help" ]; then
show_help
exit 0
fi

if [ -z "${1:-}" ]; then
echo "Error: hf_model argument is required"
echo "Run with -h or --help for usage information"
exit 1
fi

set -eux

HF_MODEL="$1"
QUANT_NAME="${2:-non-quantized}"
OUTPUT_DIR="${3:-.}"

# Determine model configuration based on HF model ID
case "$HF_MODEL" in
mistralai/Voxtral-Mini-3B-2507)
MODEL_NAME="voxtral"
TASK="multimodal-text-to-text"
MAX_SEQ_LEN="1024"
EXTRA_PIP="mistral-common librosa"
PREPROCESSOR_FEATURE_SIZE="128"
PREPROCESSOR_OUTPUT="voxtral_preprocessor.pte"
;;
openai/whisper-small)
MODEL_NAME="whisper"
TASK="automatic-speech-recognition"
MAX_SEQ_LEN=""
EXTRA_PIP="librosa"
PREPROCESSOR_FEATURE_SIZE="80"
PREPROCESSOR_OUTPUT="whisper_preprocessor.pte"
;;
google/gemma-3-4b-it)
MODEL_NAME="gemma3"
TASK="multimodal-text-to-text"
MAX_SEQ_LEN="64"
EXTRA_PIP=""
PREPROCESSOR_FEATURE_SIZE=""
PREPROCESSOR_OUTPUT=""
;;
*)
echo "Error: Unsupported model '$HF_MODEL'"
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, openai/whisper-small, google/gemma-3-4b-it"
exit 1
;;
esac

# Determine quantization args based on quant name
case "$QUANT_NAME" in
non-quantized)
EXTRA_ARGS=""
;;
quantized-int4-tile-packed)
EXTRA_ARGS="--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d"
;;
quantized-int4-weight-only)
EXTRA_ARGS="--qlinear_encoder 4w"
;;
*)
echo "Error: Unsupported quantization '$QUANT_NAME'"
echo "Supported quantizations: non-quantized, quantized-int4-tile-packed, quantized-int4-weight-only"
exit 1
;;
esac

echo "::group::Export $MODEL_NAME"

if [ -n "$EXTRA_PIP" ]; then
pip install $EXTRA_PIP
fi
pip list

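# Only pass --max_seq_len when the model configuration above defines one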
MAX_SEQ_LEN_ARG=""
if [ -n "$MAX_SEQ_LEN" ]; then
MAX_SEQ_LEN_ARG="--max_seq_len $MAX_SEQ_LEN"
fi
optimum-cli export executorch \
--model "$HF_MODEL" \
--task "$TASK" \
--recipe "cuda" \
--dtype bfloat16 \
--device cuda \
${MAX_SEQ_LEN_ARG} \
${EXTRA_ARGS} \
--output_dir ./

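# Audio models (Voxtral, Whisper) also export a mel spectrogram preprocessor as a separate .pte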
if [ -n "$PREPROCESSOR_OUTPUT" ]; then
python -m executorch.extension.audio.mel_spectrogram \
--feature_size $PREPROCESSOR_FEATURE_SIZE \
--stack_output \
--max_audio_len 300 \
--output_file $PREPROCESSOR_OUTPUT
fi

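# Sanity-check that the export produced the expected artifacts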
test -f model.pte
test -f aoti_cuda_blob.ptd
if [ -n "$PREPROCESSOR_OUTPUT" ]; then
test -f $PREPROCESSOR_OUTPUT
fi
echo "::endgroup::"

echo "::group::Store $MODEL_NAME Artifacts"
mkdir -p "${OUTPUT_DIR}"
cp model.pte "${OUTPUT_DIR}/"
cp aoti_cuda_blob.ptd "${OUTPUT_DIR}/"
if [ -n "$PREPROCESSOR_OUTPUT" ]; then
cp $PREPROCESSOR_OUTPUT "${OUTPUT_DIR}/"
fi
ls -al "${OUTPUT_DIR}"
echo "::endgroup::"
205 changes: 205 additions & 0 deletions .ci/scripts/test_model_cuda_e2e.sh
@@ -0,0 +1,205 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Test a CUDA model end-to-end. Run .ci/scripts/export_model_cuda_artifact.sh first to produce the model artifacts.

show_help() {
cat << EOF
Usage: test_model_cuda_e2e.sh <hf_model> <quant_name> [model_dir]

Build and run end-to-end tests for CUDA models.

Arguments:
hf_model HuggingFace model ID (required)
Supported models:
- mistralai/Voxtral-Mini-3B-2507
- openai/whisper-small
- google/gemma-3-4b-it

quant_name Quantization type (required)
Options:
- non-quantized
- quantized-int4-tile-packed
- quantized-int4-weight-only

model_dir Directory containing model artifacts (optional, default: current directory)
Expected files: model.pte, aoti_cuda_blob.ptd
Tokenizers and test files will be downloaded to this directory

Examples:
test_model_cuda_e2e.sh "openai/whisper-small" "non-quantized"
test_model_cuda_e2e.sh "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed" "./model_output"
EOF
}

if [ "${1:-}" = "-h" ] || [ "${1:-}" = "--help" ]; then
show_help
exit 0
fi

if [ -z "${1:-}" ]; then
echo "Error: hf_model argument is required"
echo "Run with -h or --help for usage information"
exit 1
fi

if [ -z "${2:-}" ]; then
echo "Error: quant_name argument is required"
echo "Run with -h or --help for usage information"
exit 1
fi

set -eux

HF_MODEL="$1"
QUANT_NAME="$2"
# Download tokenizers, audio, and image files to this directory
MODEL_DIR="${3:-.}"

echo "Testing model: $HF_MODEL (quantization: $QUANT_NAME)"

# Make sure model.pte and aoti_cuda_blob.ptd exist
if [ ! -f "$MODEL_DIR/model.pte" ]; then
echo "Error: model.pte not found in $MODEL_DIR"
exit 1
fi
if [ ! -f "$MODEL_DIR/aoti_cuda_blob.ptd" ]; then
echo "Error: aoti_cuda_blob.ptd not found in $MODEL_DIR"
exit 1
fi
# Locate EXECUTORCH_ROOT from the directory of this script
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
EXECUTORCH_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

pushd "$EXECUTORCH_ROOT"

# Determine model configuration based on HF model ID
case "$HF_MODEL" in
mistralai/Voxtral-Mini-3B-2507)
MODEL_NAME="voxtral"
RUNNER_TARGET="voxtral_runner"
RUNNER_PATH="voxtral"
EXPECTED_OUTPUT="poem"
PREPROCESSOR="voxtral_preprocessor.pte"
TOKENIZER_URL="https://huggingface.co/mistralai/Voxtral-Mini-3B-2507/resolve/main" # @lint-ignore
TOKENIZER_FILE="tekken.json"
AUDIO_URL="https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav"
AUDIO_FILE="poem.wav"
IMAGE_PATH=""
;;
openai/whisper-small)
MODEL_NAME="whisper"
RUNNER_TARGET="whisper_runner"
RUNNER_PATH="whisper"
EXPECTED_OUTPUT="Mr. Quilter"
PREPROCESSOR="whisper_preprocessor.pte"
TOKENIZER_URL="https://huggingface.co/openai/whisper-small/resolve/main" # @lint-ignore
TOKENIZER_FILE=""
AUDIO_URL=""
AUDIO_FILE="output.wav"
IMAGE_PATH=""
;;
google/gemma-3-4b-it)
MODEL_NAME="gemma3"
RUNNER_TARGET="gemma3_e2e_runner"
RUNNER_PATH="gemma3"
EXPECTED_OUTPUT="chip"
PREPROCESSOR=""
TOKENIZER_URL="https://huggingface.co/google/gemma-3-4b-it/resolve/main" # @lint-ignore
TOKENIZER_FILE=""
AUDIO_URL=""
AUDIO_FILE=""
IMAGE_PATH="docs/source/_static/img/et-logo.png"
;;
*)
echo "Error: Unsupported model '$HF_MODEL'"
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, openai/whisper-small, google/gemma-3-4b-it"
exit 1
;;
esac

echo "::group::Setup ExecuTorch Requirements"
./install_requirements.sh
pip list
echo "::endgroup::"

echo "::group::Prepare $MODEL_NAME Artifacts"


# Download tokenizer files
if [ "$TOKENIZER_FILE" != "" ]; then
curl -L $TOKENIZER_URL/$TOKENIZER_FILE -o $MODEL_DIR/$TOKENIZER_FILE
else
curl -L $TOKENIZER_URL/tokenizer.json -o $MODEL_DIR/tokenizer.json
curl -L $TOKENIZER_URL/tokenizer_config.json -o $MODEL_DIR/tokenizer_config.json
curl -L $TOKENIZER_URL/special_tokens_map.json -o $MODEL_DIR/special_tokens_map.json
fi

# Download test files
if [ "$AUDIO_URL" != "" ]; then
curl -L $AUDIO_URL -o ${MODEL_DIR}/$AUDIO_FILE
elif [ "$MODEL_NAME" = "whisper" ]; then
python -c "from datasets import load_dataset;import soundfile as sf;sample = load_dataset('distil-whisper/librispeech_long', 'clean', split='validation')[0]['audio'];sf.write('${MODEL_DIR}/$AUDIO_FILE', sample['array'][:sample['sampling_rate']*30], sample['sampling_rate'])"
fi

ls -al
echo "::endgroup::"

echo "::group::Build $MODEL_NAME Runner"
cmake --preset llm \
-DEXECUTORCH_BUILD_CUDA=ON \
-DCMAKE_INSTALL_PREFIX=cmake-out \
-DCMAKE_BUILD_TYPE=Release \
-Bcmake-out -S.
cmake --build cmake-out -j$(nproc) --target install --config Release

cmake -DEXECUTORCH_BUILD_CUDA=ON \
-DCMAKE_BUILD_TYPE=Release \
-Sexamples/models/$RUNNER_PATH \
-Bcmake-out/examples/models/$RUNNER_PATH/
cmake --build cmake-out/examples/models/$RUNNER_PATH --target $RUNNER_TARGET --config Release
echo "::endgroup::"

echo "::group::Run $MODEL_NAME Runner"
set +e
export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH

# Build runner command with common arguments
RUNNER_BIN="cmake-out/examples/models/$RUNNER_PATH/$RUNNER_TARGET"
RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd --temperature 0"

# Add model-specific arguments
case "$MODEL_NAME" in
voxtral)
RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE --audio_path ${MODEL_DIR}/$AUDIO_FILE --processor_path ${MODEL_DIR}/$PREPROCESSOR"
;;
whisper)
RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/ --audio_path ${MODEL_DIR}/$AUDIO_FILE --processor_path ${MODEL_DIR}/$PREPROCESSOR"
;;
gemma3)
RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/ --image_path $IMAGE_PATH"
;;
esac

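# Capture combined stdout/stderr so the expected string can be checked below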
OUTPUT=$($RUNNER_BIN $RUNNER_ARGS 2>&1)
EXIT_CODE=$?
set -e

if ! echo "$OUTPUT" | grep -iq "$EXPECTED_OUTPUT"; then
echo "Expected output '$EXPECTED_OUTPUT' not found in output"
exit 1
else
echo "Success: '$EXPECTED_OUTPUT' found in output"
fi

if [ $EXIT_CODE -ne 0 ]; then
echo "Unexpected exit code: $EXIT_CODE"
exit $EXIT_CODE
fi
echo "::endgroup::"

popd