2 changes: 1 addition & 1 deletion .ci/docker/ci_commit_pins/optimum-executorch.txt
@@ -1 +1 @@
467660923a5a25e4718e1d6697b93ff1bab4e807
4361747abfc55e40e929396ed986efe775d745f9
150 changes: 150 additions & 0 deletions .ci/scripts/export_model_cuda_artifact.sh
@@ -0,0 +1,150 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Export a HuggingFace model to CUDA format with optional quantization

show_help() {
cat << EOF
Usage: export_model_cuda_artifact.sh <hf_model> [quant_name] [output_dir]

Export a HuggingFace model to CUDA format with optional quantization.

Arguments:
hf_model HuggingFace model ID (required)
Supported models:
- mistralai/Voxtral-Mini-3B-2507
- openai/whisper-small
- google/gemma-3-4b-it

quant_name Quantization type (optional, default: non-quantized)
Options:
- non-quantized
- quantized-int4-tile-packed
- quantized-int4-weight-only

output_dir Output directory for artifacts (optional, default: current directory)

Examples:
export_model_cuda_artifact.sh "openai/whisper-small"
export_model_cuda_artifact.sh "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed"
export_model_cuda_artifact.sh "google/gemma-3-4b-it" "non-quantized" "./output"
EOF
}

if [ "${1:-}" = "-h" ] || [ "${1:-}" = "--help" ]; then
show_help
exit 0
fi

if [ -z "${1:-}" ]; then
echo "Error: hf_model argument is required"
echo "Run with -h or --help for usage information"
exit 1
fi

set -eux

HF_MODEL="$1"
QUANT_NAME="${2:-non-quantized}"
OUTPUT_DIR="${3:-.}"

# Determine model configuration based on HF model ID
case "$HF_MODEL" in
mistralai/Voxtral-Mini-3B-2507)
MODEL_NAME="voxtral"
TASK="multimodal-text-to-text"
MAX_SEQ_LEN="1024"
EXTRA_PIP="mistral-common librosa"
PREPROCESSOR_FEATURE_SIZE="128"
PREPROCESSOR_OUTPUT="voxtral_preprocessor.pte"
;;
openai/whisper-small)
MODEL_NAME="whisper"
TASK="automatic-speech-recognition"
MAX_SEQ_LEN=""
EXTRA_PIP="librosa"
PREPROCESSOR_FEATURE_SIZE="80"
PREPROCESSOR_OUTPUT="whisper_preprocessor.pte"
;;
google/gemma-3-4b-it)
MODEL_NAME="gemma3"
TASK="multimodal-text-to-text"
MAX_SEQ_LEN="64"
EXTRA_PIP=""
PREPROCESSOR_FEATURE_SIZE=""
PREPROCESSOR_OUTPUT=""
;;
*)
echo "Error: Unsupported model '$HF_MODEL'"
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, openai/whisper-small, google/gemma-3-4b-it"
exit 1
;;
esac

# Determine quantization args based on quant name
case "$QUANT_NAME" in
non-quantized)
EXTRA_ARGS=""
;;
quantized-int4-tile-packed)
EXTRA_ARGS="--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d"
;;
quantized-int4-weight-only)
EXTRA_ARGS="--qlinear_encoder 4w"
;;
*)
echo "Error: Unsupported quantization '$QUANT_NAME'"
echo "Supported quantizations: non-quantized, quantized-int4-tile-packed, quantized-int4-weight-only"
exit 1
;;
esac

echo "::group::Export $MODEL_NAME"

if [ -n "$EXTRA_PIP" ]; then
pip install $EXTRA_PIP
fi
pip list

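# Only pass --max_seq_len when the model configuration above defines one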
MAX_SEQ_LEN_ARG=""
if [ -n "$MAX_SEQ_LEN" ]; then
MAX_SEQ_LEN_ARG="--max_seq_len $MAX_SEQ_LEN"
fi
optimum-cli export executorch \
--model "$HF_MODEL" \
--task "$TASK" \
--recipe "cuda" \
--dtype bfloat16 \
--device cuda \
${MAX_SEQ_LEN_ARG} \
${EXTRA_ARGS} \
--output_dir ./

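# Audio models (Voxtral, Whisper) also export a mel spectrogram preprocessor as a separate .pte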
if [ -n "$PREPROCESSOR_OUTPUT" ]; then
python -m executorch.extension.audio.mel_spectrogram \
--feature_size $PREPROCESSOR_FEATURE_SIZE \
--stack_output \
--max_audio_len 300 \
--output_file $PREPROCESSOR_OUTPUT
fi

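# Sanity-check that the export produced the expected artifacts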
test -f model.pte
test -f aoti_cuda_blob.ptd
if [ -n "$PREPROCESSOR_OUTPUT" ]; then
test -f $PREPROCESSOR_OUTPUT
fi
echo "::endgroup::"

echo "::group::Store $MODEL_NAME Artifacts"
mkdir -p "${OUTPUT_DIR}"
cp model.pte "${OUTPUT_DIR}/"
cp aoti_cuda_blob.ptd "${OUTPUT_DIR}/"
if [ -n "$PREPROCESSOR_OUTPUT" ]; then
cp $PREPROCESSOR_OUTPUT "${OUTPUT_DIR}/"
fi
ls -al "${OUTPUT_DIR}"
echo "::endgroup::"
205 changes: 205 additions & 0 deletions .ci/scripts/test_model_cuda_e2e.sh
@@ -0,0 +1,205 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Test a CUDA model end-to-end. Run .ci/scripts/export_model_cuda_artifact.sh first to produce the model artifacts.

show_help() {
cat << EOF
Usage: test_model_cuda_e2e.sh <hf_model> <quant_name> [model_dir]

Build and run end-to-end tests for CUDA models.

Arguments:
hf_model HuggingFace model ID (required)
Supported models:
- mistralai/Voxtral-Mini-3B-2507
- openai/whisper-small
- google/gemma-3-4b-it

quant_name Quantization type (required)
Options:
- non-quantized
- quantized-int4-tile-packed
- quantized-int4-weight-only

model_dir Directory containing model artifacts (optional, default: current directory)
Expected files: model.pte, aoti_cuda_blob.ptd
Tokenizers and test files will be downloaded to this directory

Examples:
test_model_cuda_e2e.sh "openai/whisper-small" "non-quantized"
test_model_cuda_e2e.sh "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed" "./model_output"
EOF
}

if [ "${1:-}" = "-h" ] || [ "${1:-}" = "--help" ]; then
show_help
exit 0
fi

if [ -z "${1:-}" ]; then
echo "Error: hf_model argument is required"
echo "Run with -h or --help for usage information"
exit 1
fi

if [ -z "${2:-}" ]; then
echo "Error: quant_name argument is required"
echo "Run with -h or --help for usage information"
exit 1
fi

set -eux

HF_MODEL="$1"
QUANT_NAME="$2"
# Download tokenizers, audio, and image files to this directory
MODEL_DIR="${3:-.}"

echo "Testing model: $HF_MODEL (quantization: $QUANT_NAME)"

# Make sure model.pte and aoti_cuda_blob.ptd exist
if [ ! -f "$MODEL_DIR/model.pte" ]; then
echo "Error: model.pte not found in $MODEL_DIR"
exit 1
fi
if [ ! -f "$MODEL_DIR/aoti_cuda_blob.ptd" ]; then
echo "Error: aoti_cuda_blob.ptd not found in $MODEL_DIR"
exit 1
fi
# Locate EXECUTORCH_ROOT from the directory of this script
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
EXECUTORCH_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

pushd "$EXECUTORCH_ROOT"

# Determine model configuration based on HF model ID
case "$HF_MODEL" in
mistralai/Voxtral-Mini-3B-2507)
MODEL_NAME="voxtral"
RUNNER_TARGET="voxtral_runner"
RUNNER_PATH="voxtral"
EXPECTED_OUTPUT="poem"
PREPROCESSOR="voxtral_preprocessor.pte"
TOKENIZER_URL="https://huggingface.co/mistralai/Voxtral-Mini-3B-2507/resolve/main" # @lint-ignore
TOKENIZER_FILE="tekken.json"
AUDIO_URL="https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav"
AUDIO_FILE="poem.wav"
IMAGE_PATH=""
;;
openai/whisper-small)
MODEL_NAME="whisper"
RUNNER_TARGET="whisper_runner"
RUNNER_PATH="whisper"
EXPECTED_OUTPUT="Mr. Quilter"
PREPROCESSOR="whisper_preprocessor.pte"
TOKENIZER_URL="https://huggingface.co/openai/whisper-small/resolve/main" # @lint-ignore
TOKENIZER_FILE=""
AUDIO_URL=""
AUDIO_FILE="output.wav"
IMAGE_PATH=""
;;
google/gemma-3-4b-it)
MODEL_NAME="gemma3"
RUNNER_TARGET="gemma3_e2e_runner"
RUNNER_PATH="gemma3"
EXPECTED_OUTPUT="chip"
PREPROCESSOR=""
TOKENIZER_URL="https://huggingface.co/google/gemma-3-4b-it/resolve/main" # @lint-ignore
TOKENIZER_FILE=""
AUDIO_URL=""
AUDIO_FILE=""
IMAGE_PATH="docs/source/_static/img/et-logo.png"
;;
*)
echo "Error: Unsupported model '$HF_MODEL'"
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, openai/whisper-small, google/gemma-3-4b-it"
exit 1
;;
esac

echo "::group::Setup ExecuTorch Requirements"
./install_requirements.sh
pip list
echo "::endgroup::"

echo "::group::Prepare $MODEL_NAME Artifacts"


# Download tokenizer files
if [ "$TOKENIZER_FILE" != "" ]; then
curl -L $TOKENIZER_URL/$TOKENIZER_FILE -o $MODEL_DIR/$TOKENIZER_FILE
else
curl -L $TOKENIZER_URL/tokenizer.json -o $MODEL_DIR/tokenizer.json
curl -L $TOKENIZER_URL/tokenizer_config.json -o $MODEL_DIR/tokenizer_config.json
curl -L $TOKENIZER_URL/special_tokens_map.json -o $MODEL_DIR/special_tokens_map.json
fi

# Download test files
if [ "$AUDIO_URL" != "" ]; then
curl -L $AUDIO_URL -o ${MODEL_DIR}/$AUDIO_FILE
elif [ "$MODEL_NAME" = "whisper" ]; then
python -c "from datasets import load_dataset;import soundfile as sf;sample = load_dataset('distil-whisper/librispeech_long', 'clean', split='validation')[0]['audio'];sf.write('${MODEL_DIR}/$AUDIO_FILE', sample['array'][:sample['sampling_rate']*30], sample['sampling_rate'])"
fi

ls -al
echo "::endgroup::"

echo "::group::Build $MODEL_NAME Runner"
cmake --preset llm \
-DEXECUTORCH_BUILD_CUDA=ON \
-DCMAKE_INSTALL_PREFIX=cmake-out \
-DCMAKE_BUILD_TYPE=Release \
-Bcmake-out -S.
cmake --build cmake-out -j$(nproc) --target install --config Release

cmake -DEXECUTORCH_BUILD_CUDA=ON \
-DCMAKE_BUILD_TYPE=Release \
-Sexamples/models/$RUNNER_PATH \
-Bcmake-out/examples/models/$RUNNER_PATH/
cmake --build cmake-out/examples/models/$RUNNER_PATH --target $RUNNER_TARGET --config Release
echo "::endgroup::"

echo "::group::Run $MODEL_NAME Runner"
set +e
export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH

# Build runner command with common arguments
RUNNER_BIN="cmake-out/examples/models/$RUNNER_PATH/$RUNNER_TARGET"
RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd --temperature 0"

# Add model-specific arguments
case "$MODEL_NAME" in
voxtral)
RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE --audio_path ${MODEL_DIR}/$AUDIO_FILE --processor_path ${MODEL_DIR}/$PREPROCESSOR"
;;
whisper)
RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/ --audio_path ${MODEL_DIR}/$AUDIO_FILE --processor_path ${MODEL_DIR}/$PREPROCESSOR"
;;
gemma3)
RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/ --image_path $IMAGE_PATH"
;;
esac

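# Capture combined stdout/stderr so the expected string can be checked below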
OUTPUT=$($RUNNER_BIN $RUNNER_ARGS 2>&1)
EXIT_CODE=$?
set -e

if ! echo "$OUTPUT" | grep -iq "$EXPECTED_OUTPUT"; then
echo "Expected output '$EXPECTED_OUTPUT' not found in output"
exit 1
else
echo "Success: '$EXPECTED_OUTPUT' found in output"
fi

if [ $EXIT_CODE -ne 0 ]; then
echo "Unexpected exit code: $EXIT_CODE"
exit $EXIT_CODE
fi
echo "::endgroup::"

popd