2 changes: 1 addition & 1 deletion .ci/docker/ci_commit_pins/optimum-executorch.txt
@@ -1 +1 @@
-467660923a5a25e4718e1d6697b93ff1bab4e807
+4361747abfc55e40e929396ed986efe775d745f9
150 changes: 150 additions & 0 deletions .ci/scripts/export_model_cuda_artifact.sh
@@ -0,0 +1,150 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Export model to CUDA format with optional quantization

show_help() {
  cat << EOF
Usage: export_model_cuda_artifact.sh <hf_model> [quant_name] [output_dir]

Export a HuggingFace model to CUDA format with optional quantization.

Arguments:
  hf_model     HuggingFace model ID (required)
               Supported models:
                 - mistralai/Voxtral-Mini-3B-2507
                 - openai/whisper-small
                 - google/gemma-3-4b-it

  quant_name   Quantization type (optional, default: non-quantized)
               Options:
                 - non-quantized
                 - quantized-int4-tile-packed
                 - quantized-int4-weight-only

  output_dir   Output directory for artifacts (optional, default: current directory)

Examples:
  export_model_cuda_artifact.sh "openai/whisper-small"
  export_model_cuda_artifact.sh "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed"
  export_model_cuda_artifact.sh "google/gemma-3-4b-it" "non-quantized" "./output"
EOF
}

if [ "${1:-}" = "-h" ] || [ "${1:-}" = "--help" ]; then
  show_help
  exit 0
fi

if [ -z "${1:-}" ]; then
  echo "Error: hf_model argument is required"
  echo "Run with -h or --help for usage information"
  exit 1
fi

set -eux

HF_MODEL="$1"
QUANT_NAME="${2:-non-quantized}"
OUTPUT_DIR="${3:-.}"

# Determine model configuration based on HF model ID
case "$HF_MODEL" in
  mistralai/Voxtral-Mini-3B-2507)
    MODEL_NAME="voxtral"
    TASK="multimodal-text-to-text"
    MAX_SEQ_LEN="1024"
    EXTRA_PIP="mistral-common librosa"
    PREPROCESSOR_FEATURE_SIZE="128"
    PREPROCESSOR_OUTPUT="voxtral_preprocessor.pte"
    ;;
  openai/whisper-small)
    MODEL_NAME="whisper"
    TASK="automatic-speech-recognition"
    MAX_SEQ_LEN=""
    EXTRA_PIP="librosa"
    PREPROCESSOR_FEATURE_SIZE="80"
    PREPROCESSOR_OUTPUT="whisper_preprocessor.pte"
    ;;
  google/gemma-3-4b-it)
    MODEL_NAME="gemma3"
    TASK="multimodal-text-to-text"
    MAX_SEQ_LEN="64"
    EXTRA_PIP=""
    PREPROCESSOR_FEATURE_SIZE=""
    PREPROCESSOR_OUTPUT=""
    ;;
  *)
    echo "Error: Unsupported model '$HF_MODEL'"
    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, openai/whisper-small, google/gemma-3-4b-it"
    exit 1
    ;;
esac
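# Note: empty MAX_SEQ_LEN / EXTRA_PIP values simply skip the corresponding
# --max_seq_len flag or pip install below; empty PREPROCESSOR_* values skip
# the optional audio preprocessor export.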

# Determine quantization args based on quant name
case "$QUANT_NAME" in
  non-quantized)
    EXTRA_ARGS=""
    ;;
  quantized-int4-tile-packed)
    EXTRA_ARGS="--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d"
    ;;
  quantized-int4-weight-only)
    EXTRA_ARGS="--qlinear_encoder 4w"
    ;;
  *)
    echo "Error: Unsupported quantization '$QUANT_NAME'"
    echo "Supported quantizations: non-quantized, quantized-int4-tile-packed, quantized-int4-weight-only"
    exit 1
    ;;
esac

echo "::group::Export $MODEL_NAME"

if [ -n "$EXTRA_PIP" ]; then
  pip install $EXTRA_PIP
fi
pip list

MAX_SEQ_LEN_ARG=""
if [ -n "$MAX_SEQ_LEN" ]; then
  MAX_SEQ_LEN_ARG="--max_seq_len $MAX_SEQ_LEN"
fi
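
# Export via optimum-executorch. With the "cuda" recipe, the export is
# expected to produce model.pte plus an AOTInductor kernel blob
# (aoti_cuda_blob.ptd) in --output_dir; both are checked below.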
optimum-cli export executorch \
  --model "$HF_MODEL" \
  --task "$TASK" \
  --recipe "cuda" \
  --dtype bfloat16 \
  --device cuda \
  ${MAX_SEQ_LEN_ARG} \
  ${EXTRA_ARGS} \
  --output_dir ./

# For audio models, also export a mel-spectrogram preprocessor PTE alongside
# the main model
if [ -n "$PREPROCESSOR_OUTPUT" ]; then
  python -m executorch.extension.audio.mel_spectrogram \
    --feature_size $PREPROCESSOR_FEATURE_SIZE \
    --stack_output \
    --max_audio_len 300 \
    --output_file $PREPROCESSOR_OUTPUT
fi

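# Sanity-check that the export produced the expected artifacts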
test -f model.pte
test -f aoti_cuda_blob.ptd
if [ -n "$PREPROCESSOR_OUTPUT" ]; then
  test -f $PREPROCESSOR_OUTPUT
fi
echo "::endgroup::"

echo "::group::Store $MODEL_NAME Artifacts"
mkdir -p "${OUTPUT_DIR}"
cp model.pte "${OUTPUT_DIR}/"
cp aoti_cuda_blob.ptd "${OUTPUT_DIR}/"
if [ -n "$PREPROCESSOR_OUTPUT" ]; then
  cp $PREPROCESSOR_OUTPUT "${OUTPUT_DIR}/"
fi
ls -al "${OUTPUT_DIR}"
echo "::endgroup::"
205 changes: 205 additions & 0 deletions .ci/scripts/test_model_cuda_e2e.sh
@@ -0,0 +1,205 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Test a CUDA model end-to-end. Run .ci/scripts/export_model_cuda_artifact.sh
# first to produce the model artifacts.
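#
# A typical sequence (using the whisper example from the usage text below):
#   .ci/scripts/export_model_cuda_artifact.sh "openai/whisper-small" "non-quantized" ./model_output
#   .ci/scripts/test_model_cuda_e2e.sh "openai/whisper-small" "non-quantized" ./model_output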

show_help() {
  cat << EOF
Usage: test_model_cuda_e2e.sh <hf_model> <quant_name> [model_dir]

Build and run end-to-end tests for CUDA models.

Arguments:
  hf_model     HuggingFace model ID (required)
               Supported models:
                 - mistralai/Voxtral-Mini-3B-2507
                 - openai/whisper-small
                 - google/gemma-3-4b-it

  quant_name   Quantization type (required)
               Options:
                 - non-quantized
                 - quantized-int4-tile-packed
                 - quantized-int4-weight-only

  model_dir    Directory containing model artifacts (optional, default: current directory)
               Expected files: model.pte, aoti_cuda_blob.ptd
               Tokenizers and test files will be downloaded to this directory

Examples:
  test_model_cuda_e2e.sh "openai/whisper-small" "non-quantized"
  test_model_cuda_e2e.sh "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed" "./model_output"
EOF
}

if [ "${1:-}" = "-h" ] || [ "${1:-}" = "--help" ]; then
  show_help
  exit 0
fi

if [ -z "${1:-}" ]; then
  echo "Error: hf_model argument is required"
  echo "Run with -h or --help for usage information"
  exit 1
fi

if [ -z "${2:-}" ]; then
  echo "Error: quant_name argument is required"
  echo "Run with -h or --help for usage information"
  exit 1
fi

set -eux

HF_MODEL="$1"
QUANT_NAME="$2"
# Download tokenizers, audio, and image files to this directory
MODEL_DIR="${3:-.}"

echo "Testing model: $HF_MODEL (quantization: $QUANT_NAME)"

# Make sure model.pte and aoti_cuda_blob.ptd exist
if [ ! -f "$MODEL_DIR/model.pte" ]; then
  echo "Error: model.pte not found in $MODEL_DIR"
  exit 1
fi
if [ ! -f "$MODEL_DIR/aoti_cuda_blob.ptd" ]; then
  echo "Error: aoti_cuda_blob.ptd not found in $MODEL_DIR"
  exit 1
fi
# Locate EXECUTORCH_ROOT from the directory of this script
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
EXECUTORCH_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

pushd "$EXECUTORCH_ROOT"

# Determine model configuration based on HF model ID
case "$HF_MODEL" in
  mistralai/Voxtral-Mini-3B-2507)
    MODEL_NAME="voxtral"
    RUNNER_TARGET="voxtral_runner"
    RUNNER_PATH="voxtral"
    EXPECTED_OUTPUT="poem"
    PREPROCESSOR="voxtral_preprocessor.pte"
    TOKENIZER_URL="https://huggingface.co/mistralai/Voxtral-Mini-3B-2507/resolve/main" # @lint-ignore
    TOKENIZER_FILE="tekken.json"
    AUDIO_URL="https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav"
    AUDIO_FILE="poem.wav"
    IMAGE_PATH=""
    ;;
  openai/whisper-small)
    MODEL_NAME="whisper"
    RUNNER_TARGET="whisper_runner"
    RUNNER_PATH="whisper"
    EXPECTED_OUTPUT="Mr. Quilter is the apostle of the middle classes"
    PREPROCESSOR="whisper_preprocessor.pte"
    TOKENIZER_URL="https://huggingface.co/openai/whisper-small/resolve/main" # @lint-ignore
    TOKENIZER_FILE=""
    AUDIO_URL=""
    AUDIO_FILE="output.wav"
    IMAGE_PATH=""
    ;;
  google/gemma-3-4b-it)
    MODEL_NAME="gemma3"
    RUNNER_TARGET="gemma3_e2e_runner"
    RUNNER_PATH="gemma3"
    EXPECTED_OUTPUT="chip"
    PREPROCESSOR=""
    TOKENIZER_URL="https://huggingface.co/google/gemma-3-4b-it/resolve/main" # @lint-ignore
    TOKENIZER_FILE=""
    AUDIO_URL=""
    AUDIO_FILE=""
    IMAGE_PATH="docs/source/_static/img/et-logo.png"
    ;;
  *)
    echo "Error: Unsupported model '$HF_MODEL'"
    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, openai/whisper-small, google/gemma-3-4b-it"
    exit 1
    ;;
esac
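# Note: EXPECTED_OUTPUT is a substring matched case-insensitively against the
# runner output at the end of this script.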

echo "::group::Setup ExecuTorch Requirements"
./install_requirements.sh
pip list
echo "::endgroup::"

echo "::group::Prepare $MODEL_NAME Artifacts"

# Download tokenizer files
if [ "$TOKENIZER_FILE" != "" ]; then
  curl -L $TOKENIZER_URL/$TOKENIZER_FILE -o $MODEL_DIR/$TOKENIZER_FILE
else
  curl -L $TOKENIZER_URL/tokenizer.json -o $MODEL_DIR/tokenizer.json
  curl -L $TOKENIZER_URL/tokenizer_config.json -o $MODEL_DIR/tokenizer_config.json
  curl -L $TOKENIZER_URL/special_tokens_map.json -o $MODEL_DIR/special_tokens_map.json
fi

# Download test files
if [ "$AUDIO_URL" != "" ]; then
  curl -L $AUDIO_URL -o ${MODEL_DIR}/$AUDIO_FILE
elif [ "$MODEL_NAME" = "whisper" ]; then
  # Synthesize a 30-second test clip from the librispeech_long dataset
  python - <<EOF
from datasets import load_dataset
import soundfile as sf

sample = load_dataset('distil-whisper/librispeech_long', 'clean', split='validation')[0]['audio']
sf.write('${MODEL_DIR}/$AUDIO_FILE', sample['array'][:sample['sampling_rate'] * 30], sample['sampling_rate'])
EOF
fi

ls -al
echo "::endgroup::"

echo "::group::Build $MODEL_NAME Runner"
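# Two-stage build: first install the core ExecuTorch libraries with CUDA
# enabled, then build the model-specific example runner against them.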
cmake --preset llm \
  -DEXECUTORCH_BUILD_CUDA=ON \
  -DCMAKE_INSTALL_PREFIX=cmake-out \
  -DCMAKE_BUILD_TYPE=Release \
  -Bcmake-out -S.
cmake --build cmake-out -j$(nproc) --target install --config Release

cmake -DEXECUTORCH_BUILD_CUDA=ON \
  -DCMAKE_BUILD_TYPE=Release \
  -Sexamples/models/$RUNNER_PATH \
  -Bcmake-out/examples/models/$RUNNER_PATH/
cmake --build cmake-out/examples/models/$RUNNER_PATH --target $RUNNER_TARGET --config Release
echo "::endgroup::"

echo "::group::Run $MODEL_NAME Runner"
set +e
export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH

# Build runner command with common arguments
RUNNER_BIN="cmake-out/examples/models/$RUNNER_PATH/$RUNNER_TARGET"
RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd --temperature 0"

# Add model-specific arguments
case "$MODEL_NAME" in
  voxtral)
    RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE --audio_path ${MODEL_DIR}/$AUDIO_FILE --processor_path ${MODEL_DIR}/$PREPROCESSOR"
    ;;
  whisper)
    RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/ --audio_path ${MODEL_DIR}/$AUDIO_FILE --processor_path ${MODEL_DIR}/$PREPROCESSOR"
    ;;
  gemma3)
    RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/ --image_path $IMAGE_PATH"
    ;;
esac
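
# For whisper with MODEL_DIR="." this expands to roughly:
#   cmake-out/examples/models/whisper/whisper_runner \
#     --model_path ./model.pte --data_path ./aoti_cuda_blob.ptd --temperature 0 \
#     --tokenizer_path ./ --audio_path ./output.wav --processor_path ./whisper_preprocessor.pte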

OUTPUT=$($RUNNER_BIN $RUNNER_ARGS 2>&1)
EXIT_CODE=$?
set -e

if ! echo "$OUTPUT" | grep -iq "$EXPECTED_OUTPUT"; then
  echo "Expected output '$EXPECTED_OUTPUT' not found in output"
  exit 1
else
  echo "Success: '$EXPECTED_OUTPUT' found in output"
fi

if [ $EXIT_CODE -ne 0 ]; then
  echo "Unexpected exit code: $EXIT_CODE"
  exit $EXIT_CODE
fi
echo "::endgroup::"

popd