Skip to content

Commit c856af5

Browse files
author
jetstream authors
committed
Merge pull request #250 from AI-Hypercomputer:vij_moe_runner
PiperOrigin-RevId: 748827578
2 parents 7ca7c58 + 4058510 commit c856af5

File tree

3 files changed

+70
-52
lines changed

3 files changed

+70
-52
lines changed

.github/workflows/run_maxtext_jetstream_tests.yaml

Lines changed: 16 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -24,15 +24,15 @@ on:
2424
workflow_dispatch:
2525
schedule:
2626
# Run the job every 24 hours
27-
- cron: '0 */1 * * *'
27+
- cron: '0 */24 * * *'
2828

2929
jobs:
3030
prelim:
3131
runs-on: ["self-hosted", "tpu", "v6e-8"]
3232
steps:
3333
- uses: actions/checkout@v4
34-
- name: Test MOE Microbenchmarks
35-
run: bash .github/workflows/test_moe_microbenchmarks.sh
34+
- name: Test MOE Benchmarks
35+
run: bash .github/workflows/test_moe_benchmarks.sh
3636
# run: bash .github/workflows/test_moe_8x22b_microbenchmark.sh
3737
# - name: Test MOE long context chunked prefill - 8k
3838
# run: bash .github/workflows/benchmark_chunked_prefill.sh
@@ -50,19 +50,18 @@ jobs:
5050
- name: Log message if dependent job succeeded
5151
if: ${{ ! (failure() && github.event.pull_request == null) }}
5252
run: echo "Conditions for creating/updating issue not met. Skipping."
53-
# - name: Send email
54-
# uses: dawidd6/action-send-mail@v3
55-
# with:
56-
# server_address: smtp.gmail.com
57-
# server_port: 465
58-
# username: ${{secrets.MAIL_USERNAME}}
59-
# password: ${{secrets.MAIL_PASSWORD}}
60-
# subject: Message from Inference Stable Stack Runs.
61-
62-
# from: InferenceStableStackRuns
63-
# secure: true
64-
# attachments: ~/test_dir/moe_8x7b_jetstream.txt
65-
# # attachments: ~/test_dir/moe_8x7b.txt,~/test_dir/moe_8x22b.txt,~/test_dir/moe_8x22b_long_context_8k_prefill.txt
66-
# body: workflow for ${{github.repository}} completed successfully!
53+
- name: Send email
54+
uses: dawidd6/action-send-mail@v3
55+
with:
56+
server_address: smtp.gmail.com
57+
server_port: 465
58+
username: ${{secrets.MAIL_USERNAME}}
59+
password: ${{secrets.MAIL_PASSWORD}}
60+
subject: Message from Inference Stable Stack Runs.
61+
62+
from: JetStream Runs
63+
secure: true
64+
attachments: ~/test_dir/moe_8x7b.txt,~/test_dir/moe_8x22b.txt,~/test_dir/moe_8x22b_long_context_8k_prefill.txt,~/test_dir/moe_8x7b_jetstream.txt
65+
body: workflow for ${{github.repository}} completed successfully!
6766
- name: Cleanup
6867
run: rm -rf ~/test_dir
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
#!/bin/bash
2+
mkdir ~/test_dir
3+
cd ~/test_dir
4+
git clone https://github.com/google/maxtext.git
5+
6+
cd ~/test_dir
7+
git clone https://github.com/google/JetStream.git
8+
cd ~/test_dir
9+
sudo apt-get -y update
10+
sudo apt-get -y install python3.10-venv
11+
sudo apt-get -y install jq
12+
python -m venv .env
13+
source .env/bin/activate
14+
15+
cd ~/test_dir
16+
cd JetStream
17+
pip install -e .
18+
cd benchmarks
19+
pip install -r requirements.in
20+
21+
cd ~/test_dir
22+
cd maxtext/
23+
pip3 install wheel
24+
bash setup.sh MODE=stable DEVICE=tpu
25+
26+
pip install nltk==3.8.1
27+
28+
29+
# moe 8x7b microbenchmark
30+
LIBTPU_INIT_ARGS="--xla_tpu_enable_windowed_einsum_for_reduce_scatter=false --xla_jf_spmd_threshold_for_windowed_einsum_mib=1000000" python -m MaxText.inference_microbenchmark MaxText/configs/inference.yml tokenizer_path=assets/tokenizer.mistral-v1 max_prefill_predict_length=1024 max_target_length=2048 model_name=mixtral-8x7b ici_fsdp_parallelism=1 ici_autoregressive_parallelism=1 ici_tensor_parallelism=1 ici_context_autoregressive_parallelism=8 scan_layers=false weight_dtype=bfloat16 per_device_batch_size=8 megablox=False quantization=int8 quantize_kvcache=False checkpoint_is_quantized=True load_parameters_path=gs://jetstream-runner/8-7B-int8 capacity_factor=1 attention=dot_product model_call_mode=inference sparse_matmul=False weight_dtype=bfloat16 > ~/test_dir/moe_8x7b.txt
31+
tail -n5 ~/test_dir/moe_8x7b.txt > ~/test_dir/moe_8x7b.tmp && mv ~/test_dir/moe_8x7b.tmp ~/test_dir/moe_8x7b.txt
32+
33+
# moe 8x22B microbenchmark
34+
LIBTPU_INIT_ARGS="--xla_tpu_enable_windowed_einsum_for_reduce_scatter=false --xla_jf_spmd_threshold_for_windowed_einsum_mib=1000000" python -m MaxText.inference_microbenchmark MaxText/configs/inference.yml load_parameters_path=gs://jetstream-runner/8-22B-int8 max_prefill_predict_length=1024 max_target_length=2048 model_name=mixtral-8x22b ici_fsdp_parallelism=1 ici_autoregressive_parallelism=1 ici_tensor_parallelism=1 ici_context_autoregressive_parallelism=8 scan_layers=false per_device_batch_size=24 attention=dot_product megablox=False quantization=int8 checkpoint_is_quantized=True quantize_kvcache=True capacity_factor=1 tokenizer_path=assets/tokenizer.mistral-v3 inference_microbenchmark_prefill_lengths="128,1024" sparse_matmul=False model_call_mode=inference > ~/test_dir/moe_8x22b.txt
35+
tail -n5 ~/test_dir/moe_8x22b.txt > ~/test_dir/moe_8x22b.tmp && mv ~/test_dir/moe_8x22b.tmp ~/test_dir/moe_8x22b.txt
36+
37+
# moe 8x22B 8k context length chunked prefill with 2k prefill chunk size
38+
LIBTPU_INIT_ARGS="--xla_tpu_enable_windowed_einsum_for_reduce_scatter=false --xla_jf_spmd_threshold_for_windowed_einsum_mib=1000000" python -m MaxText.benchmark_chunked_prefill MaxText/configs/inference.yml load_parameters_path=gs://jetstream-runner/8-22B-int8 max_prefill_predict_length=8192 max_target_length=9000 model_name=mixtral-8x22b ici_fsdp_parallelism=1 ici_autoregressive_parallelism=1 ici_tensor_parallelism=1 scan_layers=false per_device_batch_size=24 attention=dot_product megablox=False quantization=int8 checkpoint_is_quantized=True quantize_kvcache=False capacity_factor=1 tokenizer_path=assets/tokenizer.mistral-v3 inference_microbenchmark_prefill_lengths="8192" sparse_matmul=False model_call_mode=inference ici_context_autoregressive_parallelism=8 use_chunked_prefill=True prefill_chunk_size=2048 > ~/test_dir/moe_8x22b_long_context_8k_prefill.txt
39+
tail -n5 ~/test_dir/moe_8x22b_long_context_8k_prefill.txt > ~/test_dir/moe_8x22b_long_context_8k_prefill.tmp && mv ~/test_dir/moe_8x22b_long_context_8k_prefill.tmp ~/test_dir/moe_8x22b_long_context_8k_prefill.txt
40+
41+
42+
# moe 8x7B Maxtext Jetstream
43+
44+
LIBTPU_INIT_ARGS="--xla_tpu_enable_windowed_einsum_for_reduce_scatter=false --xla_jf_spmd_threshold_for_windowed_einsum_mib=1000000" python -m MaxText.maxengine_server MaxText/configs/inference.yml tokenizer_path=assets/tokenizer.mistral-v1 max_prefill_predict_length=1024 max_target_length=2048 model_name=mixtral-8x7b ici_fsdp_parallelism=1 ici_autoregressive_parallelism=1 ici_tensor_parallelism=1 ici_context_autoregressive_parallelism=8 scan_layers=false weight_dtype=bfloat16 per_device_batch_size=24 megablox=False quantization=int8 quantize_kvcache=True checkpoint_is_quantized=True load_parameters_path=gs://jetstream-runner/8-7B-int8 capacity_factor=1 attention=dot_product model_call_mode=inference sparse_matmul=False weight_dtype=bfloat16
45+
46+
sleep 600
47+
48+
cd ..
49+
50+
python JetStream/benchmarks/benchmark_serving.py --tokenizer ~/test_dir/maxtext/assets/tokenizer.mistral-v1 --save-result --save-request-outputs --request-outputs-file-path outputs.json --num-prompts 1200 --max-output-length 1024 --dataset openorca --run-eval True > ~/test_dir/moe_8x7b_jetstream.txt
51+
tail -n10 ~/test_dir/moe_8x7b_jetstream.txt > ~/test_dir/moe_8x7b_jetstream.tmp && mv ~/test_dir/moe_8x7b_jetstream.tmp ~/test_dir/moe_8x7b_jetstream.txt
52+
53+
# kill python jobs
54+
sudo kill -9 $(ps aux | grep python | awk '{print $2}')

.github/workflows/test_moe_microbenchmarks.sh

Lines changed: 0 additions & 35 deletions
This file was deleted.

0 commit comments

Comments
 (0)