Skip to content

feat(archon): improve pipeline parallelism memory handling #542

feat(archon): improve pipeline parallelism memory handling

feat(archon): improve pipeline parallelism memory handling #542

Workflow file for this run

# Workflow-level settings: triggers, run concurrency, and environment values
# shared by every job in this file.
name: AReaL CI on GCP runner

on:
  # Pull requests targeting `main` start a run only on a `labeled` event.
  pull_request:
    branches:
      - main
    types:
      - labeled
  workflow_dispatch:
  # Reusable-workflow entry point; the caller may choose the image tag.
  workflow_call:
    inputs:
      image_tag:
        description: Docker image tag to use for testing
        required: false
        type: string
        default: dev

# One active run per ref: a newer run cancels the one still in progress.
concurrency:
  group: areal-unit-tests-${{ github.ref }}
  cancel-in-progress: true

env:
  GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }}
  # Label attached to the provisioned runner; the test job's `runs-on` uses
  # the same label.
  RUNNER_LABELS: gcp-a2-highgpu-2g
  RUNNER_VERSION: "2.317.0"
  GCP_OS_IMAGE: areal-cicd-test-202602030
  # Falls back to the `dev` tag when no `image_tag` input is supplied.
  CONTAINER_IMAGE: ghcr.io/inclusionai/areal-runtime:${{ inputs.image_tag || 'dev' }}
jobs:
  # Creates a GCE VM, starts the CI container on it, registers an ephemeral
  # GitHub Actions runner inside that container, and waits until the runner
  # reports as online. Exposes the instance name and zone to later jobs.
  provision-runner:
    # NOTE(review): GitHub documents that inside a reusable workflow the
    # `github` context belongs to the caller, so `github.event_name` may never
    # equal 'workflow_call' here -- confirm that called runs are gated as
    # intended.
    if: |
      github.event_name == 'workflow_call' ||
      contains(github.event.pull_request.labels.*.name, 'safe-to-test') ||
      github.event_name == 'workflow_dispatch'
    name: Provision GCP runner with 2 A100 GPUs
    runs-on: ubuntu-latest
    outputs:
      instance_name: ${{ steps.vars.outputs.instance_name }}
      instance_zone: ${{ steps.create-instance.outputs.zone }}
    steps:
      # The instance name is derived from the run id.
      - name: Set instance variables
        id: vars
        run: |
          echo "instance_name=gcp-runner-${{ github.run_id }}" >> "$GITHUB_OUTPUT"

      - name: Authenticate to Google Cloud
        uses: google-github-actions/auth@v2
        with:
          credentials_json: ${{ secrets.GCP_SERVICE_ACCOUNT_KEY }}

      - name: Set up Google Cloud SDK
        uses: google-github-actions/setup-gcloud@v2

      # Requests a runner registration token for this repository using GH_PAT.
      - name: Fetch GitHub runner token
        id: runner-token
        uses: actions/github-script@v7
        env:
          GH_PAT: ${{ secrets.GH_PAT }}
        with:
          github-token: ${{ secrets.GH_PAT }}
          script: |
            const pat = process.env.GH_PAT;
            if (!pat) {
              core.setFailed('GH_PAT secret is not configured.');
              return;
            }
            const tokenResponse = await github.rest.actions.createRegistrationTokenForRepo({
              owner: context.repo.owner,
              repo: context.repo.repo,
              request: {
                headers: {
                  authorization: `token ${pat}`,
                },
              },
            });
            const registrationToken = tokenResponse.data.token;
            // Mask the token before exposing it: a later step receives it via
            // `env:`, and unmasked values can show up in step logs.
            core.setSecret(registrationToken);
            core.setOutput('token', registrationToken);

      # Writes startup-script.sh (run by the VM at boot) and runner-token.txt;
      # both are uploaded as instance metadata by the next step.
      - name: Render startup script and metadata files
        env:
          RUNNER_VERSION: ${{ env.RUNNER_VERSION }}
          TOKEN: ${{ steps.runner-token.outputs.token }}
        run: |
          # The heredoc delimiter is quoted, so nothing below is expanded here;
          # the placeholders are substituted by sed afterwards.
          cat <<'EOF' > startup-script.template
          #!/bin/bash
          set -euo pipefail
          exec > >(tee /var/log/github-runner-startup.log) 2>&1
          RUNNER_VERSION="__RUNNER_VERSION__"
          CONTAINER_NAME="areal-cicd"
          CONTAINER_IMAGE="__CONTAINER_IMAGE__"
          RUNNER_DIR="/opt/actions-runner"
          # apt-get update
          # apt-get install -y curl jq
          # systemctl enable docker
          # systemctl start docker
          if ! docker ps --format '{{.Names}}' | grep -qx "$CONTAINER_NAME"; then
            docker run --name "$CONTAINER_NAME" -d -it \
              -e LC_ALL=C.UTF-8 \
              -e LANG=C.UTF-8 \
              --ulimit nofile=1048576:1048576 \
              --shm-size="58205394001.92b" \
              --runtime=nvidia \
              --gpus all \
              --net=host \
              --cap-add=SYS_ADMIN \
              --device=/dev/fuse \
              --security-opt=apparmor:unconfined \
              -v /storage:/storage \
              --entrypoint=/bin/bash \
              "$CONTAINER_IMAGE" \
              -lc "trap : TERM INT; sleep infinity & wait"
          fi
          docker exec "$CONTAINER_NAME" bash -lc "
            set -euo pipefail
            RUNNER_VERSION=\"__RUNNER_VERSION__\"
            RUNNER_DIR=\"/opt/actions-runner\"
            REPO=\$(curl -fsSL -H \"Metadata-Flavor: Google\" \"http://metadata.google.internal/computeMetadata/v1/instance/attributes/repo\")
            TOKEN=\$(curl -fsSL -H \"Metadata-Flavor: Google\" \"http://metadata.google.internal/computeMetadata/v1/instance/attributes/runner_token\")
            LABELS=\$(curl -fsSL -H \"Metadata-Flavor: Google\" \"http://metadata.google.internal/computeMetadata/v1/instance/attributes/runner_labels\")
            apt-get update
            apt-get install -y sudo
            if ! id runner >/dev/null 2>&1; then
              useradd --home-dir \"\${RUNNER_DIR}\" --create-home --shell /bin/bash runner
            fi
            cd \"\${RUNNER_DIR}\"
            curl -sSLO \"https://github.com/actions/runner/releases/download/v\${RUNNER_VERSION}/actions-runner-linux-x64-\${RUNNER_VERSION}.tar.gz\"
            tar xzf \"actions-runner-linux-x64-\${RUNNER_VERSION}.tar.gz\"
            chown -R runner:runner \"\${RUNNER_DIR}\"
            sudo -u runner ./config.sh \\
              --url \"https://github.com/\${REPO}\" \\
              --token \"\${TOKEN}\" \\
              --labels \"\${LABELS}\" \\
              --unattended \\
              --ephemeral
            sudo -u runner nohup ./run.sh >/opt/actions-runner/runner.log 2>&1 &
          "
          EOF
          sed -e "s/__RUNNER_VERSION__/${RUNNER_VERSION}/g" -e "s#__CONTAINER_IMAGE__#${CONTAINER_IMAGE}#g" startup-script.template > startup-script.sh
          rm startup-script.template
          printf '%s' "$TOKEN" > runner-token.txt

      # Tries each zone reported as UP until one accepts the instance, then
      # records that zone as a step output.
      - name: Create runner instance
        id: create-instance
        env:
          INSTANCE_NAME: ${{ steps.vars.outputs.instance_name }}
          # Passed through the environment instead of being interpolated into
          # the script text.
          REPO: ${{ github.repository }}
        run: |
          set -euo pipefail
          zones=$(gcloud compute zones list --project "$GCP_PROJECT_ID" --filter="status=UP" --format="value(name)")
          if [ -z "$zones" ]; then
            echo "No available zones found." >&2
            exit 1
          fi
          for zone in $zones; do
            echo "Attempting to create instance in $zone..."
            if gcloud compute instances create "$INSTANCE_NAME" \
              --project "$GCP_PROJECT_ID" \
              --zone "$zone" \
              --machine-type "a2-highgpu-2g" \
              --image "$GCP_OS_IMAGE" \
              --maintenance-policy TERMINATE \
              --restart-on-failure \
              --max-run-duration "2h" \
              --instance-termination-action DELETE \
              --scopes "https://www.googleapis.com/auth/cloud-platform" \
              --metadata "repo=${REPO},runner_labels=${RUNNER_LABELS}" \
              --metadata-from-file startup-script=startup-script.sh,runner_token=runner-token.txt
            then
              echo "Successfully created instance in $zone."
              echo "zone=$zone" >> "$GITHUB_OUTPUT"
              exit 0
            fi
            echo "Failed to create instance in $zone, trying next zone." >&2
          done
          echo "Unable to create instance in any available zone." >&2
          exit 1

      # Always delete the rendered files; one of them holds the runner token.
      - name: Remove local runner artifacts
        if: always()
        run: rm -f startup-script.sh runner-token.txt

      # Polls the repository's self-hosted runners (up to 200 attempts, 15 s
      # apart) until one named after the instance is online.
      - name: Wait for runner to register
        uses: actions/github-script@v7
        env:
          INSTANCE_NAME: ${{ steps.vars.outputs.instance_name }}
          GH_PAT: ${{ secrets.GH_PAT }}
        with:
          github-token: ${{ secrets.GH_PAT }}
          script: |
            const instanceName = process.env.INSTANCE_NAME;
            const maxAttempts = 200;
            const delayMs = 15000;
            const pat = process.env.GH_PAT;
            if (!pat) {
              core.setFailed('GH_PAT secret is not configured.');
              return;
            }
            const wait = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
            for (let attempt = 1; attempt <= maxAttempts; attempt++) {
              const response = await github.rest.actions.listSelfHostedRunnersForRepo({
                owner: context.repo.owner,
                repo: context.repo.repo,
                per_page: 100,
                request: {
                  headers: {
                    authorization: `token ${pat}`,
                  },
                },
              });
              const found = response.data.runners.find((runner) => runner.name === instanceName);
              if (found && found.status === 'online') {
                core.info(`Runner ${instanceName} is online.`);
                return;
              }
              core.info(`Runner ${instanceName} not ready yet (attempt ${attempt}/${maxAttempts}).`);
              await wait(delayMs);
            }
            throw new Error(`Timed out waiting for runner ${instanceName} to come online.`);
# Job `unit-tests` -- nested one level, as a child of the top-level `jobs:` key.
  # Runs the Docker validation script, the unit tests, and the SFT and GRPO
  # integration tests on the self-hosted GPU runner.
  unit-tests:
    # NOTE(review): GitHub documents that inside a reusable workflow the
    # `github` context belongs to the caller, so `github.event_name` may never
    # equal 'workflow_call' here -- confirm that called runs are gated as
    # intended.
    if: >-
      github.event_name == 'workflow_call' ||
      contains(github.event.pull_request.labels.*.name, 'safe-to-test') ||
      github.event_name == 'workflow_dispatch'
    needs: provision-runner
    name: Run AReaL tests
    environment: AReaL-unittests
    permissions:
      contents: read
    runs-on:
      - self-hosted
      - gcp-a2-highgpu-2g
    timeout-minutes: 120
    env:
      # Activate the venv created in the Docker image
      VIRTUAL_ENV: /AReaL/.venv
    steps:
      - uses: actions/checkout@v4

      - name: Validate Docker installation
        run: |
          export PATH="/AReaL/.venv/bin:$PATH"
          python areal/tools/validate_docker_installation.py

      # Selects tests matching the marker expression "not slow or ci".
      - name: Run unit tests
        env:
          CI: "true"
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          PYTHONPATH: ${{ github.workspace }}
          TOKENIZERS_PARALLELISM: "false"
          AREAL_IS_IN_CI: "1"
          VIRTUAL_ENV: /AReaL/.venv
        run: |
          export PATH="/AReaL/.venv/bin:$PATH"
          pytest -m "not slow or ci" --durations=20 -s -vv tests/test_*.py tests/experimental/

      - name: Run SFT integration tests
        env:
          CI: "true"
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          PYTHONPATH: ${{ github.workspace }}
          TOKENIZERS_PARALLELISM: "false"
          VIRTUAL_ENV: /AReaL/.venv
        run: |
          export PATH="/AReaL/.venv/bin:$PATH"
          pytest -s -vv tests/sft/

      - name: Run GRPO integration tests
        env:
          CI: "true"
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          PYTHONPATH: ${{ github.workspace }}
          TOKENIZERS_PARALLELISM: "false"
          VIRTUAL_ENV: /AReaL/.venv
        run: |
          export PATH="/AReaL/.venv/bin:$PATH"
          pytest -s -vv tests/grpo/
# Job `cleanup` -- nested one level, as a child of the top-level `jobs:` key.
  # Deletes the GCE instance created by `provision-runner`, using the instance
  # name and zone that job published as outputs.
  cleanup:
    name: Tear down GCP runner
    needs:
      - unit-tests
      - provision-runner
    # Run after failures and cancellations too, but not when provisioning was
    # skipped: then no instance exists and there is no reason to authenticate
    # to Google Cloud.
    if: always() && needs.provision-runner.result != 'skipped'
    runs-on: ubuntu-latest
    env:
      INSTANCE_NAME: ${{ needs.provision-runner.outputs.instance_name }}
      INSTANCE_ZONE: ${{ needs.provision-runner.outputs.instance_zone }}
    steps:
      - name: Authenticate to Google Cloud
        uses: google-github-actions/auth@v2
        with:
          credentials_json: ${{ secrets.GCP_SERVICE_ACCOUNT_KEY }}

      - name: Set up Google Cloud SDK
        uses: google-github-actions/setup-gcloud@v2

      - name: Delete runner instance
        run: |
          # Either output is empty when provisioning stopped before an
          # instance was created in some zone.
          if [ -z "$INSTANCE_NAME" ] || [ -z "$INSTANCE_ZONE" ]; then
            echo "No instance details recorded; skipping cleanup."
            exit 0
          fi
          # Delete only if the instance still exists.
          if gcloud compute instances describe "$INSTANCE_NAME" --project "$GCP_PROJECT_ID" --zone "$INSTANCE_ZONE" >/dev/null 2>&1; then
            gcloud compute instances delete "$INSTANCE_NAME" --project "$GCP_PROJECT_ID" --zone "$INSTANCE_ZONE" --quiet
          else
            echo "Instance $INSTANCE_NAME already removed."
          fi