Update comment for P5 FI_* in fsdp.yaml-template #22

name: FSDP Regression Test (EKS)
on:
  pull_request:
    paths:
      - '3.test_cases/pytorch/FSDP/**'
  workflow_dispatch:
jobs:
  regression:
    strategy:
      fail-fast: true
      max-parallel: 1
      matrix:
        cluster: [p5-eks]
        model_config: [llama2_7b, llama2_13b, llama2_70b, llama3_1_8b, llama3_1_70b]
    runs-on: [self-hosted, "${{ matrix.cluster }}"]
    timeout-minutes: 360 # 6 hours, enough for the largest models in the matrix
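    # The matrix currently expands to 5 serial jobs (max-parallel: 1) on p5-eks.
    # The p5en/p6 branches in "Set env vars" below are only exercised if those
    # runner labels are added here, e.g. (assuming such runners exist):
    #   cluster: [p5-eks, p5en-eks, p6-eks]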
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          path: ${{ github.run_id }}
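      # Checking out into a run-scoped subdirectory keeps workspaces from
      # colliding when multiple runs land on the same self-hosted runner.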
      - name: Set env vars
        run: |
          BUILD_ID="${{ github.run_id }}"
          FSDP_DIR="$(pwd)/${BUILD_ID}/3.test_cases/pytorch/FSDP"
          # Set instance-specific variables
          if [[ "${{ matrix.cluster }}" == "p5-eks" ]]; then
            EFA_PER_NODE=32
            INSTANCE_TYPE="p5.48xlarge"
          elif [[ "${{ matrix.cluster }}" == "p5en-eks" ]]; then
            EFA_PER_NODE=16
            INSTANCE_TYPE="p5en.48xlarge"
          elif [[ "${{ matrix.cluster }}" == "p6-eks" ]]; then
            EFA_PER_NODE=8
            INSTANCE_TYPE="p6-b200.48xlarge"
          else
            EFA_PER_NODE=32             # defaulting to p5 values for now
            INSTANCE_TYPE="p5.48xlarge" # defaulting to p5 values for now
          fi
          # Hardcoded to 4 nodes x 8 GPUs for the p* instance families
          NUM_NODES=4
          GPU_PER_NODE=8
          echo "BUILD_ID=$BUILD_ID" >> $GITHUB_ENV
          echo "FSDP_DIR=$FSDP_DIR" >> $GITHUB_ENV
          echo "NUM_NODES=$NUM_NODES" >> $GITHUB_ENV
          echo "GPU_PER_NODE=$GPU_PER_NODE" >> $GITHUB_ENV
          echo "EFA_PER_NODE=$EFA_PER_NODE" >> $GITHUB_ENV
          echo "INSTANCE_TYPE=$INSTANCE_TYPE" >> $GITHUB_ENV
          echo "Env vars set successfully!"
      - name: Build container image
        working-directory: ${{ env.FSDP_DIR }}
        run: |
          echo "Building FSDP image"
          # Derive the region from the runner's availability zones and the
          # account ID from the caller identity
          AWS_REGION=$(aws ec2 describe-availability-zones --output text --query 'AvailabilityZones[0].[RegionName]')
          ACCOUNT=$(aws sts get-caller-identity --query Account --output text)
          REGISTRY=${ACCOUNT}.dkr.ecr.${AWS_REGION}.amazonaws.com/
          echo "REGISTRY=${REGISTRY}" >> $GITHUB_ENV
          sudo docker build -f Dockerfile -t ${REGISTRY}fsdp-regression:pytorch .
          echo "FSDP image built!"
      - name: Create ECR repository and push image
        run: |
          # Create the repository if it does not exist yet
          REGISTRY_COUNT=$(aws ecr describe-repositories | grep '"fsdp-regression"' | wc -l)
          if [ "${REGISTRY_COUNT}" == "0" ]; then
            aws ecr create-repository --repository-name fsdp-regression
          fi
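          # An equivalent idempotent check, assuming the caller is allowed to
          # describe repositories, would be:
          #   aws ecr describe-repositories --repository-names fsdp-regression >/dev/null 2>&1 \
          #     || aws ecr create-repository --repository-name fsdp-regression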
          # Log in to the registry (the ECR authorization token is valid for 12 hours)
          echo "Logging in to ${{ env.REGISTRY }}..."
          aws ecr get-login-password | sudo docker login --username AWS --password-stdin ${{ env.REGISTRY }}
          # Push the image to the registry
          sudo docker image push ${{ env.REGISTRY }}fsdp-regression:pytorch
          echo "Image pushed to ECR successfully!"
      - name: Run training on EKS
        id: run_test
        working-directory: ${{ env.FSDP_DIR }}
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          # Use the model-specific YAML file
          pushd kubernetes
          YAML_FILE="${{ matrix.model_config }}-fsdp.yaml"
          if [ ! -f "${YAML_FILE}" ]; then
            echo "Error: YAML file ${YAML_FILE} does not exist!"
            exit 1
          fi
          YAML_COPY="regression-${{ matrix.model_config }}-fsdp.yaml"
          # Create a copy for modification
          cp "${YAML_FILE}" "${YAML_COPY}"
          # Export variables for envsubst substitution
          export IMAGE_URI=${{ env.REGISTRY }}fsdp-regression:pytorch
          export INSTANCE_TYPE=${{ env.INSTANCE_TYPE }}
          export NUM_NODES=${{ env.NUM_NODES }}
          export GPU_PER_NODE=${{ env.GPU_PER_NODE }}
          export EFA_PER_NODE=${{ env.EFA_PER_NODE }}
          export FI_PROVIDER=efa
          export HF_TOKEN=${{ secrets.HF_TOKEN }}
          # Apply the K8s manifest
          echo "Applying model specific Kubernetes manifest..."
          envsubst < "${YAML_COPY}" | kubectl apply -f -
          popd
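          # envsubst rewrites ${VAR} placeholders in the manifest before apply,
          # e.g. (assuming the template uses these placeholders):
          #   image: ${IMAGE_URI}    ->  image: <account>.dkr.ecr.<region>.amazonaws.com/fsdp-regression:pytorch
          #   replicas: ${NUM_NODES} ->  replicas: 4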
          # Wait for training to start before monitoring
          echo "Waiting for pods to become Ready using kubectl wait..."
          # Derive the Kubernetes job name (resource names cannot contain underscores)
          JOB_NAME="${{ matrix.model_config }}-fsdp"
          ACTUAL_JOB_NAME=$(echo ${JOB_NAME} | tr '_' '-')
          echo "JOB_NAME=$JOB_NAME" >> $GITHUB_ENV
          echo "ACTUAL_JOB_NAME=$ACTUAL_JOB_NAME" >> $GITHUB_ENV
          echo "Job name: $JOB_NAME"
          echo "Actual job name: $ACTUAL_JOB_NAME"
          # `set -e` is active in run steps, so test the command directly instead
          # of checking $? afterwards (the $? branch would never be reached on failure)
          if ! kubectl wait --for=condition=Ready pod -l app=${ACTUAL_JOB_NAME} --timeout=10m; then
            echo "Timed out waiting for pods to become Ready"
            kubectl get pods -l app=${ACTUAL_JOB_NAME}
            exit 1
          fi
          echo "All pods are running and ready!"
          echo "PyTorch Job Name: $(kubectl get pytorchjob)"
          echo "Job Pods: $(kubectl get pods)"
      - name: Monitor training run
        id: monitor_job
        run: |
          # Monitor until completion or error
          echo "Monitoring job ${{ env.ACTUAL_JOB_NAME }}..."
          start_time=$(date +%s)
          timeout=21600 # 6 hours, matching the job-level timeout-minutes
          exit_code=0
          expected_pods=${{ env.NUM_NODES }}
          last_log_time=$(date +%s)
          echo "Finding master pod for logs..."
          WORKER_POD_0=$(kubectl get pods -l app=${{ env.ACTUAL_JOB_NAME }} -o name | head -1 | cut -d'/' -f2)
          master_pod=$(kubectl logs $WORKER_POD_0 | grep master_addr= | head -1 | awk -F'=' '{print $2}' | tr -d '[:space:]')
          if [ -z "$master_pod" ]; then
            echo "Could not determine master pod, using first pod for logs"
            master_pod=$WORKER_POD_0
          fi
          echo "Selected pod for logs: $master_pod"
          while true; do
            current_time=$(date +%s)
            elapsed_time=$((current_time - start_time))
            if [ $elapsed_time -gt $timeout ]; then
              echo "Monitoring timed out after 6 hours... Exiting"
              exit_code=1
              break
            fi
            # Check for error conditions in pods; `|| true` keeps a transient API
            # error from aborting the step, since `set -e` is active in run steps
            container_statuses=$(kubectl get pods -l app=${ACTUAL_JOB_NAME} -o jsonpath='{.items[*].status.containerStatuses[*].state}' 2>/dev/null || true)
            pod_phases=$(kubectl get pods -l app=${ACTUAL_JOB_NAME} -o jsonpath='{.items[*].status.phase}' 2>/dev/null || true)
            if echo "$container_statuses" | grep -q "CrashLoopBackOff\|ImagePullBackOff\|ErrImagePull"; then
              echo "Error detected in container status"
              exit_code=1
              break
            fi
            # Check for error states in pod phases
            if echo "$pod_phases" | grep -q "Failed\|Unknown\|Evicted"; then
              echo "Error detected in pod phase"
              exit_code=1
              break
            fi
            # Check the PyTorchJob status; conditions are appended over time,
            # so inspect the most recent (last) entry rather than the first
            job_status=$(kubectl get pytorchjob ${{ env.ACTUAL_JOB_NAME }} -o jsonpath='{.status.conditions[-1:].type}' 2>/dev/null || true)
            if [ "$job_status" == "Succeeded" ]; then
              echo "Job completed successfully!"
              exit_code=0
              break
            elif [ "$job_status" == "Failed" ]; then
              echo "Job failed"
              exit_code=1
              break
            fi
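            # A Kubeflow PyTorchJob accumulates conditions over its lifetime,
            # typically Created -> Running -> Succeeded/Failed, e.g. (abridged):
            #   status:
            #     conditions:
            #     - {type: Created,   status: "True", ...}
            #     - {type: Running,   status: "True", ...}
            #     - {type: Succeeded, status: "True", ...}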
            # Check whether all pods are in the Succeeded state
            succeeded_pods=$(kubectl get pods -l app=${ACTUAL_JOB_NAME} -o jsonpath='{.items[?(@.status.phase=="Succeeded")].metadata.name}' | wc -w)
            total_pods=$(kubectl get pods -l app=${ACTUAL_JOB_NAME} | grep -v NAME | wc -l)
            running_pods=$(kubectl get pods -l app=${ACTUAL_JOB_NAME} -o jsonpath='{.items[?(@.status.phase=="Running")].metadata.name}' | wc -w)
            # If all pods have succeeded, consider the job complete and end the loop
            if [ $total_pods -eq $expected_pods ] && [ $succeeded_pods -eq $total_pods ]; then
              echo "All pods ($succeeded_pods/$total_pods) are in Succeeded state. Job completed successfully!"
              exit_code=0
              break
            fi
            # Check whether all expected pods are running
            if [ $total_pods -eq $expected_pods ] && [ $running_pods -eq $total_pods ]; then
              echo "All expected pods (${running_pods}/${expected_pods}) are running"
              echo "=== Recent logs from pod $master_pod ==="
              kubectl logs $master_pod --tail=20
              echo "=== End of recent logs ==="
              last_log_time=$current_time
            elif [ $total_pods -lt $expected_pods ]; then
              echo "Warning: Only $total_pods pods found, expected $expected_pods"
            elif [ $running_pods -lt $total_pods ]; then
              echo "Warning: Only $running_pods/$total_pods pods are running"
              # List pods that are not running
              kubectl get pods -l app=${ACTUAL_JOB_NAME} -o jsonpath='{range .items[?(@.status.phase!="Running")]}{.metadata.name}{" is "}{.status.phase}{"\n"}{end}'
            fi
            echo "Job status: $running_pods/$total_pods pods running (elapsed: $elapsed_time seconds)"
            sleep 60
          done
echo "exit_code=$exit_code" >> $GITHUB_OUTPUT
if [ $exit_code -ne 0 ]; then
echo "FSDP training on EKS failed with exit code: $exit_code"
exit $exit_code
fi
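      # The exit_code output is available to subsequent steps; a hypothetical
      # notification step could gate on it, e.g.:
      #   if: always() && steps.monitor_job.outputs.exit_code != '0'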
      - name: Collect pod logs
        if: always()
        run: |
          echo "Collecting pod logs from training run"
          mkdir -p pod-logs
          echo "Finding master pod..."
          # Search the first worker pod's logs for master_addr
          WORKER_POD_0=$(kubectl get pods -l app=${{ env.ACTUAL_JOB_NAME }} -o name | head -1 | cut -d'/' -f2)
          MASTER_POD=$(kubectl logs $WORKER_POD_0 | grep master_addr= | head -1 | awk -F'=' '{print $2}' | tr -d '[:space:]')
          if [ -z "$MASTER_POD" ]; then
            echo "Could not determine master pod, collecting logs from all pods"
            for pod in $(kubectl get pods -l app=${{ env.ACTUAL_JOB_NAME }} -o name); do
              pod_name=$(echo $pod | cut -d'/' -f2)
              kubectl logs $pod_name > pod-logs/$pod_name.log
            done
          else
            echo "Master pod is: $MASTER_POD"
            kubectl logs $MASTER_POD > pod-logs/master-$MASTER_POD.log
            echo "Master pod logs collected"
          fi
      - name: Upload logs as artifacts
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: regression-logs-${{ matrix.model_config }}-${{ matrix.cluster }}-${{ github.run_id }}
          path: pod-logs
          retention-days: 7
      - name: Cleanup
        if: always()
        run: |
          echo "Cleaning up..."
          kubectl delete pytorchjob ${{ env.ACTUAL_JOB_NAME }} || true
          rm -rf pod-logs
          echo "Cleaned up successfully!"