# Update comment for P5 FI_* in fsdp.yaml-template (#22)
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters.
# Regression workflow for the PyTorch FSDP test case on EKS.
#
# For every entry of the model_config matrix the job:
#   1. checks the repository out into a per-run directory,
#   2. builds the FSDP container image and pushes it to ECR,
#   3. renders the model-specific PyTorchJob manifest with envsubst and applies it,
#   4. monitors the pods / PyTorchJob until success, failure or timeout,
#   5. always collects pod logs, uploads them as an artifact and deletes the job.
name: FSDP Regression Test (EKS)

on:
  pull_request:
    paths:
      - '3.test_cases/pytorch/FSDP/**'
  workflow_dispatch:

jobs:
  regression:
    strategy:
      # One configuration at a time; stop the matrix at the first failure.
      fail-fast: true
      max-parallel: 1
      matrix:
        cluster: [p5-eks]
        model_config:
          - llama2_7b
          - llama2_13b
          - llama2_70b
          - llama3_1_8b
          - llama3_1_70b
    runs-on: [self-hosted, "${{ matrix.cluster }}"]
    timeout-minutes: 360  # 6 hours for the full Llama 2 test
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          # Per-run checkout directory; FSDP_DIR below is derived from it.
          path: ${{ github.run_id }}

      - name: Set env vars
        run: |
          BUILD_ID="${{ github.run_id }}"
          FSDP_DIR="$(pwd)/${BUILD_ID}/3.test_cases/pytorch/FSDP"

          # Set instance specific variables
          case "${{ matrix.cluster }}" in
            p5en-eks)
              EFA_PER_NODE=16
              INSTANCE_TYPE="p5en.48xlarge"
              ;;
            p6-eks)
              EFA_PER_NODE=8
              INSTANCE_TYPE="p6-b200.48xlarge"
              ;;
            *)
              # p5-eks, and the default for any other cluster for now
              EFA_PER_NODE=32
              INSTANCE_TYPE="p5.48xlarge"
              ;;
          esac

          # Hardcoding these as 4 and 8 for p*
          NUM_NODES=4
          GPU_PER_NODE=8

          # Publish the values to every later step of this job.
          {
            echo "BUILD_ID=$BUILD_ID"
            echo "FSDP_DIR=$FSDP_DIR"
            echo "NUM_NODES=$NUM_NODES"
            echo "GPU_PER_NODE=$GPU_PER_NODE"
            echo "EFA_PER_NODE=$EFA_PER_NODE"
            echo "INSTANCE_TYPE=$INSTANCE_TYPE"
          } >> "$GITHUB_ENV"
          echo "Env vars set successfully!"

      - name: Build container image
        working-directory: ${{ env.FSDP_DIR }}
        run: |
          echo "Building FSDP image"
          AWS_REGION=$(aws ec2 describe-availability-zones --output text --query 'AvailabilityZones[0].[RegionName]')
          ACCOUNT=$(aws sts get-caller-identity --query Account --output text)
          # REGISTRY keeps its trailing slash: later steps append the image name directly.
          REGISTRY="${ACCOUNT}.dkr.ecr.${AWS_REGION}.amazonaws.com/"
          echo "REGISTRY=${REGISTRY}" >> "$GITHUB_ENV"
          sudo docker build -f Dockerfile -t "${REGISTRY}fsdp-regression:pytorch" .
          echo "FSDP image built!"

      - name: Create ECR repository and push image
        run: |
          # Create the repository if needed. Querying it by name relies on the
          # CLI exit status instead of grepping output whose format depends on
          # the runner's AWS CLI configuration.
          if ! aws ecr describe-repositories --repository-names fsdp-regression > /dev/null 2>&1; then
            aws ecr create-repository --repository-name fsdp-regression
          fi

          # Login to registry
          echo "Logging in to ${REGISTRY}..."
          aws ecr get-login-password | sudo docker login --username AWS --password-stdin "${REGISTRY}"

          # Push image to registry
          sudo docker image push "${REGISTRY}fsdp-regression:pytorch"
          echo "Image pushed to ECR successfully!"

      - name: Run training on EKS
        id: run_test
        working-directory: ${{ env.FSDP_DIR }}
        env:
          # Exported to the script (and therefore to envsubst) by the runner;
          # the secret is deliberately not interpolated into the script text.
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          # Use model specific YAML file
          pushd kubernetes
          YAML_FILE="${{ matrix.model_config }}-fsdp.yaml"
          if [ ! -f "${YAML_FILE}" ]; then
            echo "Error: YAML file ${YAML_FILE} does not exist!"
            exit 1
          fi

          # Create copy for modification
          YAML_COPY="regression-${{ matrix.model_config }}-fsdp.yaml"
          cp "${YAML_FILE}" "${YAML_COPY}"

          # Set env_vars for substitution
          export IMAGE_URI="${{ env.REGISTRY }}fsdp-regression:pytorch"
          export INSTANCE_TYPE="${{ env.INSTANCE_TYPE }}"
          export NUM_NODES="${{ env.NUM_NODES }}"
          export GPU_PER_NODE="${{ env.GPU_PER_NODE }}"
          export EFA_PER_NODE="${{ env.EFA_PER_NODE }}"
          export FI_PROVIDER=efa

          # Apply K8s manifest
          echo "Applying model specific Kubernetes manifest..."
          envsubst < "${YAML_COPY}" | kubectl apply -f -
          popd

          # Fix job name (replace underscore with hyphen)
          JOB_NAME="${{ matrix.model_config }}-fsdp"
          ACTUAL_JOB_NAME=$(echo "${JOB_NAME}" | tr '_' '-')
          echo "JOB_NAME=$JOB_NAME" >> "$GITHUB_ENV"
          echo "ACTUAL_JOB_NAME=$ACTUAL_JOB_NAME" >> "$GITHUB_ENV"
          echo "Job name: $JOB_NAME"
          echo "Actual job name: $ACTUAL_JOB_NAME"

          # `kubectl wait` errors out immediately when no resource matches the
          # selector yet, so give the operator up to ~2 minutes to create pods.
          for _ in $(seq 1 12); do
            if [ -n "$(kubectl get pods -l app="${ACTUAL_JOB_NAME}" -o name 2>/dev/null || true)" ]; then
              break
            fi
            sleep 10
          done

          # Wait for training to start before monitoring.
          # The wait is the `if` condition itself: run scripts execute with
          # `bash -e`, so a separate `$?` check after a failing command would
          # never be reached.
          echo "Waiting for pods to become Ready using kubectl wait..."
          if ! kubectl wait --for=condition=Ready pod -l app="${ACTUAL_JOB_NAME}" --timeout=10m; then
            echo "Timed out waiting for pods to become Ready"
            kubectl get pods -l app="${ACTUAL_JOB_NAME}" || true
            exit 1
          fi
          echo "All pods are running and ready!"
          echo "PyTorch Job Name: $(kubectl get pytorchjob)"
          echo "Job Pods: $(kubectl get pods)"

      - name: Monitor training run
        id: monitor_job
        run: |
          # Monitor until completion or err.
          # ACTUAL_JOB_NAME and NUM_NODES come from $GITHUB_ENV (set by earlier steps).
          echo "Monitoring job ${ACTUAL_JOB_NAME}..."
          start_time=$(date +%s)
          timeout=21600
          exit_code=0
          expected_pods="${NUM_NODES}"

          echo "Finding master pod for logs..."
          WORKER_POD_0=$(kubectl get pods -l app="${ACTUAL_JOB_NAME}" -o name | head -1 | cut -d'/' -f2)
          # The master is taken from the first `master_addr=` line in the pod's log.
          master_pod=$(kubectl logs "$WORKER_POD_0" | grep master_addr= | head -1 | awk -F'=' '{print $2}' | tr -d '[:space:]')
          if [ -z "$master_pod" ]; then
            echo "Could not determine master pod, using first pod for logs"
            master_pod=$WORKER_POD_0
          fi
          echo "Selected pod for logs: $master_pod"

          while true; do
            current_time=$(date +%s)
            elapsed_time=$((current_time - start_time))
            if [ "$elapsed_time" -gt "$timeout" ]; then
              echo "Monitoring timed out after 6 hours... Exiting"
              exit_code=1
              break
            fi

            # Status queries are tolerated with `|| true`: under `bash -e` a
            # transient API error in a plain assignment would abort the monitor.
            # The timeout above still bounds the loop.
            container_statuses=$(kubectl get pods -l app="${ACTUAL_JOB_NAME}" -o jsonpath='{.items[*].status.containerStatuses[*].state}' 2>/dev/null || true)
            pod_phases=$(kubectl get pods -l app="${ACTUAL_JOB_NAME}" -o jsonpath='{.items[*].status.phase}' 2>/dev/null || true)

            # Check for error conditions in pods
            if echo "$container_statuses" | grep -q "CrashLoopBackOff\|ImagePullBackOff\|ErrImagePull"; then
              echo "Error detected in container status"
              exit_code=1
              break
            fi

            # Check for error states in pod phases
            if echo "$pod_phases" | grep -q "Failed\|Unknown\|Evicted"; then
              echo "Error detected in pod phase"
              exit_code=1
              break
            fi

            # Check PyTorchJob status.
            # Select the condition by type rather than by index: conditions are
            # listed in the order they were added, so index 0 is typically the
            # initial "Created" condition and not the terminal one.
            job_succeeded=$(kubectl get pytorchjob "${ACTUAL_JOB_NAME}" -o jsonpath='{.status.conditions[?(@.type=="Succeeded")].status}' 2>/dev/null || true)
            job_failed=$(kubectl get pytorchjob "${ACTUAL_JOB_NAME}" -o jsonpath='{.status.conditions[?(@.type=="Failed")].status}' 2>/dev/null || true)
            if [ "$job_succeeded" == "True" ]; then
              echo "Job completed successfully!"
              exit_code=0
              break
            elif [ "$job_failed" == "True" ]; then
              echo "Job failed"
              exit_code=1
              break
            fi

            # Count pods by phase
            succeeded_pods=$(kubectl get pods -l app="${ACTUAL_JOB_NAME}" -o jsonpath='{.items[?(@.status.phase=="Succeeded")].metadata.name}' | wc -w)
            total_pods=$(kubectl get pods -l app="${ACTUAL_JOB_NAME}" -o name | wc -l)
            running_pods=$(kubectl get pods -l app="${ACTUAL_JOB_NAME}" -o jsonpath='{.items[?(@.status.phase=="Running")].metadata.name}' | wc -w)

            # If all pods have succeeded, then consider the job complete and end loop.
            if [ "$total_pods" -eq "$expected_pods" ] && [ "$succeeded_pods" -eq "$total_pods" ]; then
              echo "All pods ($succeeded_pods/$total_pods) are in Succeeded state. Job completed successfully!"
              exit_code=0
              break
            fi

            # Check if all expected pods are running
            if [ "$total_pods" -eq "$expected_pods" ] && [ "$running_pods" -eq "$total_pods" ]; then
              echo "All expected pods (${running_pods}/${expected_pods}) are running"
              echo "=== Recent logs from pod $master_pod ==="
              # Log tailing is informational only; never fail the monitor on it.
              kubectl logs "$master_pod" --tail=20 || true
              echo "=== End of recent logs ==="
            elif [ "$total_pods" -lt "$expected_pods" ]; then
              echo "Warning: Only $total_pods pods found, expected $expected_pods"
            elif [ "$running_pods" -lt "$total_pods" ]; then
              echo "Warning: Only $running_pods/$total_pods pods are running"
              # List pods that aren't running
              kubectl get pods -l app="${ACTUAL_JOB_NAME}" -o jsonpath='{range .items[?(@.status.phase!="Running")]}{.metadata.name}{" is "}{.status.phase}{"\n"}{end}' || true
            fi

            echo "Job status: $running_pods/$total_pods pods running (elapsed: $elapsed_time seconds)"
            sleep 60
          done

          echo "exit_code=$exit_code" >> "$GITHUB_OUTPUT"
          if [ "$exit_code" -ne 0 ]; then
            echo "FSDP training on EKS failed with exit code: $exit_code"
            exit "$exit_code"
          fi

      - name: Collect pod logs
        if: always()
        run: |
          echo "Collecting pod logs from training run"
          mkdir -p pod-logs
          echo "Finding master pod..."
          # Find the first worker pod to search for master_addr
          WORKER_POD_0=$(kubectl get pods -l app="${ACTUAL_JOB_NAME}" -o name | head -1 | cut -d'/' -f2)
          MASTER_POD=$(kubectl logs "$WORKER_POD_0" | grep master_addr= | head -1 | awk -F'=' '{print $2}' | tr -d '[:space:]')
          if [ -z "$MASTER_POD" ]; then
            echo "Could not determine master pod, collecting logs from all pods"
            for pod in $(kubectl get pods -l app="${ACTUAL_JOB_NAME}" -o name); do
              pod_name=$(echo "$pod" | cut -d'/' -f2)
              # Best effort: one unreadable pod must not stop collection of the rest.
              kubectl logs "$pod_name" > "pod-logs/$pod_name.log" || true
            done
          else
            echo "Master pod is: $MASTER_POD"
            kubectl logs "$MASTER_POD" > "pod-logs/master-$MASTER_POD.log" || true
            echo "Master pod logs collected"
          fi

      - name: Upload logs as artifacts
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: regression-logs-${{ matrix.model_config }}-${{ matrix.cluster }}-${{ github.run_id }}
          path: pod-logs
          retention-days: 7

      - name: Cleanup
        if: always()
        run: |
          echo "Cleaning up..."
          # ACTUAL_JOB_NAME is unset when the job failed before the manifest
          # was applied; there is nothing to delete in that case.
          if [ -n "${ACTUAL_JOB_NAME:-}" ]; then
            kubectl delete pytorchjob "${ACTUAL_JOB_NAME}" || true
          fi
          rm -rf pod-logs
          echo "Cleaned up successfully!"