Update comment for P5 FI_* in fsdp.yaml-template #22

name: FSDP Regression Test (EKS)
on:
  pull_request:
    paths:
      - '3.test_cases/pytorch/FSDP/**'
  workflow_dispatch:
jobs:
  regression:
    strategy:
      fail-fast: true
      max-parallel: 1
      matrix:
        cluster: [p5-eks]
        model_config: [llama2_7b, llama2_13b, llama2_70b, llama3_1_8b, llama3_1_70b]
    runs-on: [self-hosted, "${{ matrix.cluster }}"]
    timeout-minutes: 360 # 6 hours, enough for the largest models in the matrix
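    # The matrix currently expands to 5 serial jobs (max-parallel: 1) on p5-eks.
    # The p5en/p6 branches in "Set env vars" below are only exercised if those
    # runner labels are added here, e.g. (assuming such runners exist):
    #   cluster: [p5-eks, p5en-eks, p6-eks]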
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          path: ${{ github.run_id }}
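      # Checking out into a run-scoped subdirectory keeps workspaces from
      # colliding when multiple runs land on the same self-hosted runner.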
      - name: Set env vars
        run: |
          BUILD_ID="${{ github.run_id }}"
          FSDP_DIR="$(pwd)/${BUILD_ID}/3.test_cases/pytorch/FSDP"
          # Set instance-specific variables
          if [[ "${{ matrix.cluster }}" == "p5-eks" ]]; then
            EFA_PER_NODE=32
            INSTANCE_TYPE="p5.48xlarge"
          elif [[ "${{ matrix.cluster }}" == "p5en-eks" ]]; then
            EFA_PER_NODE=16
            INSTANCE_TYPE="p5en.48xlarge"
          elif [[ "${{ matrix.cluster }}" == "p6-eks" ]]; then
            EFA_PER_NODE=8
            INSTANCE_TYPE="p6-b200.48xlarge"
          else
            EFA_PER_NODE=32             # defaulting to p5 values for now
            INSTANCE_TYPE="p5.48xlarge" # defaulting to p5 values for now
          fi
          # Hardcoded to 4 nodes x 8 GPUs for the p* instance families
          NUM_NODES=4
          GPU_PER_NODE=8
          echo "BUILD_ID=$BUILD_ID" >> $GITHUB_ENV
          echo "FSDP_DIR=$FSDP_DIR" >> $GITHUB_ENV
          echo "NUM_NODES=$NUM_NODES" >> $GITHUB_ENV
          echo "GPU_PER_NODE=$GPU_PER_NODE" >> $GITHUB_ENV
          echo "EFA_PER_NODE=$EFA_PER_NODE" >> $GITHUB_ENV
          echo "INSTANCE_TYPE=$INSTANCE_TYPE" >> $GITHUB_ENV
          echo "Env vars set successfully!"
      - name: Build container image
        working-directory: ${{ env.FSDP_DIR }}
        run: |
          echo "Building FSDP image"
          # Derive the region from the runner's availability zones and the
          # account ID from the caller identity
          AWS_REGION=$(aws ec2 describe-availability-zones --output text --query 'AvailabilityZones[0].[RegionName]')
          ACCOUNT=$(aws sts get-caller-identity --query Account --output text)
          REGISTRY=${ACCOUNT}.dkr.ecr.${AWS_REGION}.amazonaws.com/
          echo "REGISTRY=${REGISTRY}" >> $GITHUB_ENV
          sudo docker build -f Dockerfile -t ${REGISTRY}fsdp-regression:pytorch .
          echo "FSDP image built!"
      - name: Create ECR repository and push image
        run: |
          # Create the repository if it does not exist yet
          REGISTRY_COUNT=$(aws ecr describe-repositories | grep '"fsdp-regression"' | wc -l)
          if [ "${REGISTRY_COUNT}" == "0" ]; then
            aws ecr create-repository --repository-name fsdp-regression
          fi
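          # An equivalent idempotent check, assuming the caller is allowed to
          # describe repositories, would be:
          #   aws ecr describe-repositories --repository-names fsdp-regression >/dev/null 2>&1 \
          #     || aws ecr create-repository --repository-name fsdp-regression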
          # Log in to the registry (the ECR authorization token is valid for 12 hours)
          echo "Logging in to ${{ env.REGISTRY }}..."
          aws ecr get-login-password | sudo docker login --username AWS --password-stdin ${{ env.REGISTRY }}
          # Push the image to the registry
          sudo docker image push ${{ env.REGISTRY }}fsdp-regression:pytorch
          echo "Image pushed to ECR successfully!"
      - name: Run training on EKS
        id: run_test
        working-directory: ${{ env.FSDP_DIR }}
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          # Use the model-specific YAML file
          pushd kubernetes
          YAML_FILE="${{ matrix.model_config }}-fsdp.yaml"
          if [ ! -f "${YAML_FILE}" ]; then
            echo "Error: YAML file ${YAML_FILE} does not exist!"
            exit 1
          fi
          YAML_COPY="regression-${{ matrix.model_config }}-fsdp.yaml"
          # Create a copy for modification
          cp "${YAML_FILE}" "${YAML_COPY}"
          # Export variables for envsubst substitution
          export IMAGE_URI=${{ env.REGISTRY }}fsdp-regression:pytorch
          export INSTANCE_TYPE=${{ env.INSTANCE_TYPE }}
          export NUM_NODES=${{ env.NUM_NODES }}
          export GPU_PER_NODE=${{ env.GPU_PER_NODE }}
          export EFA_PER_NODE=${{ env.EFA_PER_NODE }}
          export FI_PROVIDER=efa
          export HF_TOKEN=${{ secrets.HF_TOKEN }}
          # Apply the K8s manifest
          echo "Applying model specific Kubernetes manifest..."
          envsubst < "${YAML_COPY}" | kubectl apply -f -
          popd
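          # envsubst rewrites ${VAR} placeholders in the manifest before apply,
          # e.g. (assuming the template uses these placeholders):
          #   image: ${IMAGE_URI}    ->  image: <account>.dkr.ecr.<region>.amazonaws.com/fsdp-regression:pytorch
          #   replicas: ${NUM_NODES} ->  replicas: 4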
          # Wait for training to start before monitoring
          echo "Waiting for pods to become Ready using kubectl wait..."
          # Derive the Kubernetes job name (resource names cannot contain underscores)
          JOB_NAME="${{ matrix.model_config }}-fsdp"
          ACTUAL_JOB_NAME=$(echo ${JOB_NAME} | tr '_' '-')
          echo "JOB_NAME=$JOB_NAME" >> $GITHUB_ENV
          echo "ACTUAL_JOB_NAME=$ACTUAL_JOB_NAME" >> $GITHUB_ENV
          echo "Job name: $JOB_NAME"
          echo "Actual job name: $ACTUAL_JOB_NAME"
          # `set -e` is active in run steps, so test the command directly instead
          # of checking $? afterwards (the $? branch would never be reached on failure)
          if ! kubectl wait --for=condition=Ready pod -l app=${ACTUAL_JOB_NAME} --timeout=10m; then
            echo "Timed out waiting for pods to become Ready"
            kubectl get pods -l app=${ACTUAL_JOB_NAME}
            exit 1
          fi
          echo "All pods are running and ready!"
          echo "PyTorch Job Name: $(kubectl get pytorchjob)"
          echo "Job Pods: $(kubectl get pods)"
      - name: Monitor training run
        id: monitor_job
        run: |
          # Monitor until completion or error
          echo "Monitoring job ${{ env.ACTUAL_JOB_NAME }}..."
          start_time=$(date +%s)
          timeout=21600 # 6 hours, matching the job-level timeout-minutes
          exit_code=0
          expected_pods=${{ env.NUM_NODES }}
          last_log_time=$(date +%s)
          echo "Finding master pod for logs..."
          WORKER_POD_0=$(kubectl get pods -l app=${{ env.ACTUAL_JOB_NAME }} -o name | head -1 | cut -d'/' -f2)
          master_pod=$(kubectl logs $WORKER_POD_0 | grep master_addr= | head -1 | awk -F'=' '{print $2}' | tr -d '[:space:]')
          if [ -z "$master_pod" ]; then
            echo "Could not determine master pod, using first pod for logs"
            master_pod=$WORKER_POD_0
          fi
          echo "Selected pod for logs: $master_pod"
          while true; do
            current_time=$(date +%s)
            elapsed_time=$((current_time - start_time))
            if [ $elapsed_time -gt $timeout ]; then
              echo "Monitoring timed out after 6 hours... Exiting"
              exit_code=1
              break
            fi
            # Check for error conditions in pods; `|| true` keeps a transient API
            # error from aborting the step, since `set -e` is active in run steps
            container_statuses=$(kubectl get pods -l app=${ACTUAL_JOB_NAME} -o jsonpath='{.items[*].status.containerStatuses[*].state}' 2>/dev/null || true)
            pod_phases=$(kubectl get pods -l app=${ACTUAL_JOB_NAME} -o jsonpath='{.items[*].status.phase}' 2>/dev/null || true)
            if echo "$container_statuses" | grep -q "CrashLoopBackOff\|ImagePullBackOff\|ErrImagePull"; then
              echo "Error detected in container status"
              exit_code=1
              break
            fi
            # Check for error states in pod phases
            if echo "$pod_phases" | grep -q "Failed\|Unknown\|Evicted"; then
              echo "Error detected in pod phase"
              exit_code=1
              break
            fi
            # Check the PyTorchJob status; conditions are appended over time,
            # so inspect the most recent (last) entry rather than the first
            job_status=$(kubectl get pytorchjob ${{ env.ACTUAL_JOB_NAME }} -o jsonpath='{.status.conditions[-1:].type}' 2>/dev/null || true)
            if [ "$job_status" == "Succeeded" ]; then
              echo "Job completed successfully!"
              exit_code=0
              break
            elif [ "$job_status" == "Failed" ]; then
              echo "Job failed"
              exit_code=1
              break
            fi
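            # A Kubeflow PyTorchJob accumulates conditions over its lifetime,
            # typically Created -> Running -> Succeeded/Failed, e.g. (abridged):
            #   status:
            #     conditions:
            #     - {type: Created,   status: "True", ...}
            #     - {type: Running,   status: "True", ...}
            #     - {type: Succeeded, status: "True", ...}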
            # Check whether all pods are in the Succeeded state
            succeeded_pods=$(kubectl get pods -l app=${ACTUAL_JOB_NAME} -o jsonpath='{.items[?(@.status.phase=="Succeeded")].metadata.name}' | wc -w)
            total_pods=$(kubectl get pods -l app=${ACTUAL_JOB_NAME} | grep -v NAME | wc -l)
            running_pods=$(kubectl get pods -l app=${ACTUAL_JOB_NAME} -o jsonpath='{.items[?(@.status.phase=="Running")].metadata.name}' | wc -w)
            # If all pods have succeeded, consider the job complete and end the loop
            if [ $total_pods -eq $expected_pods ] && [ $succeeded_pods -eq $total_pods ]; then
              echo "All pods ($succeeded_pods/$total_pods) are in Succeeded state. Job completed successfully!"
              exit_code=0
              break
            fi
            # Check whether all expected pods are running
            if [ $total_pods -eq $expected_pods ] && [ $running_pods -eq $total_pods ]; then
              echo "All expected pods (${running_pods}/${expected_pods}) are running"
              echo "=== Recent logs from pod $master_pod ==="
              kubectl logs $master_pod --tail=20
              echo "=== End of recent logs ==="
              last_log_time=$current_time
            elif [ $total_pods -lt $expected_pods ]; then
              echo "Warning: Only $total_pods pods found, expected $expected_pods"
            elif [ $running_pods -lt $total_pods ]; then
              echo "Warning: Only $running_pods/$total_pods pods are running"
              # List pods that are not running
              kubectl get pods -l app=${ACTUAL_JOB_NAME} -o jsonpath='{range .items[?(@.status.phase!="Running")]}{.metadata.name}{" is "}{.status.phase}{"\n"}{end}'
            fi
            echo "Job status: $running_pods/$total_pods pods running (elapsed: $elapsed_time seconds)"
            sleep 60
          done
echo "exit_code=$exit_code" >> $GITHUB_OUTPUT
if [ $exit_code -ne 0 ]; then
echo "FSDP training on EKS failed with exit code: $exit_code"
exit $exit_code
fi
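      # The exit_code output is available to subsequent steps; a hypothetical
      # notification step could gate on it, e.g.:
      #   if: always() && steps.monitor_job.outputs.exit_code != '0'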
      - name: Collect pod logs
        if: always()
        run: |
          echo "Collecting pod logs from training run"
          mkdir -p pod-logs
          echo "Finding master pod..."
          # Search the first worker pod's logs for master_addr
          WORKER_POD_0=$(kubectl get pods -l app=${{ env.ACTUAL_JOB_NAME }} -o name | head -1 | cut -d'/' -f2)
          MASTER_POD=$(kubectl logs $WORKER_POD_0 | grep master_addr= | head -1 | awk -F'=' '{print $2}' | tr -d '[:space:]')
          if [ -z "$MASTER_POD" ]; then
            echo "Could not determine master pod, collecting logs from all pods"
            for pod in $(kubectl get pods -l app=${{ env.ACTUAL_JOB_NAME }} -o name); do
              pod_name=$(echo $pod | cut -d'/' -f2)
              kubectl logs $pod_name > pod-logs/$pod_name.log
            done
          else
            echo "Master pod is: $MASTER_POD"
            kubectl logs $MASTER_POD > pod-logs/master-$MASTER_POD.log
            echo "Master pod logs collected"
          fi
      - name: Upload logs as artifacts
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: regression-logs-${{ matrix.model_config }}-${{ matrix.cluster }}-${{ github.run_id }}
          path: pod-logs
          retention-days: 7
      - name: Cleanup
        if: always()
        run: |
          echo "Cleaning up..."
          kubectl delete pytorchjob ${{ env.ACTUAL_JOB_NAME }} || true
          rm -rf pod-logs
          echo "Cleaned up successfully!"