
Commit f30e7b3

Author: Gang Fu
Commit message: add all files for Hyperpod lab for NxD llama3 model training
Parent: 97061c8

File tree: 7 files changed (+394, -2 lines)

labs/Hyperpod/.DS_Store

-6 KB
Binary file not shown.

labs/Hyperpod/Dockerfile

Lines changed: 13 additions & 0 deletions
FROM 763104351884.dkr.ecr.us-east-2.amazonaws.com/pytorch-training-neuronx:2.7.0-neuronx-py310-sdk2.24.1-ubuntu22.04

RUN git clone https://github.com/aws-neuron/neuronx-distributed.git
COPY ./src /workspace
RUN cp -r neuronx-distributed/examples/training/llama/* workspace/
RUN cp -r neuronx-distributed/examples/training/llama/tp_zero1_llama_hf_pretrain/* workspace/
RUN cp -r neuronx-distributed/examples/training/llama/tp_zero1_llama_hf_pretrain/8B_config_llama3.1 workspace/config_8b_llama3.1
RUN cp -r neuronx-distributed/examples/training/llama/tp_zero1_llama_hf_pretrain/8B_config_llama3 workspace/config_8b_llama3
RUN mv workspace/tp_zero1_llama_hf_pretrain.py workspace/train.py

WORKDIR /workspace

RUN pip install -r requirements.txt

labs/Hyperpod/README.md

Lines changed: 2 additions & 2 deletions
@@ -19,8 +19,8 @@ docker pull 763104351884.dkr.ecr.us-east-2.amazonaws.com/pytorch-training-neuron

Clone the repo and go to the folder:
```bash
cd ~
-git clone https://github.com/aws-samples/awsome-distributed-training/
-cd awsome-distributed-training/3.test_cases/pytorch/neuronx-distributed/llama3/kubernetes
+git clone https://github.com/aws-neuron/neuron-workshops
+cd neuron-workshops/labs/Hyperpod
```

We will build the Docker image using the Dockerfile in this directory.
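
The Dockerfile above pulls the NeuronX Distributed Llama examples into /workspace and renames the tp_zero1 pretraining script to train.py. A minimal sketch of the build-and-push flow, assuming the llama3_trn image name and the account/region discovery that generate-jobspec.sh uses (adjust to your own ECR registry):

```bash
# Log in to the Neuron DLC registry that hosts the base image, and to your own ECR registry.
export AWS_REGION=$(aws ec2 describe-availability-zones --output text --query 'AvailabilityZones[0].[RegionName]')
export ACCOUNT=$(aws sts get-caller-identity --query Account --output text)
export IMAGE_URI=${ACCOUNT}.dkr.ecr.${AWS_REGION}.amazonaws.com/llama3_trn:latest

aws ecr get-login-password --region us-east-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-east-2.amazonaws.com
aws ecr get-login-password --region ${AWS_REGION} | docker login --username AWS --password-stdin ${ACCOUNT}.dkr.ecr.${AWS_REGION}.amazonaws.com

# Create the repository if it does not exist yet, then build and push.
aws ecr create-repository --repository-name llama3_trn --region ${AWS_REGION} 2>/dev/null || true
docker build -t ${IMAGE_URI} .
docker push ${IMAGE_URI}
```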

labs/Hyperpod/generate-jobspec.sh

Lines changed: 43 additions & 0 deletions
#!/bin/bash

export AWS_REGION=$(aws ec2 describe-availability-zones --output text --query 'AvailabilityZones[0].[RegionName]')
export ACCOUNT=$(aws sts get-caller-identity --query Account --output text)
export REGISTRY=${ACCOUNT}.dkr.ecr.${AWS_REGION}.amazonaws.com/
export IMAGE=llama3_trn
export TAG=:latest
export IMAGE_URI=${REGISTRY}${IMAGE}${TAG}

export JOB_NAME=trn1-llama3-training
export NUM_NODES=1
export INSTANCE_TYPE=ml.trn1.32xlarge
export EFA_PER_NODE=8
export NEURON_PER_NODE=16
export FI_PROVIDER=efa

export FSX_CLAIM=fsx-claim # Change this according to the PVC created.

# Tokenize_data configs
export HF_ACCESS_TOKEN=hf_xxxxxx
export TOKENIZED_DATA_PATH=/fsx/tokenized_data
export DATASET_NAME=wikicorpus
export DATASET_CONFIG_NAME=raw_en
export HF_MODEL_NAME=meta-llama/Meta-Llama-3-8B # change this to meta-llama/Meta-Llama-3.1-8B (and MODEL_PATH to config_8b_llama3.1) to train the Llama 3.1 8B model

export NEURON_CACHE_DIR=/fsx/neuron_cache
export CHECKPOINT_DIR=/fsx/checkpoints
export NUM_KEPT_CHECKPOINTS=2
export CHECKPOINT_FREQ=100
export MAX_STEPS=1000
export STEPS_THIS_RUN=100
export BATCH_SIZE=1

export MODEL_PATH=config_8b_llama3

cat tokenize_data.yaml-template | envsubst > tokenize_data.yaml
cat llama3_train.yaml-template | envsubst > llama3_train.yaml
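
This script only renders Kubernetes manifests: envsubst replaces the exported ${VAR} placeholders in the two *-template files. A minimal usage sketch (edit the exports above first, in particular HF_ACCESS_TOKEN, FSX_CLAIM, and the model/config choice):

```bash
# Render the manifests, then inspect the substituted values before submitting anything.
chmod +x generate-jobspec.sh
./generate-jobspec.sh
grep -E 'image:|claimName:|instance-type' llama3_train.yaml
```
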
labs/Hyperpod/llama3_train.yaml-template

Lines changed: 175 additions & 0 deletions
apiVersion: v1
kind: Service
metadata:
  name: etcd
spec:
  ports:
    - name: etcd-client-port
      port: 2379
      protocol: TCP
      targetPort: 2379
  selector:
    app: etcd

---
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: etcd
  name: etcd
spec:
  replicas: 1
  selector:
    matchLabels:
      app: etcd
  template:
    metadata:
      labels:
        app: etcd
    spec:
      containers:
        - name: etcd
          command: ["/usr/local/bin/etcd"]
          args:
            - "--data-dir"
            - "/var/lib/etcd"
            - "--enable-v2"
            - "--listen-client-urls"
            - "http://0.0.0.0:2379"
            - "--advertise-client-urls"
            - "http://0.0.0.0:2379"
            - "--initial-cluster-state"
            - "new"
          image: quay.io/coreos/etcd:v3.5.19
          ports:
            - containerPort: 2379
              name: client
              protocol: TCP
            - containerPort: 2380
              name: server
              protocol: TCP
      restartPolicy: Always
---
apiVersion: "kubeflow.org/v1"
kind: PyTorchJob
metadata:
  name: trn1-llama3
spec:
  elasticPolicy:
    rdzvBackend: etcd
    rdzvHost: etcd
    rdzvPort: 2379
    minReplicas: 1
    maxReplicas: 64
    maxRestarts: 100
    metrics:
      - type: Resource
        resource:
          name: cpu
          target:
            type: Utilization
            averageUtilization: 90
  pytorchReplicaSpecs:
    Worker:
      replicas: 1
      restartPolicy: OnFailure
      template:
        metadata:
          labels:
            app: trn1-llama3
        spec:
          volumes:
            - name: shmem
              hostPath:
                path: /dev/shm
            - name: persistent-storage
              persistentVolumeClaim:
                claimName: ${FSX_CLAIM}
            - name: local
              hostPath:
                path: /dev
            - name: hyperpod
              hostPath:
                path: /var/log/aws/clusters
          nodeSelector:
            node.kubernetes.io/instance-type: ${INSTANCE_TYPE}
          containers:
            - name: pytorch
              image: ${IMAGE_URI}
              imagePullPolicy: Always
              resources:
                requests:
                  aws.amazon.com/neuron: ${NEURON_PER_NODE}
                  vpc.amazonaws.com/efa: ${EFA_PER_NODE}
                limits:
                  aws.amazon.com/neuron: ${NEURON_PER_NODE}
                  vpc.amazonaws.com/efa: ${EFA_PER_NODE}
              env:
                - name: LOGLEVEL
                  value: "DEBUG"
                - name: FI_PROVIDER
                  value: efa
                - name: FI_EFA_USE_DEVICE_RDMA
                  value: "1"
                - name: FI_EFA_FORK_SAFE
                  value: "1"
                - name: FI_LOG_LEVEL
                  value: "1"
                - name: FI_EFA_ENABLE_SHM_TRANSFER
                  value: "1"
                - name: NEURON_RT_NUM_CORES
                  value: "32"
                - name: NUM_NEURONCORES
                  value: "32"
                - name: TPU_NUM_DEVICES
                  value: "32"
                - name: TPU_CHIPS_PER_HOST_BOUNDS
                  value: "32"
                - name: TORCH_NCCL_DEBUG_INFO_TEMP_FILE
                  value: "/local/nccl_trace_rank_"
                - name: PYTORCH_CUDA_ALLOC_CONF
                  value: "expandable_segments:True"
                - name: MALLOC_ARENA_MAX
                  value: "64"
                - name: NCCL_SOCKET_IFNAME
                  value: "^lo"
                - name: NEURON_RT_ASYNC_EXEC_MAX_INFLIGHT_REQUESTS
                  value: "3"
                - name: NEURON_FUSE_SOFTMAX
                  value: "1"
                - name: NEURON_CC_FLAGS
                  value: "--model-type transformer --distribution-strategy=llm-training --cache_dir=${NEURON_CACHE_DIR}"
              command:
                - torchrun
                - --nproc_per_node=32
                - --nnodes=$NUM_NODES
                - train.py
                - --model_path=${MODEL_PATH}
                - --data_dir=${TOKENIZED_DATA_PATH}/${DATASET_NAME}_llama3_tokenized_8k
                - --tensor_parallel_size=32
                - --batch_size=${BATCH_SIZE}
                - --steps_this_run=${STEPS_THIS_RUN}
                - --max_steps=${MAX_STEPS}
                - --warmup_steps=100
                - --lr=1.5e-4
                - --grad_accum_usteps=16
                - --seq_len=8192
                - --sequence_parallel_enabled
                - --selective_checkpoint_enabled
                - --logging_interval=10
                - --qkv_linear
                - --kv_replicator=4
                - --use_flash_attention=1
                - --use_zero_1
                - --use_mix_precision
                - --checkpoint_freq=${CHECKPOINT_FREQ}
                - --num_kept_checkpoint=${NUM_KEPT_CHECKPOINTS}
                - --checkpoint_dir=${CHECKPOINT_DIR}
              volumeMounts:
                - name: shmem
                  mountPath: /dev/shm
                - name: persistent-storage
                  mountPath: /fsx
                - name: hyperpod
                  mountPath: /var/log/aws/clusters
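
This template wires an etcd Service/Deployment for torchrun's elastic rendezvous and a single-worker PyTorchJob that requests 16 Neuron devices and 8 EFA interfaces per ml.trn1.32xlarge node. A sketch of submitting and monitoring it after envsubst has produced llama3_train.yaml (assumes the Kubeflow training operator is installed and the tokenized dataset already exists on the FSx mount; the worker pod name follows the operator's usual <job>-worker-<index> pattern):

```bash
kubectl apply -f llama3_train.yaml

# Watch the job and follow the training log of the first worker.
kubectl get pytorchjob trn1-llama3
kubectl get pods -l app=trn1-llama3
kubectl logs -f trn1-llama3-worker-0
```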

labs/Hyperpod/src/tokenize_data.py

Lines changed: 115 additions & 0 deletions
import subprocess
def install_package(package_name):
    subprocess.run(["pip", "install", package_name])

# Example usage
install_package("transformers==4.43.3")

import os
import argparse
from itertools import chain

from datasets import load_dataset
from transformers import AutoTokenizer
from huggingface_hub.hf_api import HfFolder
from huggingface_hub import snapshot_download


parser = argparse.ArgumentParser()
parser.add_argument('--llama-version', type=int, default=3, help='LLaMA version (default: 3)')
parser.add_argument("--save_path", type=str, default=None, help="path to save the tokenized data")
parser.add_argument("--dataset_name", type=str, default="wikicorpus", help="name of the dataset to use")
parser.add_argument("--dataset_config_name", type=str, default="raw_en", help="dataset config to use")

parser.add_argument(
    "--model_name",
    type=str,
    help="Huggingface Model name",
)
parser.add_argument(
    "--cache_dir",
    type=str,
    help="Huggingface cache directory"
)
parser.add_argument(
    "--hf_access_token",
    type=str,
    help="HF access token.",
)

args = parser.parse_args()
llama_version = args.llama_version

print("*****Args passed by user*********")
print(args)

print("Download tokenizer")
if not os.path.exists(args.save_path):
    os.makedirs(args.save_path)

HfFolder.save_token(args.hf_access_token)
snapshot_download(
    repo_id=args.model_name,
    allow_patterns=["tokenizer*"],
    ignore_patterns=["*.safetensors", "*.safetensors.index.json"],
    local_dir=args.save_path,
    local_dir_use_symlinks=False,
)

block_size = 4096
save_path = f"{args.save_path}/{args.dataset_name}_llama{llama_version}_tokenized_4k"
if llama_version == 3:
    block_size = 8192
    save_path = f"{args.save_path}/{args.dataset_name}_llama{llama_version}_tokenized_8k"

tokenizer_path = args.save_path

save_path = os.path.expanduser(save_path)
tokenizer_path = os.path.expanduser(tokenizer_path)


raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name, trust_remote_code=True)

tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

column_names = raw_datasets["train"].column_names
text_column_name = "text" if "text" in column_names else column_names[0]

def tokenize_function(examples):
    return tokenizer(examples[text_column_name])


tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=column_names,
    load_from_cache_file=True,
    desc="Running tokenizer on dataset",
)

if block_size > tokenizer.model_max_length:
    print("block_size > tokenizer.model_max_length")
block_size = min(block_size, tokenizer.model_max_length)


# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder; if total_length < block_size this batch is excluded and an empty dict is returned.
    # We could add padding if the model supported it instead of this drop; customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)] for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result


lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    load_from_cache_file=True,
    desc=f"Grouping texts in chunks of {block_size}",
)

train_dataset = lm_datasets["train"]
print(len(train_dataset))

train_dataset.save_to_disk(save_path)
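
An illustrative interactive invocation, using the same values that generate-jobspec.sh exports (the HF token is a placeholder; on the cluster this script is presumably launched through the generated tokenize_data.yaml). With --save_path set to /fsx/tokenized_data, the output lands in /fsx/tokenized_data/wikicorpus_llama3_tokenized_8k, which is exactly the --data_dir the training job expects:

```bash
python tokenize_data.py \
    --llama-version 3 \
    --model_name meta-llama/Meta-Llama-3-8B \
    --dataset_name wikicorpus \
    --dataset_config_name raw_en \
    --save_path /fsx/tokenized_data \
    --hf_access_token hf_xxxxxx
```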
