
Commit f30e7b3

Author: Gang Fu
Commit message: add all files for Hyperpod lab for NxD llama3 model training
Parent: 97061c8

File tree: 7 files changed (+394, -2 lines)

labs/Hyperpod/.DS_Store

-6 KB
Binary file not shown.

labs/Hyperpod/Dockerfile

Lines changed: 13 additions & 0 deletions
FROM 763104351884.dkr.ecr.us-east-2.amazonaws.com/pytorch-training-neuronx:2.7.0-neuronx-py310-sdk2.24.1-ubuntu22.04

RUN git clone https://github.com/aws-neuron/neuronx-distributed.git
COPY ./src /workspace
RUN cp -r neuronx-distributed/examples/training/llama/* workspace/
RUN cp -r neuronx-distributed/examples/training/llama/tp_zero1_llama_hf_pretrain/* workspace/
RUN cp -r neuronx-distributed/examples/training/llama/tp_zero1_llama_hf_pretrain/8B_config_llama3.1 workspace/config_8b_llama3.1
RUN cp -r neuronx-distributed/examples/training/llama/tp_zero1_llama_hf_pretrain/8B_config_llama3 workspace/config_8b_llama3
RUN mv workspace/tp_zero1_llama_hf_pretrain.py workspace/train.py

WORKDIR /workspace

RUN pip install -r requirements.txt

labs/Hyperpod/README.md

Lines changed: 2 additions & 2 deletions
@@ -19,8 +19,8 @@ docker pull 763104351884.dkr.ecr.us-east-2.amazonaws.com/pytorch-training-neuron

Clone the repo and go to the folder:
```bash
cd ~
-git clone https://github.com/aws-samples/awsome-distributed-training/
-cd awsome-distributed-training/3.test_cases/pytorch/neuronx-distributed/llama3/kubernetes
+git clone https://github.com/aws-neuron/neuron-workshops
+cd neuron-workshops/labs/Hyperpod
```

We will build the Docker image using the Dockerfile in this directory.
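
The Dockerfile above pulls the NeuronX Distributed Llama examples into /workspace and renames the tp_zero1 pretraining script to train.py. A minimal sketch of the build-and-push flow, assuming the llama3_trn image name and the account/region discovery that generate-jobspec.sh uses (adjust to your own ECR registry):

```bash
# Log in to the Neuron DLC registry that hosts the base image, and to your own ECR registry.
export AWS_REGION=$(aws ec2 describe-availability-zones --output text --query 'AvailabilityZones[0].[RegionName]')
export ACCOUNT=$(aws sts get-caller-identity --query Account --output text)
export IMAGE_URI=${ACCOUNT}.dkr.ecr.${AWS_REGION}.amazonaws.com/llama3_trn:latest

aws ecr get-login-password --region us-east-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-east-2.amazonaws.com
aws ecr get-login-password --region ${AWS_REGION} | docker login --username AWS --password-stdin ${ACCOUNT}.dkr.ecr.${AWS_REGION}.amazonaws.com

# Create the repository if it does not exist yet, then build and push.
aws ecr create-repository --repository-name llama3_trn --region ${AWS_REGION} 2>/dev/null || true
docker build -t ${IMAGE_URI} .
docker push ${IMAGE_URI}
```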

labs/Hyperpod/generate-jobspec.sh

Lines changed: 43 additions & 0 deletions
#!/bin/bash

export AWS_REGION=$(aws ec2 describe-availability-zones --output text --query 'AvailabilityZones[0].[RegionName]')
export ACCOUNT=$(aws sts get-caller-identity --query Account --output text)
export REGISTRY=${ACCOUNT}.dkr.ecr.${AWS_REGION}.amazonaws.com/
export IMAGE=llama3_trn
export TAG=:latest
export IMAGE_URI=${REGISTRY}${IMAGE}${TAG}

export JOB_NAME=trn1-llama3-training
export NUM_NODES=1
export INSTANCE_TYPE=ml.trn1.32xlarge
export EFA_PER_NODE=8
export NEURON_PER_NODE=16
export FI_PROVIDER=efa

export FSX_CLAIM=fsx-claim # Change this according to the PVC created.

# Tokenize_data configs
export HF_ACCESS_TOKEN=hf_xxxxxx
export TOKENIZED_DATA_PATH=/fsx/tokenized_data
export DATASET_NAME=wikicorpus
export DATASET_CONFIG_NAME=raw_en
export HF_MODEL_NAME=meta-llama/Meta-Llama-3-8B # change this to meta-llama/Meta-Llama-3.1-8B (and MODEL_PATH to config_8b_llama3.1) to train the Llama 3.1 8B model

export NEURON_CACHE_DIR=/fsx/neuron_cache
export CHECKPOINT_DIR=/fsx/checkpoints
export NUM_KEPT_CHECKPOINTS=2
export CHECKPOINT_FREQ=100
export MAX_STEPS=1000
export STEPS_THIS_RUN=100
export BATCH_SIZE=1

export MODEL_PATH=config_8b_llama3

cat tokenize_data.yaml-template | envsubst > tokenize_data.yaml
cat llama3_train.yaml-template | envsubst > llama3_train.yaml
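
This script only renders Kubernetes manifests: envsubst replaces the exported ${VAR} placeholders in the two *-template files. A minimal usage sketch (edit the exports above first, in particular HF_ACCESS_TOKEN, FSX_CLAIM, and the model/config choice):

```bash
# Render the manifests, then inspect the substituted values before submitting anything.
chmod +x generate-jobspec.sh
./generate-jobspec.sh
grep -E 'image:|claimName:|instance-type' llama3_train.yaml
```
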
labs/Hyperpod/llama3_train.yaml-template

Lines changed: 175 additions & 0 deletions
apiVersion: v1
kind: Service
metadata:
  name: etcd
spec:
  ports:
    - name: etcd-client-port
      port: 2379
      protocol: TCP
      targetPort: 2379
  selector:
    app: etcd

---
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: etcd
  name: etcd
spec:
  replicas: 1
  selector:
    matchLabels:
      app: etcd
  template:
    metadata:
      labels:
        app: etcd
    spec:
      containers:
        - name: etcd
          command: ["/usr/local/bin/etcd"]
          args:
            - "--data-dir"
            - "/var/lib/etcd"
            - "--enable-v2"
            - "--listen-client-urls"
            - "http://0.0.0.0:2379"
            - "--advertise-client-urls"
            - "http://0.0.0.0:2379"
            - "--initial-cluster-state"
            - "new"
          image: quay.io/coreos/etcd:v3.5.19
          ports:
            - containerPort: 2379
              name: client
              protocol: TCP
            - containerPort: 2380
              name: server
              protocol: TCP
      restartPolicy: Always
---
apiVersion: "kubeflow.org/v1"
kind: PyTorchJob
metadata:
  name: trn1-llama3
spec:
  elasticPolicy:
    rdzvBackend: etcd
    rdzvHost: etcd
    rdzvPort: 2379
    minReplicas: 1
    maxReplicas: 64
    maxRestarts: 100
    metrics:
      - type: Resource
        resource:
          name: cpu
          target:
            type: Utilization
            averageUtilization: 90
  pytorchReplicaSpecs:
    Worker:
      replicas: 1
      restartPolicy: OnFailure
      template:
        metadata:
          labels:
            app: trn1-llama3
        spec:
          volumes:
            - name: shmem
              hostPath:
                path: /dev/shm
            - name: persistent-storage
              persistentVolumeClaim:
                claimName: ${FSX_CLAIM}
            - name: local
              hostPath:
                path: /dev
            - name: hyperpod
              hostPath:
                path: /var/log/aws/clusters
          nodeSelector:
            node.kubernetes.io/instance-type: ${INSTANCE_TYPE}
          containers:
            - name: pytorch
              image: ${IMAGE_URI}
              imagePullPolicy: Always
              resources:
                requests:
                  aws.amazon.com/neuron: ${NEURON_PER_NODE}
                  vpc.amazonaws.com/efa: ${EFA_PER_NODE}
                limits:
                  aws.amazon.com/neuron: ${NEURON_PER_NODE}
                  vpc.amazonaws.com/efa: ${EFA_PER_NODE}
              env:
                - name: LOGLEVEL
                  value: "DEBUG"
                - name: FI_PROVIDER
                  value: efa
                - name: FI_EFA_USE_DEVICE_RDMA
                  value: "1"
                - name: FI_EFA_FORK_SAFE
                  value: "1"
                - name: FI_LOG_LEVEL
                  value: "1"
                - name: FI_EFA_ENABLE_SHM_TRANSFER
                  value: "1"
                - name: NEURON_RT_NUM_CORES
                  value: "32"
                - name: NUM_NEURONCORES
                  value: "32"
                - name: TPU_NUM_DEVICES
                  value: "32"
                - name: TPU_CHIPS_PER_HOST_BOUNDS
                  value: "32"
                - name: TORCH_NCCL_DEBUG_INFO_TEMP_FILE
                  value: "/local/nccl_trace_rank_"
                - name: PYTORCH_CUDA_ALLOC_CONF
                  value: "expandable_segments:True"
                - name: MALLOC_ARENA_MAX
                  value: "64"
                - name: NCCL_SOCKET_IFNAME
                  value: "^lo"
                - name: NEURON_RT_ASYNC_EXEC_MAX_INFLIGHT_REQUESTS
                  value: "3"
                - name: NEURON_FUSE_SOFTMAX
                  value: "1"
                - name: NEURON_CC_FLAGS
                  value: "--model-type transformer --distribution-strategy=llm-training --cache_dir=${NEURON_CACHE_DIR}"
              command:
                - torchrun
                - --nproc_per_node=32
                - --nnodes=$NUM_NODES
                - train.py
                - --model_path=${MODEL_PATH}
                - --data_dir=${TOKENIZED_DATA_PATH}/${DATASET_NAME}_llama3_tokenized_8k
                - --tensor_parallel_size=32
                - --batch_size=${BATCH_SIZE}
                - --steps_this_run=${STEPS_THIS_RUN}
                - --max_steps=${MAX_STEPS}
                - --warmup_steps=100
                - --lr=1.5e-4
                - --grad_accum_usteps=16
                - --seq_len=8192
                - --sequence_parallel_enabled
                - --selective_checkpoint_enabled
                - --logging_interval=10
                - --qkv_linear
                - --kv_replicator=4
                - --use_flash_attention=1
                - --use_zero_1
                - --use_mix_precision
                - --checkpoint_freq=${CHECKPOINT_FREQ}
                - --num_kept_checkpoint=${NUM_KEPT_CHECKPOINTS}
                - --checkpoint_dir=${CHECKPOINT_DIR}
              volumeMounts:
                - name: shmem
                  mountPath: /dev/shm
                - name: persistent-storage
                  mountPath: /fsx
                - name: hyperpod
                  mountPath: /var/log/aws/clusters
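
This template wires an etcd Service/Deployment for torchrun's elastic rendezvous and a single-worker PyTorchJob that requests 16 Neuron devices and 8 EFA interfaces per ml.trn1.32xlarge node. A sketch of submitting and monitoring it after envsubst has produced llama3_train.yaml (assumes the Kubeflow training operator is installed and the tokenized dataset already exists on the FSx mount; the worker pod name follows the operator's usual <job>-worker-<index> pattern):

```bash
kubectl apply -f llama3_train.yaml

# Watch the job and follow the training log of the first worker.
kubectl get pytorchjob trn1-llama3
kubectl get pods -l app=trn1-llama3
kubectl logs -f trn1-llama3-worker-0
```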

labs/Hyperpod/src/tokenize_data.py

Lines changed: 115 additions & 0 deletions
import subprocess
def install_package(package_name):
    subprocess.run(["pip", "install", package_name])

# Example usage
install_package("transformers==4.43.3")

import os
import argparse
from itertools import chain

from datasets import load_dataset
from transformers import AutoTokenizer
from huggingface_hub.hf_api import HfFolder
from huggingface_hub import snapshot_download


parser = argparse.ArgumentParser()
parser.add_argument('--llama-version', type=int, default=3, help='LLaMA version (default: 3)')
parser.add_argument("--save_path", type=str, default=None, help="path to save the tokenized data")
parser.add_argument("--dataset_name", type=str, default="wikicorpus", help="name of the dataset to use")
parser.add_argument("--dataset_config_name", type=str, default="raw_en", help="dataset config to use")

parser.add_argument(
    "--model_name",
    type=str,
    help="Huggingface Model name",
)
parser.add_argument(
    "--cache_dir",
    type=str,
    help="Huggingface cache directory"
)
parser.add_argument(
    "--hf_access_token",
    type=str,
    help="HF access token.",
)

args = parser.parse_args()
llama_version = args.llama_version

print("*****Args passed by user*********")
print(args)

print("Download tokenizer")
if not os.path.exists(args.save_path):
    os.makedirs(args.save_path)

HfFolder.save_token(args.hf_access_token)
snapshot_download(
    repo_id=args.model_name,
    allow_patterns=["tokenizer*"],
    ignore_patterns=["*.safetensors", "*.safetensors.index.json"],
    local_dir=args.save_path,
    local_dir_use_symlinks=False,
)

block_size = 4096
save_path = f"{args.save_path}/{args.dataset_name}_llama{llama_version}_tokenized_4k"
if llama_version == 3:
    block_size = 8192
    save_path = f"{args.save_path}/{args.dataset_name}_llama{llama_version}_tokenized_8k"

tokenizer_path = args.save_path

save_path = os.path.expanduser(save_path)
tokenizer_path = os.path.expanduser(tokenizer_path)


raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name, trust_remote_code=True)

tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

column_names = raw_datasets["train"].column_names
text_column_name = "text" if "text" in column_names else column_names[0]

def tokenize_function(examples):
    return tokenizer(examples[text_column_name])


tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=column_names,
    load_from_cache_file=True,
    desc="Running tokenizer on dataset",
)

if block_size > tokenizer.model_max_length:
    print("block_size > tokenizer.model_max_length")
block_size = min(block_size, tokenizer.model_max_length)


# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder; if total_length < block_size this batch is excluded and an empty dict is returned.
    # We could add padding if the model supported it instead of this drop; customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)] for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result


lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    load_from_cache_file=True,
    desc=f"Grouping texts in chunks of {block_size}",
)

train_dataset = lm_datasets["train"]
print(len(train_dataset))

train_dataset.save_to_disk(save_path)
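
An illustrative interactive invocation, using the same values that generate-jobspec.sh exports (the HF token is a placeholder; on the cluster this script is presumably launched through the generated tokenize_data.yaml). With --save_path set to /fsx/tokenized_data, the output lands in /fsx/tokenized_data/wikicorpus_llama3_tokenized_8k, which is exactly the --data_dir the training job expects:

```bash
python tokenize_data.py \
    --llama-version 3 \
    --model_name meta-llama/Meta-Llama-3-8B \
    --dataset_name wikicorpus \
    --dataset_config_name raw_en \
    --save_path /fsx/tokenized_data \
    --hf_access_token hf_xxxxxx
```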
