all_train.slurm
#!/bin/bash
#SBATCH --job-name=cocoa_training
#SBATCH --output=logs/cocoa_%A_%a.out
#SBATCH --error=logs/cocoa_%A_%a.err
#SBATCH --time=7:00:00
#SBATCH --cpus-per-task=8
#SBATCH --gres=gpu:a100:1
#SBATCH --mem=32G
#SBATCH --array=0-20%4 # One task per model (zero-indexed); %4 runs at most 4 jobs simultaneously (adjust to your model count)
# Create logs directory
mkdir -p logs
# Array of models (must match your models.json)
declare -a models=(
"vit_large"
"vit_base"
"convnext_xxlarge"
"convnext_xlarge"
"swin_large"
"efficientnet_b7"
"efficientnet_b5"
"efficientnet_b3"
"efficientnet_b0"
"mobilenetv3_large"
"mobilenetv3_small"
"eva_giant"
"maxvit_xlarge"
"beit_large"
"resnet_152"
"resnet_101"
"resnet_50"
"resnet_34"
"vgg19"
"vgg16"
"vgg13"
)
# Get the model name for this array task (bash arrays are zero-indexed,
# matching the 0-20 array range above)
MODEL=${models[$SLURM_ARRAY_TASK_ID]}

# Convert underscores to hyphens for the training script
MODEL_ARG=$(echo "$MODEL" | tr '_' '-')
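# Defensive check (an addition, not in the original script): abort early if
# the task ID falls outside the models array instead of calling train.sh
# with an empty model name.
if [ -z "$MODEL" ]; then
    echo "No model defined for SLURM_ARRAY_TASK_ID=$SLURM_ARRAY_TASK_ID" >&2
    exit 1
fi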
# Load necessary modules and activate the conda environment
module purge

# Node diagnostics
hostname
nvidia-smi
pwd

# 'conda activate' needs the conda shell hooks, which are not loaded in a
# non-interactive batch shell; source them from the base install first
# (adjust this if your cluster provides conda through a module instead).
source "$(conda info --base)/etc/profile.d/conda.sh"
conda activate test
echo "running python from...."
which python
# Print job information
echo "Job ID: $SLURM_JOB_ID"
echo "Array Task ID: $SLURM_ARRAY_TASK_ID"
echo "Model: $MODEL_ARG"

# PRECISION is never set in this script; it is expected to be exported at
# submission time (e.g. with sbatch --export). Default it to empty so the
# train.sh call below still works when it is unset.
PRECISION=${PRECISION:-}
echo "Precision: ${PRECISION:-<unset>}"
echo "Running on node: $HOSTNAME"
echo "GPU allocated: $CUDA_VISIBLE_DEVICES"
# Run the training script, teeing output to a per-model log.
# $PRECISION is intentionally left unquoted so an empty value expands to nothing.
./train.sh -m "$MODEL_ARG" $PRECISION \
    2>&1 | tee "logs/training_${MODEL_ARG}.log"
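# Defensive addition (not in the original script): with "| tee", $? reflects
# tee's exit status, so capture train.sh's exit code from PIPESTATUS right
# after the pipeline.
EXIT_CODE=${PIPESTATUS[0]}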
# Deactivate conda environment
conda deactivate
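# Propagate the captured training status so SLURM marks a failed task as
# FAILED rather than COMPLETED.
exit $EXIT_CODE

# Example submission (hypothetical precision value; adjust to whatever flags
# your train.sh actually accepts):
#   sbatch --export=ALL,PRECISION=fp16 all_train.slurm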