-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgres-gen
More file actions
162 lines (130 loc) · 3.85 KB
/
gres-gen
File metadata and controls
162 lines (130 loc) · 3.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#!/usr/bin/env bash
#
# Generate SLURM GRES configuration lines for the NVIDIA GPUs on this host.
# Run with --help for the list of supported options.

# Strict mode: stop on failing commands, unset variables and failed
# pipeline stages.
set -o errexit -o nounset -o pipefail

# Script identity, printed by the usage and version output.
SCRIPT_NAME="${0##*/}"
VERSION="1.0.0"
readonly SCRIPT_NAME VERSION

# Option state. These defaults are overwritten by parse_arguments.
declare -g device_name="gpu" header_file="" autodetect_option="" show_help=false
# Detect how many NVIDIA GPUs and CPU cores this host has.
#
# Outputs: "<gpu_count> <cpu_count>" on stdout; diagnostics on stderr.
# Exits:   1 when nvidia-smi is not available.
detect_hardware() {
  local gpu_count
  local cpu_count

  if ! command -v nvidia-smi >/dev/null 2>&1; then
    echo "Error: nvidia-smi not found. NVIDIA drivers may not be installed." >&2
    exit 1
  fi

  # Count only the "GPU <index>:" lines. Counting every output line would
  # also count indented MIG sub-device entries or diagnostic text printed on
  # stdout as if they were physical GPUs.
  # "|| true": a failing nvidia-smi must lead to the zero-GPU warning below
  # rather than to a silent abort when errexit/pipefail are in effect.
  gpu_count=$(nvidia-smi -L 2>/dev/null \
    | awk '/^GPU [0-9]+:/ { n++ } END { print n + 0 }') || true
  cpu_count=$(nproc)

  if [[ $gpu_count -eq 0 ]]; then
    echo "Warning: No GPUs detected" >&2
  fi

  echo "$gpu_count $cpu_count"
}
# Print the help text (synopsis, options, examples) to stdout.
# Reads the SCRIPT_NAME and VERSION globals.
show_usage() {
  printf '%s\n' \
    "$SCRIPT_NAME v$VERSION - Generate SLURM GRES configuration for GPU resources" \
    "USAGE:" \
    "$SCRIPT_NAME [OPTIONS]" \
    "OPTIONS:" \
    "-n, --name DEVICE Device name in GRES config (default: gpu)" \
    "-h, --header FILE Include header file content at top" \
    "-a, --autodetect OPT Add AutoDetect line with specified option" \
    "--help Show this help message" \
    "--version Show version information" \
    "EXAMPLES:" \
    "$SCRIPT_NAME" \
    "$SCRIPT_NAME --name tesla --header /etc/slurm/gres_header.conf" \
    "$SCRIPT_NAME --autodetect nvml" \
    "OUTPUT:" \
    "Generates GRES configuration mapping each GPU to balanced CPU core ranges"
}
# Print "<script name> version <version>" to stdout.
show_version() {
  printf '%s version %s\n' "$SCRIPT_NAME" "$VERSION"
}
# Parse command-line options into the option globals.
#
# Globals written: device_name, header_file, autodetect_option, show_help
# Arguments:       the script's command-line arguments
# Exits:           0 after printing the version for --version;
#                  1 on an unknown option or an option missing its value.
parse_arguments() {
  local opt
  local val

  while (( $# > 0 )); do
    opt=$1
    # Empty when there is no following argument (or it is an empty string).
    val=${2:-}

    case "$opt" in
      -n | --name)
        if [[ -z $val ]]; then
          echo "Error: --name requires a value" >&2
          exit 1
        fi
        device_name="$val"
        shift 2
        ;;
      -h | --header)
        if [[ -z $val ]]; then
          echo "Error: --header requires a file path" >&2
          exit 1
        fi
        header_file="$val"
        shift 2
        ;;
      -a | --autodetect)
        if [[ -z $val ]]; then
          echo "Error: --autodetect requires a value" >&2
          exit 1
        fi
        autodetect_option="$val"
        shift 2
        ;;
      --help)
        # Only record the request; the caller decides when to print usage.
        show_help=true
        shift
        ;;
      --version)
        show_version
        exit 0
        ;;
      *)
        echo "Error: Unknown option '$opt'" >&2
        echo "Use --help for usage information" >&2
        exit 1
        ;;
    esac
  done
}
# Print the contents of the configured header file followed by a blank line.
#
# Globals read: header_file (nothing is printed when it is empty)
# Exits:        1 when the path is not a regular file or is not readable.
include_header() {
  if [[ -z $header_file ]]; then
    return 0
  fi

  if [[ -f $header_file && -r $header_file ]]; then
    cat "$header_file"
    echo
    return
  fi

  # Report the failed check, testing existence before readability.
  if [[ ! -f $header_file ]]; then
    echo "Error: Header file '$header_file' not found" >&2
  else
    echo "Error: Header file '$header_file' not readable" >&2
  fi
  exit 1
}
# Compute the CPU core range assigned to one GPU.
#
# Cores are split into equal contiguous slices of total_cores / total_gpus
# (integer division); the last GPU also receives any remainder cores.
#
# Arguments: $1 - zero-based GPU index
#            $2 - total number of GPUs
#            $3 - total number of CPU cores
# Outputs:   "<first_core>-<last_core>" on stdout
# Returns:   1 (with a message on stderr) when either count is below 1
calculate_cpu_range() {
  local gpu_id=$1
  local total_gpus=$2
  local total_cores=$3

  # Reject counts that would divide by zero or yield a negative end core.
  if (( total_gpus < 1 || total_cores < 1 )); then
    echo "Error: GPU and CPU core counts must be at least 1" >&2
    return 1
  fi

  local cores_per_gpu=$(( total_cores / total_gpus ))

  # Fewer cores than GPUs: a per-GPU slice would be empty and would render
  # as the invalid range "0--1", so every GPU is given the full core range.
  if (( cores_per_gpu == 0 )); then
    echo "0-$(( total_cores - 1 ))"
    return 0
  fi

  local start_core=$(( gpu_id * cores_per_gpu ))
  local end_core=$(( start_core + cores_per_gpu - 1 ))

  # The last GPU absorbs the cores left over by the integer division.
  if (( gpu_id == total_gpus - 1 )); then
    end_core=$(( total_cores - 1 ))
  fi

  echo "$start_core-$end_core"
}
# Print the complete GRES configuration to stdout: the optional header, one
# "Name=... File=/dev/nvidia<N> CPUs=<range>" line per detected GPU, and an
# optional trailing AutoDetect line.
#
# Globals read: device_name, autodetect_option (header_file via include_header)
# Returns:      non-zero when hardware detection or range calculation fails
generate_gres_config() {
  local hardware_info
  local gpu_count
  local cpu_count
  # Declared local so the loop variables do not leak into the global scope.
  local gpu_id
  local cpu_range

  # Propagate a detection failure explicitly instead of relying on errexit.
  hardware_info=$(detect_hardware) || return
  read -r gpu_count cpu_count <<< "$hardware_info"

  include_header

  # The loop body is skipped entirely when no GPU was detected.
  for (( gpu_id = 0; gpu_id < gpu_count; gpu_id++ )); do
    cpu_range=$(calculate_cpu_range "$gpu_id" "$gpu_count" "$cpu_count") || return
    printf "Name=%-10s File=/dev/nvidia%d CPUs=%s\n" \
      "$device_name" "$gpu_id" "$cpu_range"
  done

  if [[ -n $autodetect_option ]]; then
    echo
    echo "AutoDetect=$autodetect_option"
  fi
}
# Entry point: parse the options, then either print the usage text (when
# --help was given) or emit the GRES configuration.
#
# Arguments: the script's command-line arguments
main() {
  parse_arguments "$@"

  if [[ $show_help != true ]]; then
    generate_gres_config
    return
  fi

  show_usage
  exit 0
}
# Run the script, forwarding every command-line argument unchanged to main.
main "$@"