File tree 3 files changed +29
-2
lines changed
cookbooks/aws-parallelcluster-install/recipes
3 files changed +29
-2
lines changed Original file line number Diff line number Diff line change @@ -23,6 +23,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste
23
23
**CHANGES**
24
24
- Upgrade NVIDIA driver to version 470.141.03.
25
25
- Upgrade NVIDIA Fabric Manager to version 470.141.03.
26
+ - Upgrade NVIDIA CUDA Toolkit to version 11.7.1.
26
27
- Disable cron job tasks man-db and mlocate, which may have a negative impact on node performance.
27
28
- Add support for generating Slurm Configuration files for Compute Resources with Multiple Instance Types.
28
29
- Reduce timeout from 50 minutes to a maximum of 5 minutes in case of DynamoDB connection issues at compute node bootstrap.
Original file line number Diff line number Diff line change 189
189
# NVIDIA
190
190
default [ 'cluster' ] [ 'nvidia' ] [ 'enabled' ] = 'no'
191
191
default [ 'cluster' ] [ 'nvidia' ] [ 'driver_version' ] = '470.141.03'
192
- default [ 'cluster' ] [ 'nvidia' ] [ 'cuda_version' ] = '11.4'
192
+ default [ 'cluster' ] [ 'nvidia' ] [ 'cuda_version' ] = '11.7'
193
+ default [ 'cluster' ] [ 'nvidia' ] [ 'cuda_samples_version' ] = '11.6'
193
194
default [ 'cluster' ] [ 'nvidia' ] [ 'driver_url_architecture_id' ] = arm_instance? ? 'aarch64' : 'x86_64'
194
195
default [ 'cluster' ] [ 'nvidia' ] [ 'cuda_url_architecture_id' ] = arm_instance? ? 'linux_sbsa' : 'linux'
195
196
default [ 'cluster' ] [ 'nvidia' ] [ 'driver_url' ] = "https://us.download.nvidia.com/tesla/#{ node [ 'cluster' ] [ 'nvidia' ] [ 'driver_version' ] } /NVIDIA-Linux-#{ node [ 'cluster' ] [ 'nvidia' ] [ 'driver_url_architecture_id' ] } -#{ node [ 'cluster' ] [ 'nvidia' ] [ 'driver_version' ] } .run"
196
- default [ 'cluster' ] [ 'nvidia' ] [ 'cuda_url' ] = "https://developer.download.nvidia.com/compute/cuda/11.4.4/local_installers/cuda_11.4.4_470.82.01_#{ node [ 'cluster' ] [ 'nvidia' ] [ 'cuda_url_architecture_id' ] } .run"
197
+ default [ 'cluster' ] [ 'nvidia' ] [ 'cuda_url' ] = "https://developer.download.nvidia.com/compute/cuda/11.7.1/local_installers/cuda_11.7.1_515.65.01_#{ node [ 'cluster' ] [ 'nvidia' ] [ 'cuda_url_architecture_id' ] } .run"
198
+ default [ 'cluster' ] [ 'nvidia' ] [ 'cuda_samples_url' ] = "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v#{ node [ 'cluster' ] [ 'nvidia' ] [ 'cuda_samples_version' ] } .tar.gz"
197
199
198
200
# NVIDIA fabric-manager
199
201
# The package name of Fabric Manager for alinux2 and centos7 is nvidia-fabric-manager-version
Original file line number Diff line number Diff line change 83
83
creates "/usr/local/cuda-#{ node [ 'cluster' ] [ 'nvidia' ] [ 'cuda_version' ] } "
84
84
end
85
85
86
+ # Get CUDA Sample Files
87
+ cuda_samples_directory = "/usr/local/cuda-#{ node [ 'cluster' ] [ 'nvidia' ] [ 'cuda_version' ] } /samples"
88
+ cuda_tmp_sample_file = "/tmp/cuda-sample.tar.gz"
89
+ remote_file cuda_tmp_sample_file do
90
+ source node [ 'cluster' ] [ 'nvidia' ] [ 'cuda_samples_url' ]
91
+ mode '0644'
92
+ retries 3
93
+ retry_delay 5
94
+ not_if { ::File . exist? ( cuda_samples_directory ) }
95
+ end
96
+
97
+ # Unpack CUDA Samples
98
+ bash 'cuda.sample install' do
99
+ user 'root'
100
+ group 'root'
101
+ cwd '/tmp'
102
+ code <<-CUDA
103
+ set -e
104
+ tar xf "#{ cuda_tmp_sample_file } " --directory "/usr/local/"
105
+ rm -f "#{ cuda_tmp_sample_file } "
106
+ CUDA
107
+ creates cuda_samples_directory
108
+ end
109
+
86
110
cookbook_file 'blacklist-nouveau.conf' do
87
111
source 'nvidia/blacklist-nouveau.conf'
88
112
path '/etc/modprobe.d/blacklist-nouveau.conf'
You can’t perform that action at this time.
0 commit comments