Skip to content

Commit cb63b87

Browse files
authored
Upgrade NVIDIA CUDA toolkit from 11.4.4 to 11.7.1 (#1519)
1 parent 43f2ada commit cb63b87

File tree

3 files changed

+29
-2
lines changed

3 files changed

+29
-2
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste
2323
**CHANGES**
2424
- Upgrade NVIDIA driver to version 470.141.03.
2525
- Upgrade NVIDIA Fabric Manager to version 470.141.03.
26+
- Upgrade NVIDIA CUDA Toolkit to version 11.7.1.
2627
- Disable cron job tasks man-db and mlocate, which may have a negative impact on node performance.
2728
- Add support for generating Slurm Configuration files for Compute Resources with Multiple Instance Types.
2829
- Reduce timeout from 50 to a maximum of 5min in case of DynamoDB connection issues at compute node bootstrap.

attributes/default.rb

+4-2
Original file line numberDiff line numberDiff line change
@@ -189,11 +189,13 @@
189189
# NVIDIA
190190
default['cluster']['nvidia']['enabled'] = 'no'
191191
default['cluster']['nvidia']['driver_version'] = '470.141.03'
192-
default['cluster']['nvidia']['cuda_version'] = '11.4'
192+
default['cluster']['nvidia']['cuda_version'] = '11.7'
193+
default['cluster']['nvidia']['cuda_samples_version'] = '11.6'
193194
default['cluster']['nvidia']['driver_url_architecture_id'] = arm_instance? ? 'aarch64' : 'x86_64'
194195
default['cluster']['nvidia']['cuda_url_architecture_id'] = arm_instance? ? 'linux_sbsa' : 'linux'
195196
default['cluster']['nvidia']['driver_url'] = "https://us.download.nvidia.com/tesla/#{node['cluster']['nvidia']['driver_version']}/NVIDIA-Linux-#{node['cluster']['nvidia']['driver_url_architecture_id']}-#{node['cluster']['nvidia']['driver_version']}.run"
196-
default['cluster']['nvidia']['cuda_url'] = "https://developer.download.nvidia.com/compute/cuda/11.4.4/local_installers/cuda_11.4.4_470.82.01_#{node['cluster']['nvidia']['cuda_url_architecture_id']}.run"
197+
default['cluster']['nvidia']['cuda_url'] = "https://developer.download.nvidia.com/compute/cuda/11.7.1/local_installers/cuda_11.7.1_515.65.01_#{node['cluster']['nvidia']['cuda_url_architecture_id']}.run"
198+
default['cluster']['nvidia']['cuda_samples_url'] = "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v#{node['cluster']['nvidia']['cuda_samples_version']}.tar.gz"
197199

198200
# NVIDIA fabric-manager
199201
# The package name of Fabric Manager for alinux2 and centos7 is nvidia-fabric-manager-version

cookbooks/aws-parallelcluster-install/recipes/nvidia.rb

+24
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,30 @@
8383
creates "/usr/local/cuda-#{node['cluster']['nvidia']['cuda_version']}"
8484
end
8585

86+
# Get CUDA Sample Files
87+
# Directory the samples archive unpacks to under /usr/local. The GitHub tag archive for
# v<cuda_samples_version> has a single top-level folder named cuda-samples-<cuda_samples_version>,
# so the idempotency guards below must check that path (not a cuda-<cuda_version>/samples folder,
# which the tar extraction never creates).
cuda_samples_directory = "/usr/local/cuda-samples-#{node['cluster']['nvidia']['cuda_samples_version']}"
88+
cuda_tmp_sample_file = "/tmp/cuda-sample.tar.gz"
89+
remote_file cuda_tmp_sample_file do
90+
source node['cluster']['nvidia']['cuda_samples_url']
91+
mode '0644'
92+
retries 3
93+
retry_delay 5
94+
not_if { ::File.exist?(cuda_samples_directory) }
95+
end
96+
97+
# Unpack CUDA Samples
98+
bash 'cuda.sample install' do
99+
user 'root'
100+
group 'root'
101+
cwd '/tmp'
102+
code <<-CUDA
103+
set -e
104+
tar xf "#{cuda_tmp_sample_file}" --directory "/usr/local/"
105+
rm -f "#{cuda_tmp_sample_file}"
106+
CUDA
107+
creates cuda_samples_directory
108+
end
109+
86110
cookbook_file 'blacklist-nouveau.conf' do
87111
source 'nvidia/blacklist-nouveau.conf'
88112
path '/etc/modprobe.d/blacklist-nouveau.conf'

0 commit comments

Comments
 (0)