Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,14 @@ aws-parallelcluster-cookbook CHANGELOG

This file is used to list changes made in each version of the AWS ParallelCluster cookbook.

3.15.0
------

**CHANGES**
1. Add chef attribute `cluster/in_place_update_on_fleet_enabled` (default `'true'`); set it to `'false'` to disable in-place updates on compute and login nodes
   and achieve better performance at scale.


3.14.0
------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
dcv_port: node['cluster']['dcv_port'],
dcv_auth_certificate: node['cluster']['dcv']['authenticator']['certificate'],
dcv_auth_private_key: node['cluster']['dcv']['authenticator']['private_key'],
dcv_auth_user: node['cluster']['dcv']['authenticator']['user']
dcv_auth_user: node['cluster']['dcv']['authenticator']['user'],
cfnhup_enabled: cfnhup_enabled?
)
end
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,28 @@
end
end

# Head node converged with in-place updates disabled on the fleet: the rendered
# supervisord configuration is still expected to contain the cfn-hup program.
context "when head node and cfn-hup disabled on fleet" do
cached(:chef_run) do
runner = runner(platform: platform, version: version) do |node|
node.override['cluster']['node_type'] = 'HeadNode'
node.override['cluster']['dcv_enabled'] = 'head_node'
# Turn the fleet-wide in-place update flag off for this scenario.
node.override['cluster']['in_place_update_on_fleet_enabled'] = 'false'
# Stub the helper so DCV is reported as installed.
allow_any_instance_of(Object).to receive(:dcv_installed?).and_return(true)
end
runner.converge(described_recipe)
end
cached(:node) { chef_run.node }

it 'has the correct content' do
is_expected.to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf')
.with_content("[program:cfn-hup]")
.with_content("[program:clustermgtd]")
.with_content("[program:clusterstatusmgtd]")
.with_content("[program:pcluster_dcv_authenticator]")
.with_content("--port 8444")
end
end

context "when compute fleet" do
cached(:chef_run) do
runner = runner(platform: platform, version: version) do |node|
Expand All @@ -77,6 +99,25 @@
.with_content("[program:pcluster_dcv_authenticator]")
end
end

# Compute node converged with in-place updates disabled on the fleet: computemgtd
# must be rendered while the cfn-hup program must be absent.
context "when compute fleet with cfn-hup disabled on fleet" do
cached(:chef_run) do
runner = runner(platform: platform, version: version) do |node|
node.override['cluster']['node_type'] = 'ComputeFleet'
# Turn the fleet-wide in-place update flag off for this scenario.
node.override['cluster']['in_place_update_on_fleet_enabled'] = 'false'
end
runner.converge(described_recipe)
end
cached(:node) { chef_run.node }

it 'has the correct content' do
is_expected.to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf')
.with_content("[program:computemgtd]")

# Negative check: no cfn-hup section in the rendered file.
is_expected.not_to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf')
.with_content("[program:cfn-hup]")
end
end
context "when login node and dcv configured" do
cached(:chef_run) do
runner = runner(platform: platform, version: version) do |node|
Expand Down Expand Up @@ -109,12 +150,32 @@

it 'has the correct content' do
is_expected.to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf')
.with_content("[program:cfn-hup]")
.with_content("[program:loginmgtd]")

is_expected.not_to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf')
.with_content("[program:pcluster_dcv_authenticator]")
end
end

# Login node converged with in-place updates disabled on the fleet: loginmgtd
# must be rendered while the cfn-hup program must be absent.
context "when login node with cfn-hup disabled on fleet" do
cached(:chef_run) do
runner = runner(platform: platform, version: version) do |node|
node.override['cluster']['node_type'] = 'LoginNode'
# Turn the fleet-wide in-place update flag off for this scenario.
node.override['cluster']['in_place_update_on_fleet_enabled'] = 'false'
end
runner.converge(described_recipe)
end
cached(:node) { chef_run.node }

it 'has the correct content' do
is_expected.to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf')
.with_content("[program:loginmgtd]")

# Negative check: no cfn-hup section in the rendered file.
is_expected.not_to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf')
.with_content("[program:cfn-hup]")
end
end
end
end
end
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
# Generated by Chef for AWS ParallelCluster <%= node['cluster']['node_type'] -%>
# Local modifications could be overwritten.
<%# HeadNode, ComputeFleet, LoginNode -%>
<% case node['cluster']['node_type'] -%>
<% when 'HeadNode', 'ComputeFleet', 'LoginNode' -%>
<% if @cfnhup_enabled -%>
[program:cfn-hup]
command = <%= node['cluster']['scripts_dir']%>/cfn-hup-runner.sh
autorestart = true
Expand Down
3 changes: 3 additions & 0 deletions cookbooks/aws-parallelcluster-shared/attributes/cluster.rb
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,6 @@

# Default NFS mount options
default['cluster']['nfs']['hard_mount_options'] = 'hard,_netdev,noatime'

# Cluster Updates
# Flag controlling in-place updates on the fleet (compute and login nodes).
# The value is a string rather than a boolean: the cfnhup_enabled? and
# cluster_readiness_check_on_update_enabled? helpers compare it against 'true'.
default['cluster']['in_place_update_on_fleet_enabled'] = 'true'
11 changes: 11 additions & 0 deletions cookbooks/aws-parallelcluster-shared/libraries/helpers.rb
Original file line number Diff line number Diff line change
Expand Up @@ -106,3 +106,14 @@ def wait_sync_file(path)
timeout 5
end
end

# Tells whether cfn-hup must be enabled on the current node.
#
# cfn-hup is always enabled on the head node, as it is required to perform cluster updates.
# cfn-hup can be disabled on compute nodes and login nodes, limiting the cluster update in the sense that
# live updates on compute and login nodes are not possible.
#
# The `cluster/in_place_update_on_fleet_enabled` attribute is normalized with `to_s` before the
# comparison, so a boolean `true` (for instance when the attribute is overridden with a JSON boolean)
# is honored exactly like the string 'true'. Any other value, including nil, counts as disabled.
#
# @return [Boolean] true on the head node, or when in-place updates are enabled on the fleet
def cfnhup_enabled?
  # The head node never looks at the fleet flag.
  return true if node['cluster']['node_type'] == 'HeadNode'

  node['cluster']['in_place_update_on_fleet_enabled'].to_s == 'true'
end

# Tells whether the cluster readiness check must be performed during an update.
#
# The check follows the `cluster/in_place_update_on_fleet_enabled` attribute. The value is
# normalized with `to_s` before the comparison, so a boolean `true` is honored exactly like
# the string 'true'. Any other value, including nil, counts as disabled.
#
# @return [Boolean] true when in-place updates are enabled on the fleet
def cluster_readiness_check_on_update_enabled?
  node['cluster']['in_place_update_on_fleet_enabled'].to_s == 'true'
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
require_relative '../../../libraries/helpers'
require 'spec_helper'

# Unit tests for the cfnhup_enabled? helper.
describe 'cfnhup_enabled?' do
  let(:node) { Chef::Node.new }

  # The head node reports cfn-hup as enabled even when the fleet flag is 'false'.
  context 'when node type is HeadNode' do
    before do
      node.override['cluster']['node_type'] = 'HeadNode'
    end

    it 'returns true regardless of in_place_update_on_fleet_enabled setting' do
      node.override['cluster']['in_place_update_on_fleet_enabled'] = 'false'
      expect(cfnhup_enabled?).to be true
    end
  end

  # Fleet nodes follow the flag: one example per (node type, flag value) pair.
  %w(ComputeFleet LoginNode).each do |fleet_node_type|
    context "when node type is #{fleet_node_type}" do
      before do
        node.override['cluster']['node_type'] = fleet_node_type
      end

      { 'true' => true, 'false' => false }.each do |flag, expected|
        it "returns #{expected} when in_place_update_on_fleet_enabled is #{flag}" do
          node.override['cluster']['in_place_update_on_fleet_enabled'] = flag
          expect(cfnhup_enabled?).to be expected
        end
      end
    end
  end
end

# Unit tests for the cluster_readiness_check_on_update_enabled? helper.
describe 'cluster_readiness_check_on_update_enabled?' do
  let(:node) { Chef::Node.new }

  # Maps the string stored in the attribute to the boolean the helper must return.
  { 'true' => true, 'false' => false }.each do |attribute_value, expected|
    it "returns #{expected} when in_place_update_on_fleet_enabled is #{attribute_value}" do
      node.override['cluster']['in_place_update_on_fleet_enabled'] = attribute_value
      expect(cluster_readiness_check_on_update_enabled?).to be expected
    end
  end
end
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,7 @@ def update_nodes_in_queue(strategy, queues)

chef_sleep '15'

wait_cluster_ready
wait_cluster_ready if cluster_readiness_check_on_update_enabled?

execute 'start clustermgtd' do
command "#{cookbook_virtualenv_path}/bin/supervisorctl start clustermgtd"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
allow_any_instance_of(Object).to receive(:are_mount_or_unmount_required?).and_return(are_mount_or_unmount_required)
allow_any_instance_of(Object).to receive(:dig).and_return(true)
allow_any_instance_of(Object).to receive(:cookbook_virtualenv_path).and_return(cookbook_venv_path)
allow_any_instance_of(Object).to receive(:cluster_readiness_check_on_update_enabled?).and_return(true)
RSpec::Mocks.configuration.allow_message_expectations_on_nil = true

node.override['cluster']['stack_name'] = cluster_name
Expand Down Expand Up @@ -58,6 +59,27 @@
end
end
end

# Converges the recipe with cluster_readiness_check_on_update_enabled? stubbed to false
# and verifies that the "Check cluster readiness" execute resource does not run.
context 'when cluster readiness check is disabled' do
cached(:chef_run) do
runner = runner(platform: platform, version: version) do |node|
# Helper stubs applied to every object for this converge.
allow_any_instance_of(Object).to receive(:are_mount_or_unmount_required?).and_return(false)
allow_any_instance_of(Object).to receive(:dig).and_return(true)
allow_any_instance_of(Object).to receive(:cookbook_virtualenv_path).and_return(cookbook_venv_path)
# The scenario under test: readiness check reported as disabled.
allow_any_instance_of(Object).to receive(:cluster_readiness_check_on_update_enabled?).and_return(false)
RSpec::Mocks.configuration.allow_message_expectations_on_nil = true

node.override['cluster']['stack_name'] = cluster_name
node.override['cluster']['region'] = region
node.override['cluster']['cluster_config_version'] = cluster_config_version
node.override['cluster']['scripts_dir'] = scripts_dir
end
runner.converge(described_recipe)
end
it 'does not check cluster readiness' do
is_expected.not_to run_execute("Check cluster readiness")
end
end
end
end
end
Loading