Skip to content

Commit 371a15e

Browse files
Add validation to make sure the ODCR has the right instance type and availability zone
Signed-off-by: Hanwen <[email protected]>
1 parent 7cd66eb commit 371a15e

File tree

9 files changed

+284
-1
lines changed

9 files changed

+284
-1
lines changed

cli/src/pcluster/aws/aws_api.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from pcluster.aws.imagebuilder import ImageBuilderClient
2121
from pcluster.aws.kms import KmsClient
2222
from pcluster.aws.logs import LogsClient
23+
from pcluster.aws.resource_groups import ResourceGroupsClient
2324
from pcluster.aws.route53 import Route53Client
2425
from pcluster.aws.s3 import S3Client
2526
from pcluster.aws.s3_resource import S3Resource
@@ -57,6 +58,7 @@ def __init__(self):
5758
self._logs = None
5859
self._route53 = None
5960
self._secretsmanager = None
61+
self._resource_groups = None
6062

6163
@property
6264
def cfn(self):
@@ -163,6 +165,13 @@ def secretsmanager(self):
163165
self._secretsmanager = SecretsManagerClient()
164166
return self._secretsmanager
165167

168+
@property
169+
def resource_groups(self):
170+
"""Resource Groups client."""
171+
if not self._resource_groups:
172+
self._resource_groups = ResourceGroupsClient()
173+
return self._resource_groups
174+
166175
@staticmethod
167176
def instance():
168177
"""Return the singleton AWSApi instance."""

cli/src/pcluster/aws/ec2.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ def __init__(self):
3535
self.additional_instance_types_data = {}
3636
self.security_groups_cache = {}
3737
self.subnets_cache = {}
38+
self.capacity_reservations_cache = {}
3839

3940
@AWSExceptionHandler.handle_client_exception
4041
@Cache.cached
@@ -83,6 +84,30 @@ def describe_subnets(self, subnet_ids):
8384
result.append(subnet)
8485
return result
8586

87+
@AWSExceptionHandler.handle_client_exception
88+
def describe_capacity_reservations(self, capacity_reservation_ids):
89+
"""Return a list of Capacity Reservations."""
90+
result = []
91+
missed_capacity_reservations = []
92+
for capacity_reservation_id in capacity_reservation_ids:
93+
cached_data = self.capacity_reservations_cache.get(capacity_reservation_id)
94+
if cached_data:
95+
result.append(cached_data)
96+
else:
97+
missed_capacity_reservations.append(capacity_reservation_id)
98+
if missed_capacity_reservations:
99+
response = list(
100+
self._paginate_results(
101+
self._client.describe_capacity_reservations, CapacityReservationIds=missed_capacity_reservations
102+
)
103+
)
104+
for capacity_reservation in response:
105+
self.capacity_reservations_cache[
106+
capacity_reservation.get("CapacityReservationId")
107+
] = capacity_reservation
108+
result.append(capacity_reservation)
109+
return result
110+
86111
@AWSExceptionHandler.handle_client_exception
87112
@Cache.cached
88113
def get_subnet_avail_zone(self, subnet_id):
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
4+
# with the License. A copy of the License is located at
5+
#
6+
# http://aws.amazon.com/apache2.0/
7+
#
8+
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
9+
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
10+
# limitations under the License.
11+
import re
12+
13+
from pcluster.aws.common import AWSExceptionHandler, Boto3Client, Cache
14+
15+
16+
class ResourceGroupsClient(Boto3Client):
17+
"""Implement Resource Groups Boto3 client."""
18+
19+
def __init__(self):
20+
super().__init__("resource-groups")
21+
22+
@AWSExceptionHandler.handle_client_exception
23+
@Cache.cached
24+
def get_capacity_reservation_ids_from_group_resources(self, group):
25+
"""Return a list of capacity reservation ids."""
26+
capacity_reservation_ids = []
27+
resources = self._client.list_group_resources(Group=group)["Resources"]
28+
for resource in resources:
29+
if resource["Identifier"]["ResourceType"] == "AWS::EC2::CapacityReservation":
30+
capacity_reservation_ids.append(re.match("(.*)(cr-.*)", resource["Identifier"]["ResourceArn"]).group(2))
31+
return capacity_reservation_ids

cli/src/pcluster/config/cluster_config.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,8 @@
114114
)
115115
from pcluster.validators.ec2_validators import (
116116
AmiOsCompatibleValidator,
117+
CapacityReservationResourceGroupValidator,
118+
CapacityReservationValidator,
117119
CapacityTypeValidator,
118120
InstanceTypeBaseAMICompatibleValidator,
119121
InstanceTypeMemoryInfoValidator,
@@ -2439,6 +2441,8 @@ def __init__(self, cluster_name: str, scheduling: SlurmScheduling, **kwargs):
24392441
super().__init__(cluster_name, **kwargs)
24402442
self.scheduling = scheduling
24412443
self.__image_dict = None
2444+
# Cache capacity reservations information together to reduce number of boto3 calls
2445+
AWSApi.instance().ec2.describe_capacity_reservations(self.all_relevant_capacity_reservation_ids)
24422446

24432447
def get_instance_types_data(self):
24442448
"""Get instance type infos for all instance types used in the configuration file."""
@@ -2477,6 +2481,27 @@ def _register_validators(self):
24772481
instance_type=instance_type,
24782482
instance_type_data=instance_types_data[compute_resource.instance_type],
24792483
)
2484+
self._register_validator(
2485+
InstanceTypeMemoryInfoValidator,
2486+
instance_type=compute_resource.instance_type,
2487+
instance_type_data=instance_types_data[compute_resource.instance_type],
2488+
)
2489+
# The validation below has to be in cluster config class instead of queue class
2490+
# to make sure the subnet APIs are cached by previous validations.
2491+
if compute_resource.capacity_reservation_target:
2492+
cr_target = compute_resource.capacity_reservation_target
2493+
self._register_validator(
2494+
CapacityReservationValidator,
2495+
capacity_reservation_id=cr_target.capacity_reservation_id,
2496+
instance_type=compute_resource.instance_type,
2497+
subnet=queue.networking.subnet_ids[0],
2498+
)
2499+
self._register_validator(
2500+
CapacityReservationResourceGroupValidator,
2501+
capacity_reservation_resource_group_arn=cr_target.capacity_reservation_resource_group_arn,
2502+
instance_type=compute_resource.instance_type,
2503+
subnet=queue.networking.subnet_ids[0],
2504+
)
24802505

24812506
@property
24822507
def image_dict(self):
@@ -2519,3 +2544,15 @@ def capacity_reservation_resource_group_arns(self):
25192544
if capacity_reservation_target.capacity_reservation_resource_group_arn:
25202545
result.add(capacity_reservation_target.capacity_reservation_resource_group_arn)
25212546
return list(result)
2547+
2548+
@property
2549+
def all_relevant_capacity_reservation_ids(self):
2550+
"""Return a list of capacity reservation ids specified in the config or used by resource groups."""
2551+
capacity_reservation_ids = set(self.capacity_reservation_ids)
2552+
for capacity_reservation_resource_group_arn in self.capacity_reservation_resource_group_arns:
2553+
capacity_reservation_ids.update(
2554+
AWSApi.instance().resource_groups.get_capacity_reservation_ids_from_group_resources(
2555+
capacity_reservation_resource_group_arn
2556+
)
2557+
)
2558+
return list(capacity_reservation_ids)

cli/src/pcluster/validators/ec2_validators.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,3 +165,49 @@ def _validate(self, os: str, image_id: str):
165165
f"they are compatible before cluster creation and update operations.",
166166
FailureLevel.WARNING,
167167
)
168+
169+
170+
class CapacityReservationValidator(Validator):
171+
"""Validate capacity reservation can be used with the instance type and subnet."""
172+
173+
def _validate(self, capacity_reservation_id: str, instance_type: str, subnet: str):
174+
if capacity_reservation_id:
175+
capacity_reservation = AWSApi.instance().ec2.describe_capacity_reservations([capacity_reservation_id])[0]
176+
if capacity_reservation["InstanceType"] != instance_type:
177+
self._add_failure(
178+
f"Capacity reservation {capacity_reservation_id} must has the same instance type "
179+
f"as {instance_type}.",
180+
FailureLevel.ERROR,
181+
)
182+
if capacity_reservation["AvailabilityZone"] != AWSApi.instance().ec2.get_subnet_avail_zone(subnet):
183+
self._add_failure(
184+
f"Capacity reservation {capacity_reservation_id} must use the same availability zone "
185+
f"as subnet {subnet}.",
186+
FailureLevel.ERROR,
187+
)
188+
189+
190+
class CapacityReservationResourceGroupValidator(Validator):
191+
"""Validate at least one capacity reservation in the resource group can be used with the instance and subnet."""
192+
193+
def _validate(self, capacity_reservation_resource_group_arn: str, instance_type: str, subnet: str):
194+
if capacity_reservation_resource_group_arn:
195+
capacity_reservation_ids = (
196+
AWSApi.instance().resource_groups.get_capacity_reservation_ids_from_group_resources(
197+
capacity_reservation_resource_group_arn
198+
)
199+
)
200+
capacity_reservations = AWSApi.instance().ec2.describe_capacity_reservations(capacity_reservation_ids)
201+
found_qualified_capacity_reservation = False
202+
for capacity_reservation in capacity_reservations:
203+
if capacity_reservation["InstanceType"] == instance_type and capacity_reservation[
204+
"AvailabilityZone"
205+
] == AWSApi.instance().ec2.get_subnet_avail_zone(subnet):
206+
found_qualified_capacity_reservation = True
207+
break
208+
if not found_qualified_capacity_reservation:
209+
self._add_failure(
210+
f"Capacity reservation resource group {capacity_reservation_resource_group_arn} must have at least "
211+
f"one capacity reservation for {instance_type} in the same availability zone as subnet {subnet}.",
212+
FailureLevel.ERROR,
213+
)

cli/tests/pcluster/aws/dummy_aws_api.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from pcluster.aws.imagebuilder import ImageBuilderClient
2121
from pcluster.aws.kms import KmsClient
2222
from pcluster.aws.logs import LogsClient
23+
from pcluster.aws.resource_groups import ResourceGroupsClient
2324
from pcluster.aws.route53 import Route53Client
2425
from pcluster.aws.s3 import S3Client
2526
from pcluster.aws.s3_resource import S3Resource
@@ -100,6 +101,7 @@ def __init__(self):
100101
self._logs = _DummyLogsClient()
101102
self._ddb_resource = _DummyDynamoResource()
102103
self._route53 = _DummyRoute53Client()
104+
self._resource_groups = _DummyResourceGroupsClient()
103105

104106

105107
class _DummyCfnClient(CfnClient):
@@ -111,7 +113,12 @@ def __init__(self):
111113
class _DummyEc2Client(Ec2Client):
112114
def __init__(self):
113115
"""Override Parent constructor. No real boto3 client is created."""
114-
pass
116+
self.capacity_reservations_cache = {
117+
"cr-54321fcdbd5971234": {"InstanceType": "t2.micro", "AvailabilityZone": "string"},
118+
"cr-321456cdbd597f551": {"InstanceType": "t2.micro", "AvailabilityZone": "string"},
119+
"cr-123": {"InstanceType": "t2.micro", "AvailabilityZone": "string"},
120+
"cr-234": {"InstanceType": "t2.micro", "AvailabilityZone": "string"},
121+
}
115122

116123
def get_official_image_id(self, os, architecture, filters=None):
117124
return "dummy-ami-id"
@@ -263,6 +270,16 @@ def __init__(self):
263270
pass
264271

265272

273+
class _DummyResourceGroupsClient(ResourceGroupsClient):
274+
def __init__(self):
275+
"""Override Parent constructor. No real boto3 client is created."""
276+
pass
277+
278+
def get_capacity_reservation_ids_from_group_resources(self, group):
279+
"""Return a list of capacity reservation ids."""
280+
return ["cr-123", "cr-234"]
281+
282+
266283
def mock_aws_api(mocker, mock_instance_type_info=True):
267284
"""Mock AWS Api."""
268285
mocker.patch("pcluster.aws.aws_api.AWSApi.instance", return_value=_DummyAWSApi())

cli/tests/pcluster/aws/test_ec2.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -368,6 +368,53 @@ def test_describe_subnets_cache(boto3_stubber):
368368
assert_that(AWSApi.instance().ec2.describe_subnets([subnet])[0]["State"]).is_equal_to("available")
369369

370370

371+
def get_describe_capacity_reservation_mocked_request(capacity_reservations, state):
372+
return MockedBoto3Request(
373+
method="describe_capacity_reservations",
374+
response={
375+
"CapacityReservations": [
376+
{"CapacityReservationId": capacity_reservation, "State": state}
377+
for capacity_reservation in capacity_reservations
378+
]
379+
},
380+
expected_params={"CapacityReservationIds": capacity_reservations},
381+
)
382+
383+
384+
def test_describe_capacity_reservations_cache(boto3_stubber):
385+
# First boto3 call. Nothing has been cached
386+
capacity_reservation = "cr-123"
387+
additional_capacity_reservation = "cr-234"
388+
# The first mocked request and the third are about the same cr. However, the state of the cr changes
389+
# from pending to available. The second mocked request is about another cr
390+
mocked_requests = [
391+
get_describe_capacity_reservation_mocked_request([capacity_reservation], "pending"),
392+
get_describe_capacity_reservation_mocked_request([additional_capacity_reservation], "pending"),
393+
get_describe_capacity_reservation_mocked_request([capacity_reservation], "active"),
394+
]
395+
boto3_stubber("ec2", mocked_requests)
396+
assert_that(AWSApi.instance().ec2.describe_capacity_reservations([capacity_reservation])[0]["State"]).is_equal_to(
397+
"pending"
398+
)
399+
400+
# Second boto3 call with more subnets. The cr already cached should not be included in the boto3 call.
401+
response = AWSApi.instance().ec2.describe_capacity_reservations(
402+
[capacity_reservation, additional_capacity_reservation]
403+
)
404+
assert_that(response).is_length(2)
405+
406+
# Third boto3 call. The result should be from cache even if the state of the cr is different
407+
assert_that(AWSApi.instance().ec2.describe_capacity_reservations([capacity_reservation])[0]["State"]).is_equal_to(
408+
"pending"
409+
)
410+
411+
# Fourth boto3 call after resetting the AWSApi instance. The latest cr state should be retrieved from boto3
412+
AWSApi.reset()
413+
assert_that(AWSApi.instance().ec2.describe_capacity_reservations([capacity_reservation])[0]["State"]).is_equal_to(
414+
"active"
415+
)
416+
417+
371418
def get_describe_security_groups_mocked_request(security_groups, ip_permissions):
372419
return MockedBoto3Request(
373420
method="describe_security_groups",

cli/tests/pcluster/validators/test_all_validators/test_slurm_all_validators_are_called/slurm_2.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,10 @@ Scheduling:
2020
ComputeResources:
2121
- Name: compute_resource1
2222
InstanceType: c5.xlarge
23+
- Name: compute_resource2
24+
InstanceType: t2.micro
25+
CapacityReservationTarget:
26+
CapacityReservationId: cr-54321fcdbd5971234
2327
SharedStorage:
2428
- MountDir: /my/mount/point2
2529
Name: name1

0 commit comments

Comments
 (0)