Skip to content

Commit 5bc1451

Browse files
authored
Merge branch 'develop' into release-3.14.1
2 parents 06b5b48 + 9457ba4 commit 5bc1451

File tree

7 files changed

+29
-4
lines changed

7 files changed

+29
-4
lines changed

.flake8

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@ ignore =
1818
W503,
1919
# N818: exception name should be named with an Error suffix
2020
N818
21+
# B042: Exception class with `__init__` should pass all args to `super().__init__()` in order to work with `copy.copy()`.
22+
# Ignored because it is affected by a false positive, see https://github.com/PyCQA/flake8-bugbear/issues/525
23+
B042
2124
exclude =
2225
.tox,
2326
.git,

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,12 @@ aws-parallelcluster-node CHANGELOG
33

44
This file is used to list changes made in each version of the aws-parallelcluster-node package.
55

6+
3.15.0
7+
------
8+
9+
**CHANGES**
10+
- Direct users to the slurm_resume log to see EC2 error codes if no instances are launched.
11+
612
3.14.0
713
------
814

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ def read(fname):
3232
"clustermgtd = slurm_plugin.clustermgtd:main",
3333
"computemgtd = slurm_plugin.computemgtd:main",
3434
]
35-
version = "3.14.0"
35+
version = "3.15.0"
3636
requires = ["boto3>=1.7.55", "retrying>=1.3.3"]
3737

3838
setup(

src/slurm_plugin/clustermgtd.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1262,7 +1262,8 @@ def _reset_timeout_expired_compute_resources(
12621262
return
12631263
log.info(
12641264
"The following compute resources are in down state due to insufficient capacity: %s, "
1265-
"compute resources will be reset after insufficient capacity timeout (%s seconds) expired",
1265+
"compute resources will be reset after insufficient capacity timeout (%s seconds) expired. "
1266+
"Check the slurm_resume log for EC2 error codes.",
12661267
self._insufficient_capacity_compute_resources,
12671268
self._config.insufficient_capacity_timeout,
12681269
)

src/slurm_plugin/resume.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -227,7 +227,11 @@ def _resume(arg_nodes, resume_config, slurm_resume):
227227
print_with_count(failed_nodes),
228228
)
229229
for error_code, node_list in instance_manager.failed_nodes.items():
230-
_handle_failed_nodes(node_list, reason=f"(Code:{error_code})Failure when resuming nodes")
230+
_handle_failed_nodes(
231+
node_list,
232+
reason=f"(Code:{error_code})Failure when resuming nodes - "
233+
f"Check the slurm_resume log for EC2 error codes",
234+
)
231235

232236
event_publisher = ClusterEventPublisher.create_with_default_publisher(
233237
event_logger,

tests/slurm_plugin/test_clustermgtd.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3533,6 +3533,13 @@ def test_reset_timeout_expired_compute_resources(
35333533
assert_that(cluster_manager._insufficient_capacity_compute_resources).is_equal_to(
35343534
expected_insufficient_capacity_compute_resources
35353535
)
3536+
3537+
if expected_insufficient_capacity_compute_resources:
3538+
assert (
3539+
"compute resources will be reset after insufficient capacity timeout (20 seconds) expired. "
3540+
"Check the slurm_resume log for EC2 error codes."
3541+
) in caplog.text
3542+
35363543
if expected_power_save_node_list:
35373544
power_save_mock.assert_called_with(
35383545
expected_power_save_node_list, reason="Enabling node since insufficient capacity timeout expired"

tests/slurm_plugin/test_resume.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -448,7 +448,11 @@ def test_resume_launch(
448448
if expected_failed_nodes:
449449
for error_code, nodeset in expected_failed_nodes.items():
450450
mock_handle_failed_nodes_calls.append(
451-
call(nodeset, reason=f"(Code:{error_code})Failure when resuming nodes")
451+
call(
452+
nodeset,
453+
reason=f"(Code:{error_code})Failure when resuming nodes - "
454+
f"Check the slurm_resume log for EC2 error codes",
455+
)
452456
)
453457
mock_handle_failed_nodes.assert_has_calls(mock_handle_failed_nodes_calls)
454458
mock_terminate_instances.assert_called_with(ANY, mock_resume_config.terminate_max_batch_size)

0 commit comments

Comments
 (0)