Skip to content

Commit 33d5a02

Browse files
feat: add support to use custom scale errors
1 parent f1d156e commit 33d5a02

File tree

15 files changed

+61
-1
lines changed

15 files changed

+61
-1
lines changed

lambdas/functions/control-plane/src/aws/runners.d.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,4 +44,5 @@ export interface RunnerInputParameters {
4444
amiIdSsmParameterName?: string;
4545
tracingEnabled?: boolean;
4646
onDemandFailoverOnError?: string[];
47+
customScaleErrors?: string[];
4748
}

lambdas/functions/control-plane/src/aws/runners.test.ts

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -418,6 +418,17 @@ describe('create runner with errors', () => {
418418
expect(mockSSMClient).not.toHaveReceivedCommand(PutParameterCommand);
419419
});
420420

421+
it('test ScaleError with custom scale error.', async () => {
422+
createFleetMockWithErrors(['CustomAWSError']);
423+
424+
await expect(createRunner(createRunnerConfig({ ...defaultRunnerConfig, customScaleErrors: ['CustomAWSError'] }))).rejects.toBeInstanceOf(ScaleError);
425+
expect(mockEC2Client).toHaveReceivedCommandWith(
426+
CreateFleetCommand,
427+
expectedCreateFleetRequest(defaultExpectedFleetRequestValues),
428+
);
429+
expect(mockSSMClient).not.toHaveReceivedCommand(PutParameterCommand);
430+
});
431+
421432
it('test ScaleError with multiple error.', async () => {
422433
createFleetMockWithErrors(['UnfulfillableCapacity', 'SomeError']);
423434

@@ -638,6 +649,7 @@ interface RunnerConfig {
638649
amiIdSsmParameterName?: string;
639650
tracingEnabled?: boolean;
640651
onDemandFailoverOnError?: string[];
652+
customScaleErrors?: string[];
641653
}
642654

643655
function createRunnerConfig(runnerConfig: RunnerConfig): RunnerInputParameters {
@@ -657,6 +669,7 @@ function createRunnerConfig(runnerConfig: RunnerConfig): RunnerInputParameters {
657669
amiIdSsmParameterName: runnerConfig.amiIdSsmParameterName,
658670
tracingEnabled: runnerConfig.tracingEnabled,
659671
onDemandFailoverOnError: runnerConfig.onDemandFailoverOnError,
672+
customScaleErrors: runnerConfig.customScaleErrors,
660673
};
661674
}
662675

lambdas/functions/control-plane/src/aws/runners.ts

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,7 @@ async function processFleetResult(
177177

178178
// Educated guess of errors that would make sense to retry based on the list
179179
// https://docs.aws.amazon.com/AWSEC2/latest/APIReference/errors-overview.html
180-
const scaleErrors = [
180+
const defaultScaleErrors = [
181181
'UnfulfillableCapacity',
182182
'MaxSpotInstanceCountExceeded',
183183
'TargetCapacityLimitExceededException',
@@ -188,6 +188,11 @@ async function processFleetResult(
188188
'InsufficientInstanceCapacity',
189189
];
190190

191+
const scaleErrors =
192+
runnerParameters.customScaleErrors && runnerParameters.customScaleErrors.length > 0
193+
? runnerParameters.customScaleErrors
194+
: defaultScaleErrors;
195+
191196
if (
192197
errors.some((e) => runnerParameters.onDemandFailoverOnError?.includes(e)) &&
193198
runnerParameters.ec2instanceCriteria.targetCapacityType === 'spot'

lambdas/functions/control-plane/src/pool/pool.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,9 @@ export async function adjust(event: PoolEvent): Promise<void> {
4141
const onDemandFailoverOnError = process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS
4242
? (JSON.parse(process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS) as [string])
4343
: [];
44+
const customScaleErrors = process.env.CUSTOM_SCALE_ERRORS
45+
? (JSON.parse(process.env.CUSTOM_SCALE_ERRORS) as [string])
46+
: [];
4447

4548
const { ghesApiUrl, ghesBaseUrl } = getGitHubEnterpriseApiUrl();
4649

@@ -96,6 +99,7 @@ export async function adjust(event: PoolEvent): Promise<void> {
9699
amiIdSsmParameterName,
97100
tracingEnabled,
98101
onDemandFailoverOnError,
102+
customScaleErrors
99103
},
100104
githubInstallationClient,
101105
);

lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ const EXPECTED_RUNNER_PARAMS: RunnerInputParameters = {
9898
subnets: ['subnet-123'],
9999
tracingEnabled: false,
100100
onDemandFailoverOnError: [],
101+
customScaleErrors: [],
101102
};
102103
let expectedRunnerParams: RunnerInputParameters;
103104

@@ -115,6 +116,7 @@ function setDefaults() {
115116
process.env.INSTANCE_TYPES = 'm5.large';
116117
process.env.INSTANCE_TARGET_CAPACITY_TYPE = 'spot';
117118
process.env.ENABLE_ON_DEMAND_FAILOVER = undefined;
119+
process.env.CUSTOM_SCALE_ERRORS = undefined;
118120
}
119121

120122
beforeEach(() => {
@@ -587,6 +589,16 @@ describe('scaleUp with public GH', () => {
587589
});
588590
});
589591

592+
it('creates a runner with correct config and labels and custom scale errors enabled.', async () => {
593+
process.env.RUNNER_LABELS = 'label1,label2';
594+
process.env.CUSTOM_SCALE_ERRORS = JSON.stringify(['RequestLimitExceeded']);
595+
await scaleUpModule.scaleUp('aws:sqs', TEST_DATA);
596+
expect(createRunner).toBeCalledWith({
597+
...expectedRunnerParams,
598+
customScaleErrors: ['RequestLimitExceeded'],
599+
});
600+
});
601+
590602
it('creates a runner and ensure the group argument is ignored', async () => {
591603
process.env.RUNNER_LABELS = 'label1,label2';
592604
process.env.RUNNER_GROUP_NAME = 'TEST_GROUP_IGNORED';

lambdas/functions/control-plane/src/scale-runners/scale-up.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ interface CreateEC2RunnerConfig {
6060
amiIdSsmParameterName?: string;
6161
tracingEnabled?: boolean;
6262
onDemandFailoverOnError?: string[];
63+
customScaleErrors?: string[];
6364
}
6465

6566
function generateRunnerServiceConfig(githubRunnerConfig: CreateGitHubRunnerConfig, token: string) {
@@ -251,6 +252,9 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage
251252
const onDemandFailoverOnError = process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS
252253
? (JSON.parse(process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS) as [string])
253254
: [];
255+
const customScaleErrors = process.env.CUSTOM_SCALE_ERRORS
256+
? (JSON.parse(process.env.CUSTOM_SCALE_ERRORS) as [string])
257+
: [];
254258

255259
if (ephemeralEnabled && payload.eventType !== 'workflow_job') {
256260
logger.warn(`${payload.eventType} event is not supported in combination with ephemeral runners.`);
@@ -335,6 +339,7 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage
335339
amiIdSsmParameterName,
336340
tracingEnabled,
337341
onDemandFailoverOnError,
342+
customScaleErrors,
338343
},
339344
githubInstallationClient,
340345
);

main.tf

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,7 @@ module "runners" {
191191
enable_jit_config = var.enable_jit_config
192192
enable_job_queued_check = var.enable_job_queued_check
193193
enable_on_demand_failover_for_errors = var.enable_runner_on_demand_failover_for_errors
194+
custom_scale_errors = var.custom_scale_errors
194195
disable_runner_autoupdate = var.disable_runner_autoupdate
195196
enable_managed_runner_security_group = var.enable_managed_runner_security_group
196197
enable_runner_detailed_monitoring = var.enable_runner_detailed_monitoring

modules/multi-runner/runners.tf

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ module "runners" {
3535
github_app_parameters = local.github_app_parameters
3636
ebs_optimized = each.value.runner_config.ebs_optimized
3737
enable_on_demand_failover_for_errors = each.value.runner_config.enable_on_demand_failover_for_errors
38+
custom_scale_errors = each.value.runner_config.custom_scale_errors
3839
enable_organization_runners = each.value.runner_config.enable_organization_runners
3940
enable_ephemeral_runners = each.value.runner_config.enable_ephemeral_runners
4041
enable_jit_config = each.value.runner_config.enable_jit_config

modules/multi-runner/variables.tf

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ variable "multi_runner_config" {
8484
enable_ephemeral_runners = optional(bool, false)
8585
enable_job_queued_check = optional(bool, null)
8686
enable_on_demand_failover_for_errors = optional(list(string), [])
87+
custom_scale_errors = optional(list(string), [])
8788
enable_organization_runners = optional(bool, false)
8889
enable_runner_binaries_syncer = optional(bool, true)
8990
enable_ssm_on_runners = optional(bool, false)
@@ -193,6 +194,7 @@ variable "multi_runner_config" {
193194
enable_ephemeral_runners: "Enable ephemeral runners, runners will only be used once."
194195
enable_job_queued_check: "Enables JIT configuration for creating runners instead of registration token based registration. JIT configuration will only be applied for ephemeral runners. By default JIT configuration is enabled for ephemeral runners and can be disabled via this override. When running on GHES without support for JIT configuration this variable should be set to true for ephemeral runners."
195196
enable_on_demand_failover_for_errors: "Enable on-demand failover. For example to fall back to on demand when no spot capacity is available the variable can be set to `InsufficientInstanceCapacity`. When not defined the default behavior is to retry later."
197+
custom_scale_errors: "List of custom AWS error codes that should trigger a retry during scale-up."
196198
enable_organization_runners: "Register runners to organization, instead of repo level"
197199
enable_runner_binaries_syncer: "Option to disable the lambda to sync GitHub runner distribution, useful when using a pre-build AMI."
198200
enable_ssm_on_runners: "Enable to allow access the runner instances for debugging purposes via SSM. Note that this adds additional permissions to the runner instances."

modules/runners/pool.tf

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ module "pool" {
4242
ephemeral = var.enable_ephemeral_runners
4343
enable_jit_config = var.enable_jit_config
4444
enable_on_demand_failover_for_errors = var.enable_on_demand_failover_for_errors
45+
custom_scale_errors = var.custom_scale_errors
4546
boot_time_in_minutes = var.runner_boot_time_in_minutes
4647
labels = var.runner_labels
4748
launch_template = aws_launch_template.runner

0 commit comments

Comments
 (0)