Skip to content

Commit 78350c2

Browse files
committed
fix force restart sequence
1 parent 16e2f81 commit 78350c2

1 file changed

Lines changed: 123 additions & 29 deletions

File tree

rla/internal/task/operationrules/resolver_defaults.go

Lines changed: 123 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -262,11 +262,14 @@ func buildPowerOffRule() *OperationRule {
262262
}
263263
}
264264

265-
// buildRestartRule creates the hardcoded default rule for restart operations
265+
// buildRestartRule creates the hardcoded default rule for graceful restart.
266+
// Each stage explicitly specifies the power operation to avoid inheriting the
267+
// composite "restart" operation from the task context (which would send
268+
// BMC GRACEFUL_RESTART — an atomic off→on — instead of separate off/on).
266269
func buildRestartRule() *OperationRule {
267270
return &OperationRule{
268271
Name: "Hardcoded Default Restart",
269-
Description: "Composite rule: power off all components then power on",
272+
Description: "Composite rule: graceful power off all components then power on",
270273
OperationType: common.TaskTypePowerControl,
271274
OperationCode: SequenceRestart,
272275
RuleDefinition: RuleDefinition{
@@ -276,7 +279,7 @@ func buildRestartRule() *OperationRule {
276279
{
277280
ComponentType: devicetypes.ComponentTypeCompute,
278281
Stage: 1,
279-
MaxParallel: 0, // All components together (legacy behavior)
282+
MaxParallel: 0,
280283
Timeout: 20 * time.Minute,
281284
RetryPolicy: &RetryPolicy{
282285
MaxAttempts: 3,
@@ -285,10 +288,12 @@ func buildRestartRule() *OperationRule {
285288
},
286289
MainOperation: ActionConfig{
287290
Name: ActionPowerControl,
291+
Parameters: map[string]any{
292+
ParamOperation: "power_off",
293+
},
288294
},
289295
PostOperation: []ActionConfig{
290296
{
291-
// Verify power status after operation
292297
Name: ActionVerifyPowerStatus,
293298
Timeout: 3 * time.Minute,
294299
PollInterval: 10 * time.Second,
@@ -301,7 +306,7 @@ func buildRestartRule() *OperationRule {
301306
{
302307
ComponentType: devicetypes.ComponentTypeNVLSwitch,
303308
Stage: 2,
304-
MaxParallel: 0, // All components together (legacy behavior)
309+
MaxParallel: 0,
305310
Timeout: 15 * time.Minute,
306311
RetryPolicy: &RetryPolicy{
307312
MaxAttempts: 3,
@@ -310,10 +315,12 @@ func buildRestartRule() *OperationRule {
310315
},
311316
MainOperation: ActionConfig{
312317
Name: ActionPowerControl,
318+
Parameters: map[string]any{
319+
ParamOperation: "power_off",
320+
},
313321
},
314322
PostOperation: []ActionConfig{
315323
{
316-
// Verify power status after operation
317324
Name: ActionVerifyPowerStatus,
318325
Timeout: 3 * time.Minute,
319326
PollInterval: 10 * time.Second,
@@ -326,7 +333,7 @@ func buildRestartRule() *OperationRule {
326333
{
327334
ComponentType: devicetypes.ComponentTypePowerShelf,
328335
Stage: 3,
329-
MaxParallel: 0, // All components together (legacy behavior)
336+
MaxParallel: 0,
330337
Timeout: 10 * time.Minute,
331338
RetryPolicy: &RetryPolicy{
332339
MaxAttempts: 3,
@@ -335,10 +342,12 @@ func buildRestartRule() *OperationRule {
335342
},
336343
MainOperation: ActionConfig{
337344
Name: ActionPowerControl,
345+
Parameters: map[string]any{
346+
ParamOperation: "power_off",
347+
},
338348
},
339349
PostOperation: []ActionConfig{
340350
{
341-
// Verify power status after operation
342351
Name: ActionVerifyPowerStatus,
343352
Timeout: 3 * time.Minute,
344353
PollInterval: 10 * time.Second,
@@ -352,7 +361,7 @@ func buildRestartRule() *OperationRule {
352361
{
353362
ComponentType: devicetypes.ComponentTypePowerShelf,
354363
Stage: 4,
355-
MaxParallel: 0, // All components together (legacy behavior)
364+
MaxParallel: 0,
356365
Timeout: 10 * time.Minute,
357366
RetryPolicy: &RetryPolicy{
358367
MaxAttempts: 3,
@@ -361,10 +370,12 @@ func buildRestartRule() *OperationRule {
361370
},
362371
MainOperation: ActionConfig{
363372
Name: ActionPowerControl,
373+
Parameters: map[string]any{
374+
ParamOperation: "power_on",
375+
},
364376
},
365377
PostOperation: []ActionConfig{
366378
{
367-
// Verify power status after operation
368379
Name: ActionVerifyPowerStatus,
369380
Timeout: 3 * time.Minute,
370381
PollInterval: 10 * time.Second,
@@ -373,8 +384,6 @@ func buildRestartRule() *OperationRule {
373384
},
374385
},
375386
{
376-
// Wait for downstream components to become
377-
// reachable
378387
Name: ActionVerifyReachability,
379388
Timeout: 3 * time.Minute,
380389
PollInterval: 10 * time.Second,
@@ -390,7 +399,7 @@ func buildRestartRule() *OperationRule {
390399
{
391400
ComponentType: devicetypes.ComponentTypeNVLSwitch,
392401
Stage: 5,
393-
MaxParallel: 0, // All components together (legacy behavior)
402+
MaxParallel: 0,
394403
Timeout: 15 * time.Minute,
395404
RetryPolicy: &RetryPolicy{
396405
MaxAttempts: 3,
@@ -399,10 +408,12 @@ func buildRestartRule() *OperationRule {
399408
},
400409
MainOperation: ActionConfig{
401410
Name: ActionPowerControl,
411+
Parameters: map[string]any{
412+
ParamOperation: "power_on",
413+
},
402414
},
403415
PostOperation: []ActionConfig{
404416
{
405-
// Verify power status after operation
406417
Name: ActionVerifyPowerStatus,
407418
Timeout: 3 * time.Minute,
408419
PollInterval: 10 * time.Second,
@@ -415,7 +426,7 @@ func buildRestartRule() *OperationRule {
415426
{
416427
ComponentType: devicetypes.ComponentTypeCompute,
417428
Stage: 6,
418-
MaxParallel: 0, // All components together (legacy behavior)
429+
MaxParallel: 0,
419430
Timeout: 20 * time.Minute,
420431
RetryPolicy: &RetryPolicy{
421432
MaxAttempts: 3,
@@ -424,10 +435,12 @@ func buildRestartRule() *OperationRule {
424435
},
425436
MainOperation: ActionConfig{
426437
Name: ActionPowerControl,
438+
Parameters: map[string]any{
439+
ParamOperation: "power_on",
440+
},
427441
},
428442
PostOperation: []ActionConfig{
429443
{
430-
// Verify power status after operation
431444
Name: ActionVerifyPowerStatus,
432445
Timeout: 3 * time.Minute,
433446
PollInterval: 10 * time.Second,
@@ -987,18 +1000,21 @@ func buildIngestRule() *OperationRule {
9871000
}
9881001
}
9891002

990-
// buildForceRestartRule creates the hardcoded default rule for
991-
// forced restart operations (no verification)
1003+
// buildForceRestartRule creates the hardcoded default rule for forced restart
1004+
// operations. Skips per-stage verification for speed but verifies the "off"
1005+
// state before proceeding to power on, ensuring a real power cycle occurs.
9921006
func buildForceRestartRule() *OperationRule {
9931007
return &OperationRule{
9941008
Name: "Hardcoded Default Force Restart",
995-
Description: "Forced restart: power off then on (no verification)",
1009+
Description: "Forced restart: power off, verify off, then power on",
9961010
OperationType: common.TaskTypePowerControl,
9971011
OperationCode: SequenceForceRestart,
9981012
RuleDefinition: RuleDefinition{
9991013
Version: CurrentRuleDefinitionVersion,
10001014
Steps: []SequenceStep{
10011015
// === Power Off Sequence (Stages 1-3) ===
1016+
// Explicit force_power_off to avoid sending BMC FORCE_RESTART
1017+
// (which is an atomic off→on cycle, not just off).
10021018
{
10031019
ComponentType: devicetypes.ComponentTypeCompute,
10041020
Stage: 1,
@@ -1011,6 +1027,9 @@ func buildForceRestartRule() *OperationRule {
10111027
},
10121028
MainOperation: ActionConfig{
10131029
Name: ActionPowerControl,
1030+
Parameters: map[string]any{
1031+
ParamOperation: "force_power_off",
1032+
},
10141033
},
10151034
PostOperation: []ActionConfig{
10161035
{
@@ -1033,6 +1052,9 @@ func buildForceRestartRule() *OperationRule {
10331052
},
10341053
MainOperation: ActionConfig{
10351054
Name: ActionPowerControl,
1055+
Parameters: map[string]any{
1056+
ParamOperation: "force_power_off",
1057+
},
10361058
},
10371059
PostOperation: []ActionConfig{
10381060
{
@@ -1055,22 +1077,86 @@ func buildForceRestartRule() *OperationRule {
10551077
},
10561078
MainOperation: ActionConfig{
10571079
Name: ActionPowerControl,
1080+
Parameters: map[string]any{
1081+
ParamOperation: "force_power_off",
1082+
},
10581083
},
10591084
PostOperation: []ActionConfig{
10601085
{
1061-
// Brief pause between off and on
10621086
Name: ActionSleep,
10631087
Parameters: map[string]any{
10641088
ParamDuration: 5 * time.Second,
10651089
},
10661090
},
10671091
},
10681092
},
1069-
// === Power On Sequence (Stages 4-6) ===
1093+
// === Verify Off Stage (Stage 4) ===
1094+
// Confirm all components are actually off before powering
1095+
// back on. Without this, a silent power-off failure would
1096+
// result in a "successful restart" that never power-cycled.
10701097
{
10711098
ComponentType: devicetypes.ComponentTypePowerShelf,
10721099
Stage: 4,
10731100
MaxParallel: 0,
1101+
Timeout: 2 * time.Minute,
1102+
RetryPolicy: &RetryPolicy{
1103+
MaxAttempts: 2,
1104+
InitialInterval: 5 * time.Second,
1105+
BackoffCoefficient: 1.5,
1106+
},
1107+
MainOperation: ActionConfig{
1108+
Name: ActionVerifyPowerStatus,
1109+
Timeout: 1 * time.Minute,
1110+
PollInterval: 5 * time.Second,
1111+
Parameters: map[string]any{
1112+
ParamExpectedStatus: "off",
1113+
},
1114+
},
1115+
},
1116+
{
1117+
ComponentType: devicetypes.ComponentTypeNVLSwitch,
1118+
Stage: 4, // Parallel with PowerShelf
1119+
MaxParallel: 0,
1120+
Timeout: 2 * time.Minute,
1121+
RetryPolicy: &RetryPolicy{
1122+
MaxAttempts: 2,
1123+
InitialInterval: 5 * time.Second,
1124+
BackoffCoefficient: 1.5,
1125+
},
1126+
MainOperation: ActionConfig{
1127+
Name: ActionVerifyPowerStatus,
1128+
Timeout: 1 * time.Minute,
1129+
PollInterval: 5 * time.Second,
1130+
Parameters: map[string]any{
1131+
ParamExpectedStatus: "off",
1132+
},
1133+
},
1134+
},
1135+
{
1136+
ComponentType: devicetypes.ComponentTypeCompute,
1137+
Stage: 4, // Parallel with others
1138+
MaxParallel: 0,
1139+
Timeout: 2 * time.Minute,
1140+
RetryPolicy: &RetryPolicy{
1141+
MaxAttempts: 2,
1142+
InitialInterval: 5 * time.Second,
1143+
BackoffCoefficient: 1.5,
1144+
},
1145+
MainOperation: ActionConfig{
1146+
Name: ActionVerifyPowerStatus,
1147+
Timeout: 1 * time.Minute,
1148+
PollInterval: 5 * time.Second,
1149+
Parameters: map[string]any{
1150+
ParamExpectedStatus: "off",
1151+
},
1152+
},
1153+
},
1154+
// === Power On Sequence (Stages 5-7) ===
1155+
// Explicit force_power_on to match the force semantics.
1156+
{
1157+
ComponentType: devicetypes.ComponentTypePowerShelf,
1158+
Stage: 5,
1159+
MaxParallel: 0,
10741160
Timeout: 10 * time.Minute,
10751161
RetryPolicy: &RetryPolicy{
10761162
MaxAttempts: 3,
@@ -1079,6 +1165,9 @@ func buildForceRestartRule() *OperationRule {
10791165
},
10801166
MainOperation: ActionConfig{
10811167
Name: ActionPowerControl,
1168+
Parameters: map[string]any{
1169+
ParamOperation: "force_power_on",
1170+
},
10821171
},
10831172
PostOperation: []ActionConfig{
10841173
{
@@ -1091,7 +1180,7 @@ func buildForceRestartRule() *OperationRule {
10911180
},
10921181
{
10931182
ComponentType: devicetypes.ComponentTypeNVLSwitch,
1094-
Stage: 5,
1183+
Stage: 6,
10951184
MaxParallel: 0,
10961185
Timeout: 15 * time.Minute,
10971186
RetryPolicy: &RetryPolicy{
@@ -1101,6 +1190,9 @@ func buildForceRestartRule() *OperationRule {
11011190
},
11021191
MainOperation: ActionConfig{
11031192
Name: ActionPowerControl,
1193+
Parameters: map[string]any{
1194+
ParamOperation: "force_power_on",
1195+
},
11041196
},
11051197
PostOperation: []ActionConfig{
11061198
{
@@ -1113,7 +1205,7 @@ func buildForceRestartRule() *OperationRule {
11131205
},
11141206
{
11151207
ComponentType: devicetypes.ComponentTypeCompute,
1116-
Stage: 6,
1208+
Stage: 7,
11171209
MaxParallel: 0,
11181210
Timeout: 20 * time.Minute,
11191211
RetryPolicy: &RetryPolicy{
@@ -1123,22 +1215,24 @@ func buildForceRestartRule() *OperationRule {
11231215
},
11241216
MainOperation: ActionConfig{
11251217
Name: ActionPowerControl,
1218+
Parameters: map[string]any{
1219+
ParamOperation: "force_power_on",
1220+
},
11261221
},
11271222
PostOperation: []ActionConfig{
11281223
{
1129-
// Brief settle time before final verification
11301224
Name: ActionSleep,
11311225
Parameters: map[string]any{
11321226
ParamDuration: 10 * time.Second,
11331227
},
11341228
},
11351229
},
11361230
},
1137-
// === Final Verification Stage (Stage 7) ===
1138-
// Verify all components in parallel
1231+
// === Final Verification Stage (Stage 8) ===
1232+
// Verify all components are back on
11391233
{
11401234
ComponentType: devicetypes.ComponentTypePowerShelf,
1141-
Stage: 7,
1235+
Stage: 8,
11421236
MaxParallel: 0,
11431237
Timeout: 2 * time.Minute,
11441238
RetryPolicy: &RetryPolicy{
@@ -1157,7 +1251,7 @@ func buildForceRestartRule() *OperationRule {
11571251
},
11581252
{
11591253
ComponentType: devicetypes.ComponentTypeNVLSwitch,
1160-
Stage: 7, // Parallel with PowerShelf
1254+
Stage: 8, // Parallel with PowerShelf
11611255
MaxParallel: 0,
11621256
Timeout: 2 * time.Minute,
11631257
RetryPolicy: &RetryPolicy{
@@ -1176,7 +1270,7 @@ func buildForceRestartRule() *OperationRule {
11761270
},
11771271
{
11781272
ComponentType: devicetypes.ComponentTypeCompute,
1179-
Stage: 7, // Parallel with others
1273+
Stage: 8, // Parallel with others
11801274
MaxParallel: 0,
11811275
Timeout: 2 * time.Minute,
11821276
RetryPolicy: &RetryPolicy{

0 commit comments

Comments
 (0)