Skip to content

Commit cac4deb

Browse files
Merge branch 'main' into 390-event-handling-in-gpu-monitor
2 parents 9424707 + c1aa7a5 commit cac4deb

File tree

7 files changed

+26
-10
lines changed

7 files changed

+26
-10
lines changed

distros/kubernetes/nvsentinel/charts/fault-remediation/values.yaml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -90,11 +90,11 @@ maintenance:
9090
apiVersion: janitor.dgxc.nvidia.com/v1alpha1
9191
kind: RebootNode
9292
metadata:
93-
name: maintenance-{{ .NodeName }}-{{ .HealthEventID }}
93+
name: maintenance-{{ .HealthEvent.NodeName }}-{{ .HealthEventID }}
9494
labels:
9595
app.kubernetes.io/managed-by: nvsentinel
9696
spec:
97-
nodeName: {{ .NodeName }}
97+
nodeName: {{ .HealthEvent.NodeName }}
9898
force: false
9999
# Additional template examples:
100100
# "namespaced-restart.yaml": |

distros/kubernetes/nvsentinel/values-tilt.yaml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -266,9 +266,9 @@ fault-remediation:
266266
apiVersion: {{ .ApiGroup }}/{{ .Version }}
267267
kind: RebootNode
268268
metadata:
269-
name: maintenance-{{ .NodeName }}-{{ .HealthEventID }}
269+
name: maintenance-{{ .HealthEvent.NodeName }}-{{ .HealthEventID }}
270270
spec:
271-
nodeName: {{ .NodeName }}
271+
nodeName: {{ .HealthEvent.NodeName }}
272272
273273
logCollector:
274274
enabled: true

fault-remediation/pkg/common/equivalence_groups.go

Lines changed: 12 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -21,13 +21,24 @@ import (
2121
// RemediationEquivalenceGroups defines groups of remediation actions that are considered
2222
// to have the same operational effect. This is used to prevent multiple, similar remediations
2323
// (like various forms of reboots) from occurring in rapid succession.
24-
// TODO: Fix this multiple mappings as part of https://jirasw.nvidia.com/browse/KACE-1736
2524
var RemediationEquivalenceGroups = map[string][]protos.RecommendedAction{
2625
"restart": {
2726
protos.RecommendedAction_COMPONENT_RESET,
2827
protos.RecommendedAction_RESTART_VM,
2928
protos.RecommendedAction_RESTART_BM,
3029
},
30+
"fieldiag": {
31+
protos.RecommendedAction_RUN_FIELDDIAG,
32+
},
33+
"dcgmeud": {
34+
protos.RecommendedAction_RUN_DCGMEUD,
35+
},
36+
"support": {
37+
protos.RecommendedAction_CONTACT_SUPPORT,
38+
},
39+
"replace": {
40+
protos.RecommendedAction_REPLACE_VM,
41+
},
3142
}
3243

3344
// GetRemediationGroupForAction returns the equivalence group key for a given action.

fault-remediation/pkg/common/equivalence_groups_test.go

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -44,9 +44,9 @@ func TestGetRemediationGroupForAction(t *testing.T) {
4444
expectedGroup: "restart",
4545
},
4646
{
47-
name: "CONTACT_SUPPORT returns empty string (not in any group)",
47+
name: "CONTACT_SUPPORT returns support",
4848
action: protos.RecommendedAction_CONTACT_SUPPORT,
49-
expectedGroup: "",
49+
expectedGroup: "support",
5050
},
5151
{
5252
name: "NONE returns empty string (not in any group)",

fault-remediation/pkg/reconciler/reconciler.go

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -150,7 +150,8 @@ func (r *FaultRemediationReconciler) shouldSkipEvent(ctx context.Context,
150150
return true
151151
}
152152

153-
if common.GetRemediationGroupForAction(action) != "" {
153+
_, exists := r.config.RemediationClient.GetConfig().RemediationActions[action.String()]
154+
if common.GetRemediationGroupForAction(action) != "" && exists {
154155
return false
155156
}
156157

fault-remediation/pkg/reconciler/remediation.go

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -80,6 +80,8 @@ type TemplateData struct {
8080
RecommendedAction protos.RecommendedAction
8181
RecommendedActionName string
8282

83+
HealthEvent *protos.HealthEvent
84+
8385
// CRD routing metadata (populated from MaintenanceResource)
8486
ApiGroup string
8587
Version string
@@ -271,6 +273,8 @@ func (c *FaultRemediationClient) CreateMaintenanceResource(
271273
RecommendedAction: healthEvent.RecommendedAction,
272274
RecommendedActionName: recommendedActionName,
273275

276+
HealthEvent: healthEvent,
277+
274278
ApiGroup: maintenanceResource.ApiGroup,
275279
Version: maintenanceResource.Version,
276280
Kind: maintenanceResource.Kind,

fault-remediation/pkg/reconciler/remediation_test.go

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -444,9 +444,9 @@ func TestCreateRebootNodeResource(t *testing.T) {
444444
tmpl, err := tmpl.Parse(`apiVersion: {{.ApiGroup}}/{{.Version}}
445445
kind: RebootNode
446446
metadata:
447-
name: maintenance-{{.NodeName}}-{{.HealthEventID}}
447+
name: maintenance-{{.HealthEvent.NodeName}}-{{.HealthEventID}}
448448
spec:
449-
nodeName: {{.NodeName}}`)
449+
nodeName: {{.HealthEvent.NodeName}}`)
450450
assert.NoError(t, err)
451451

452452
// Create K8sClient with mock

0 commit comments

Comments
 (0)