Skip to content

Commit 74f1c71

Browse files
committed
feat: add event handling strategy in syslog health monitor
Signed-off-by: Tanisha goyal <tanishag@nvidia.com>
1 parent f8b7d1a commit 74f1c71

File tree

20 files changed

+538
-170
lines changed

20 files changed

+538
-170
lines changed

distros/kubernetes/nvsentinel/charts/syslog-health-monitor/templates/_helpers.tpl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,8 @@ spec:
100100
- "{{ join "," $root.Values.enabledChecks }}"
101101
- "--metadata-path"
102102
- "{{ $root.Values.global.metadataPath }}"
103+
- "--processing-strategy"
104+
- {{ $root.Values.processingStrategy }}
103105
resources:
104106
{{- toYaml $root.Values.resources | nindent 12 }}
105107
ports:

distros/kubernetes/nvsentinel/charts/syslog-health-monitor/values.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,3 +50,10 @@ tolerations: []
5050
journalHostPath: /var/log
5151

5252
logLevel: info
53+
54+
# Processing strategy for health events
55+
# valid values: EXECUTE_REMEDIATION, STORE_ONLY
56+
# default: EXECUTE_REMEDIATION
57+
# EXECUTE_REMEDIATION: normal behavior; downstream modules may update cluster state.
58+
# STORE_ONLY: observability-only behavior; event should be persisted/exported but should not modify cluster resources (i.e., no node conditions, no quarantine, no drain, no remediation).
59+
processingStrategy: EXECUTE_REMEDIATION

health-monitors/syslog-health-monitor/main.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,8 @@ var (
6767
"Indicates if this monitor is running in Kata Containers mode (set by DaemonSet variant).")
6868
metadataPath = flag.String("metadata-path", "/var/lib/nvsentinel/gpu_metadata.json",
6969
"Path to GPU metadata JSON file.")
70+
processingStrategyFlag = flag.String("processing-strategy", "EXECUTE_REMEDIATION",
71+
"Event processing strategy: EXECUTE_REMEDIATION or STORE_ONLY")
7072
)
7173

7274
var checks []fd.CheckDefinition
@@ -159,6 +161,15 @@ func run() error {
159161

160162
slog.Info("Creating syslog monitor", "checksCount", len(checks))
161163

164+
value, ok := pb.ProcessingStrategy_value[*processingStrategyFlag]
165+
if !ok {
166+
return fmt.Errorf("unexpected processingStrategy value: %q", *processingStrategyFlag)
167+
}
168+
169+
slog.Info("Event handling strategy configured", "processingStrategy", *processingStrategyFlag)
170+
171+
processingStrategy := pb.ProcessingStrategy(value)
172+
162173
fdHealthMonitor, err := fd.NewSyslogMonitor(
163174
nodeName,
164175
checks,
@@ -169,6 +180,7 @@ func run() error {
169180
*stateFileFlag,
170181
*xidAnalyserEndpoint,
171182
*metadataPath,
183+
processingStrategy,
172184
)
173185
if err != nil {
174186
return fmt.Errorf("error creating syslog health monitor: %w", err)

health-monitors/syslog-health-monitor/pkg/gpufallen/gpufallen_handler.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,17 @@ import (
2828

2929
// NewGPUFallenHandler creates a new GPUFallenHandler instance.
3030
func NewGPUFallenHandler(nodeName, defaultAgentName,
31-
defaultComponentClass, checkName string) (*GPUFallenHandler, error) {
31+
defaultComponentClass, checkName string,
32+
processingStrategy pb.ProcessingStrategy,
33+
) (*GPUFallenHandler, error) {
3234
ctx, cancel := context.WithCancel(context.Background())
3335

3436
h := &GPUFallenHandler{
3537
nodeName: nodeName,
3638
defaultAgentName: defaultAgentName,
3739
defaultComponentClass: defaultComponentClass,
3840
checkName: checkName,
41+
processingStrategy: processingStrategy,
3942
recentXIDs: make(map[string]xidRecord),
4043
xidWindow: 5 * time.Minute, // Remember XIDs for 5 minutes
4144
cancelCleanup: cancel,
@@ -232,6 +235,7 @@ func (h *GPUFallenHandler) createHealthEventFromError(event *gpuFallenErrorEvent
232235
NodeName: h.nodeName,
233236
RecommendedAction: pb.RecommendedAction_RESTART_BM,
234237
ErrorCode: []string{"GPU_FALLEN_OFF_BUS"},
238+
ProcessingStrategy: h.processingStrategy,
235239
}
236240

237241
return &pb.HealthEvents{

health-monitors/syslog-health-monitor/pkg/gpufallen/gpufallen_handler_test.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,7 @@ func TestProcessLine(t *testing.T) {
169169
"test-agent",
170170
"GPU",
171171
"test-check",
172+
pb.ProcessingStrategy_STORE_ONLY,
172173
)
173174
require.NoError(t, err)
174175
defer handler.Close()
@@ -181,6 +182,7 @@ func TestProcessLine(t *testing.T) {
181182
if tc.validateEvent != nil {
182183
tc.validateEvent(t, events, tc.message)
183184
}
185+
assert.Equal(t, pb.ProcessingStrategy_STORE_ONLY, events.Events[0].ProcessingStrategy)
184186
} else {
185187
assert.Nil(t, events, "Expected no event to be generated")
186188
}
@@ -196,6 +198,7 @@ func TestXIDTracking(t *testing.T) {
196198
"test-agent",
197199
"GPU",
198200
"test-check",
201+
pb.ProcessingStrategy_EXECUTE_REMEDIATION,
199202
)
200203
require.NoError(t, err)
201204
defer handler.Close() // Cleanup goroutine to prevent leaks
@@ -222,6 +225,7 @@ func TestXIDTracking(t *testing.T) {
222225
"test-agent",
223226
"GPU",
224227
"test-check",
228+
pb.ProcessingStrategy_EXECUTE_REMEDIATION,
225229
)
226230
require.NoError(t, err)
227231
defer handler2.Close()
@@ -234,6 +238,7 @@ func TestXIDTracking(t *testing.T) {
234238
require.NoError(t, err)
235239
require.NotNil(t, events, "Should generate event when no recent XID")
236240
require.Len(t, events.Events, 1)
241+
assert.Equal(t, handler2.processingStrategy, events.Events[0].ProcessingStrategy)
237242
})
238243

239244
t.Run("XID expires after time window", func(t *testing.T) {
@@ -243,6 +248,7 @@ func TestXIDTracking(t *testing.T) {
243248
"test-agent",
244249
"GPU",
245250
"test-check",
251+
pb.ProcessingStrategy_EXECUTE_REMEDIATION,
246252
)
247253
require.NoError(t, err)
248254
defer handler3.Close()
@@ -270,6 +276,7 @@ func TestXIDTracking(t *testing.T) {
270276
"test-agent",
271277
"GPU",
272278
"test-check",
279+
pb.ProcessingStrategy_EXECUTE_REMEDIATION,
273280
)
274281
require.NoError(t, err)
275282
defer handler4.Close()
@@ -298,6 +305,7 @@ func TestXIDTracking(t *testing.T) {
298305
"test-agent",
299306
"GPU",
300307
"test-check",
308+
pb.ProcessingStrategy_EXECUTE_REMEDIATION,
301309
)
302310
require.NoError(t, err)
303311
defer handler5.Close()
@@ -315,6 +323,7 @@ func TestXIDTracking(t *testing.T) {
315323
"test-agent",
316324
"GPU",
317325
"test-check",
326+
pb.ProcessingStrategy_EXECUTE_REMEDIATION,
318327
)
319328
require.NoError(t, err)
320329
defer handler6.Close()
@@ -340,6 +349,7 @@ func TestXIDTracking(t *testing.T) {
340349
"test-agent",
341350
"GPU",
342351
"test-check",
352+
pb.ProcessingStrategy_EXECUTE_REMEDIATION,
343353
)
344354
require.NoError(t, err)
345355
defer handler7.Close()
@@ -379,6 +389,7 @@ func TestXIDTracking(t *testing.T) {
379389
"test-agent",
380390
"GPU",
381391
"test-check",
392+
pb.ProcessingStrategy_EXECUTE_REMEDIATION,
382393
)
383394
require.NoError(t, err)
384395
defer handler8.Close()

health-monitors/syslog-health-monitor/pkg/gpufallen/types.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ import (
1919
"regexp"
2020
"sync"
2121
"time"
22+
23+
pb "github.com/nvidia/nvsentinel/data-models/pkg/protos"
2224
)
2325

2426
var (
@@ -47,6 +49,7 @@ type GPUFallenHandler struct {
4749
defaultAgentName string
4850
defaultComponentClass string
4951
checkName string
52+
processingStrategy pb.ProcessingStrategy
5053
mu sync.RWMutex
5154
recentXIDs map[string]xidRecord // pciAddr -> XID record
5255
xidWindow time.Duration // how long to remember XID errors

health-monitors/syslog-health-monitor/pkg/sxid/sxid_handler.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,15 @@ import (
2828
)
2929

3030
func NewSXIDHandler(nodeName, defaultAgentName,
31-
defaultComponentClass, checkName, metadataPath string) (*SXIDHandler, error) {
31+
defaultComponentClass, checkName, metadataPath string,
32+
processingStrategy pb.ProcessingStrategy,
33+
) (*SXIDHandler, error) {
3234
return &SXIDHandler{
3335
nodeName: nodeName,
3436
defaultAgentName: defaultAgentName,
3537
defaultComponentClass: defaultComponentClass,
3638
checkName: checkName,
39+
processingStrategy: processingStrategy,
3740
metadataReader: metadata.NewReader(metadataPath),
3841
}, nil
3942
}
@@ -103,6 +106,7 @@ func (sxidHandler *SXIDHandler) ProcessLine(message string) (*pb.HealthEvents, e
103106
RecommendedAction: errRes,
104107
ErrorCode: []string{fmt.Sprint(sxidErrorEvent.ErrorNum)},
105108
Metadata: metadata,
109+
ProcessingStrategy: sxidHandler.processingStrategy,
106110
}
107111

108112
return &pb.HealthEvents{

health-monitors/syslog-health-monitor/pkg/sxid/sxid_handler_test.go

Lines changed: 61 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,11 @@
1515
package sxid
1616

1717
import (
18+
"os"
19+
"path/filepath"
1820
"testing"
1921

22+
pb "github.com/nvidia/nvsentinel/data-models/pkg/protos"
2023
"github.com/stretchr/testify/assert"
2124
"github.com/stretchr/testify/require"
2225
)
@@ -40,7 +43,7 @@ func TestNewSXIDHandler(t *testing.T) {
4043

4144
for _, tc := range testCases {
4245
t.Run(tc.name, func(t *testing.T) {
43-
handler, err := NewSXIDHandler(tc.nodeName, tc.agentName, tc.componentClass, tc.checkName, "/tmp/metadata.json")
46+
handler, err := NewSXIDHandler(tc.nodeName, tc.agentName, tc.componentClass, tc.checkName, "/tmp/metadata.json", pb.ProcessingStrategy_EXECUTE_REMEDIATION)
4447

4548
require.NoError(t, err)
4649
require.NotNil(t, handler)
@@ -137,6 +140,61 @@ func TestExtractInfoFromNVSwitchErrorMsg(t *testing.T) {
137140
}
138141
}
139142

143+
func TestProcessLineWithValidTopologyFatalSXID(t *testing.T) {
144+
// Create temp metadata file with valid NVSwitch topology
145+
metadataJSON := `{
146+
"version": "1.0",
147+
"timestamp": "2025-01-01T00:00:00Z",
148+
"node_name": "test-node",
149+
"gpus": [
150+
{
151+
"gpu_id": 1,
152+
"uuid": "GPU-aaaabbbb-cccc-dddd-eeee-ffffffffffff",
153+
"pci_address": "0000:18:00.0",
154+
"serial_number": "GPU-SN-002",
155+
"device_name": "NVIDIA H100",
156+
"nvlinks": [
157+
{
158+
"link_id": 2,
159+
"remote_pci_address": "0000:c3:00.0",
160+
"remote_link_id": 28
161+
}
162+
]
163+
}
164+
],
165+
"nvswitches": ["0000:c3:00.0"]
166+
}`
167+
168+
tmpDir := t.TempDir()
169+
metadataPath := filepath.Join(tmpDir, "gpu_metadata.json")
170+
err := os.WriteFile(metadataPath, []byte(metadataJSON), 0o644)
171+
require.NoError(t, err)
172+
173+
handler, err := NewSXIDHandler("test-node", "test-agent", "NVSWITCH", "sxid-check", metadataPath, pb.ProcessingStrategy_STORE_ONLY)
174+
require.NoError(t, err)
175+
176+
message := "[ 1108.858286] nvidia-nvswitch0: SXid (PCI:0000:c3:00.0): 24007, Fatal, Link 28 sourcetrack timeout error (First)"
177+
events, err := handler.ProcessLine(message)
178+
179+
require.NoError(t, err)
180+
require.NotNil(t, events)
181+
require.Len(t, events.Events, 1)
182+
183+
event := events.Events[0]
184+
assert.Equal(t, message, event.Message)
185+
assert.Equal(t, []string{"24007"}, event.ErrorCode)
186+
assert.True(t, event.IsFatal)
187+
assert.False(t, event.IsHealthy)
188+
assert.Equal(t, pb.RecommendedAction_CONTACT_SUPPORT, event.RecommendedAction)
189+
assert.Equal(t, pb.ProcessingStrategy_STORE_ONLY, event.ProcessingStrategy)
190+
191+
// Verify GPU entity
192+
assert.Equal(t, "GPU", event.EntitiesImpacted[3].EntityType)
193+
assert.Equal(t, "1", event.EntitiesImpacted[3].EntityValue)
194+
assert.Equal(t, "GPU_UUID", event.EntitiesImpacted[4].EntityType)
195+
assert.Equal(t, "GPU-aaaabbbb-cccc-dddd-eeee-ffffffffffff", event.EntitiesImpacted[4].EntityValue)
196+
}
197+
140198
func TestProcessLine(t *testing.T) {
141199
testCases := []struct {
142200
name string
@@ -166,7 +224,7 @@ func TestProcessLine(t *testing.T) {
166224

167225
for _, tc := range testCases {
168226
t.Run(tc.name, func(t *testing.T) {
169-
handler, err := NewSXIDHandler("test-node", "test-agent", "NVSWITCH", "sxid-check", "/tmp/metadata.json")
227+
handler, err := NewSXIDHandler("test-node", "test-agent", "NVSWITCH", "sxid-check", "/tmp/metadata.json", pb.ProcessingStrategy_EXECUTE_REMEDIATION)
170228
require.NoError(t, err)
171229

172230
events, err := handler.ProcessLine(tc.message)
@@ -182,6 +240,7 @@ func TestProcessLine(t *testing.T) {
182240
event := events.Events[0]
183241
assert.Equal(t, tc.message, event.Message)
184242
assert.NotEmpty(t, event.Metadata)
243+
assert.Equal(t, pb.ProcessingStrategy_EXECUTE_REMEDIATION, event.ProcessingStrategy)
185244
} else {
186245
assert.Nil(t, events)
187246
}

health-monitors/syslog-health-monitor/pkg/sxid/types.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ package sxid
1717
import (
1818
"regexp"
1919

20+
pb "github.com/nvidia/nvsentinel/data-models/pkg/protos"
2021
"github.com/nvidia/nvsentinel/health-monitors/syslog-health-monitor/pkg/metadata"
2122
)
2223

@@ -30,6 +31,7 @@ type SXIDHandler struct {
3031
defaultAgentName string
3132
defaultComponentClass string
3233
checkName string
34+
processingStrategy pb.ProcessingStrategy
3335
metadataReader *metadata.Reader
3436
}
3537

health-monitors/syslog-health-monitor/pkg/syslog-monitor/syslogmonitor.go

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,10 +48,12 @@ func NewSyslogMonitor(
4848
stateFilePath string,
4949
xidAnalyserEndpoint string,
5050
metadataPath string,
51+
processingStrategy pb.ProcessingStrategy,
5152
) (*SyslogMonitor, error) {
5253
return NewSyslogMonitorWithFactory(nodeName, checks, pcClient, defaultAgentName,
5354
defaultComponentClass, pollingInterval, stateFilePath, GetDefaultJournalFactory(),
5455
xidAnalyserEndpoint, metadataPath,
56+
processingStrategy,
5557
)
5658
}
5759

@@ -69,6 +71,7 @@ func NewSyslogMonitorWithFactory(
6971
journalFactory JournalFactory,
7072
xidAnalyserEndpoint string,
7173
metadataPath string,
74+
processingStrategy pb.ProcessingStrategy,
7275
) (*SyslogMonitor, error) {
7376
// Load state from file
7477
state, err := loadState(stateFilePath)
@@ -90,6 +93,7 @@ func NewSyslogMonitorWithFactory(
9093
pcClient: pcClient,
9194
defaultAgentName: defaultAgentName,
9295
defaultComponentClass: defaultComponentClass,
96+
processingStrategy: processingStrategy,
9397
pollingInterval: pollingInterval,
9498
checkLastCursors: state.CheckLastCursors,
9599
journalFactory: journalFactory,
@@ -103,7 +107,7 @@ func NewSyslogMonitorWithFactory(
103107
switch check.Name {
104108
case XIDErrorCheck:
105109
xidHandler, err := xid.NewXIDHandler(nodeName,
106-
defaultAgentName, defaultComponentClass, check.Name, xidAnalyserEndpoint, metadataPath)
110+
defaultAgentName, defaultComponentClass, check.Name, xidAnalyserEndpoint, metadataPath, processingStrategy)
107111
if err != nil {
108112
slog.Error("Error initializing XID handler", "error", err.Error())
109113
return nil, fmt.Errorf("failed to initialize XID handler: %w", err)
@@ -113,7 +117,7 @@ func NewSyslogMonitorWithFactory(
113117

114118
case SXIDErrorCheck:
115119
sxidHandler, err := sxid.NewSXIDHandler(
116-
nodeName, defaultAgentName, defaultComponentClass, check.Name, metadataPath)
120+
nodeName, defaultAgentName, defaultComponentClass, check.Name, metadataPath, processingStrategy)
117121
if err != nil {
118122
slog.Error("Error initializing SXID handler", "error", err.Error())
119123
return nil, fmt.Errorf("failed to initialize SXID handler: %w", err)
@@ -123,7 +127,7 @@ func NewSyslogMonitorWithFactory(
123127

124128
case GPUFallenOffCheck:
125129
gpuFallenHandler, err := gpufallen.NewGPUFallenHandler(
126-
nodeName, defaultAgentName, defaultComponentClass, check.Name)
130+
nodeName, defaultAgentName, defaultComponentClass, check.Name, processingStrategy)
127131
if err != nil {
128132
slog.Error("Error initializing GPU Fallen Off handler", "error", err.Error())
129133
return nil, fmt.Errorf("failed to initialize GPU Fallen Off handler: %w", err)
@@ -806,6 +810,7 @@ func (sm *SyslogMonitor) prepareHealthEventWithAction(
806810
IsHealthy: isHealthy,
807811
NodeName: sm.nodeName,
808812
RecommendedAction: errRes.RecommendedAction,
813+
ProcessingStrategy: sm.processingStrategy,
809814
}
810815

811816
return &pb.HealthEvents{

0 commit comments

Comments
 (0)