Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/e2e-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,7 @@ jobs:
kubectl get all --all-namespaces > /tmp/debug-artifacts/all-resources.yaml || true
kubectl get events --all-namespaces --sort-by='.lastTimestamp' > /tmp/debug-artifacts/all-events.yaml || true
kubectl get pods --all-namespaces -o yaml > /tmp/debug-artifacts/all-pods.yaml || true
kubectl get jobs --all-namespaces -o yaml > /tmp/debug-artifacts/all-jobs.yaml || true
kubectl logs --all-namespaces --all-containers=true --tail=500 > /tmp/debug-artifacts/all-logs.txt || true
docker images > /tmp/debug-artifacts/docker-images.txt || true
df -h > /tmp/debug-artifacts/disk-usage.txt || true
Expand Down
25 changes: 1 addition & 24 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -140,30 +140,7 @@ dist/
### JetBrains IDEs (GoLand, PyCharm, IntelliJ) ###
### JetBrains IDEs (GoLand, PyCharm, IntelliJ) ###
# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf

# AWS User-specific
.idea/**/aws.xml

# Generated files
.idea/**/contentModel.xml

# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml

# Gradle
.idea/**/gradle.xml
.idea/**/libraries
.idea/

# CMake
cmake-build-*/
Expand Down
2 changes: 1 addition & 1 deletion commons/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -207,4 +207,4 @@ sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxO
sigs.k8s.io/structured-merge-diff/v6 v6.3.1 h1:JrhdFMqOd/+3ByqlP2I45kTOZmTRLBUm5pvRjeheg7E=
sigs.k8s.io/structured-merge-diff/v6 v6.3.1/go.mod h1:M3W8sfWvn2HhQDIbGWj3S099YozAsymCo/wrT5ohRUE=
sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs=
sigs.k8s.io/yaml v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4=
sigs.k8s.io/yaml v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4=
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ kind: Job
metadata:
generateName: {{ include "fault-remediation.fullname" . }}-log-collector-job-
namespace: {{ .Release.Namespace }}
labels:
app: log-collector
spec:
ttlSecondsAfterFinished: 3600
template:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ Collect all unique API groups and their associated resources from multi-template
- list
- update
- patch
- watch
- apiGroups:
- ""
resources:
Expand All @@ -82,6 +83,7 @@ Collect all unique API groups and their associated resources from multi-template
verbs:
- create
- delete
- update
- get
- list
- watch
Original file line number Diff line number Diff line change
Expand Up @@ -87,29 +87,24 @@ spec:
args:
- "--dry-run={{ ((.Values.global).dryRun) | default false }}"
- "--enable-log-collector={{ .Values.logCollector.enabled }}"
{{- if (.Values.global).ctrlRuntimeEnabled }}
- "--controller-runtime=true"
- "--leader-elect=true"
{{- end }}
ports:
- name: metrics
containerPort: {{ ((.Values.global).metricsPort) | default 2112 }}
{{- if (.Values.global).ctrlRuntimeEnabled }}
- name: health
containerPort: {{ ((.Values.global).healthPort) | default 9440 }}
{{- end }}
livenessProbe:
httpGet:
path: /healthz
port: {{ ternary "health" "metrics" (default false (.Values.global).ctrlRuntimeEnabled) }}
port: health
initialDelaySeconds: 15
periodSeconds: 20
timeoutSeconds: 5
failureThreshold: 3
readinessProbe:
httpGet:
path: {{ ternary "/readyz" "/healthz" (default false (.Values.global).ctrlRuntimeEnabled) }}
port: {{ ternary "health" "metrics" (default false (.Values.global).ctrlRuntimeEnabled) }}
path: "/readyz"
port: "health"
initialDelaySeconds: 5
periodSeconds: 10
timeoutSeconds: 3
Expand Down
4 changes: 0 additions & 4 deletions distros/kubernetes/nvsentinel/values-full.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,6 @@ global:
# Prometheus should scrape these endpoints to monitor NVSentinel
healthPort: 9440

# Whether or not to run applicable controllers using ctrl-runtime
# this is still experimental
ctrlRuntimeEnabled: false

# Dry-run mode - when enabled, all actions are logged but not executed
# Useful for:
# - Testing configuration changes safely
Expand Down
3 changes: 0 additions & 3 deletions distros/kubernetes/nvsentinel/values-tilt-mongodb.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,6 @@ global:
# Keep legacy resource naming for Tilt compatibility
useLegacyResourceNames: true

# Use ctrl runtime in mongo and leave postgres with legacy so both get test coverage
ctrlRuntimeEnabled: true

# Enable certificate rotation for testing hot-reload functionality
certificateRotationEnabled: true

Expand Down
2 changes: 0 additions & 2 deletions distros/kubernetes/nvsentinel/values-tilt.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,6 @@ global:
maxAgeDays: 7
compress: true

ctrlRuntimeEnabled: true

nodeSelector: {}

tolerations:
Expand Down
1 change: 0 additions & 1 deletion distros/kubernetes/nvsentinel/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@

global:
dryRun: false
ctrlRuntimeEnabled: false
image:
tag: "main"
initContainerImage:
Expand Down
2 changes: 1 addition & 1 deletion fault-remediation/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ require (
github.com/prometheus/client_golang v1.23.2
github.com/prometheus/client_model v0.6.2
github.com/stretchr/testify v1.11.1
golang.org/x/sync v0.19.0
k8s.io/api v0.35.0
k8s.io/apimachinery v0.35.0
k8s.io/client-go v0.35.0
Expand Down Expand Up @@ -73,6 +72,7 @@ require (
golang.org/x/crypto v0.45.0 // indirect
golang.org/x/net v0.47.0 // indirect
golang.org/x/oauth2 v0.33.0 // indirect
golang.org/x/sync v0.19.0 // indirect
golang.org/x/sys v0.38.0 // indirect
golang.org/x/term v0.37.0 // indirect
golang.org/x/text v0.31.0 // indirect
Expand Down
133 changes: 39 additions & 94 deletions fault-remediation/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,29 +16,34 @@ package main

import (
"context"
"errors"
"flag"
"fmt"
"log/slog"
"net/http"
"os"
"os/signal"
"strconv"
"strings"
"syscall"
"time"

"golang.org/x/sync/errgroup"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"

"k8s.io/apimachinery/pkg/runtime"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/healthz"
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"

"github.com/nvidia/nvsentinel/commons/pkg/auditlogger"
"github.com/nvidia/nvsentinel/commons/pkg/logger"
"github.com/nvidia/nvsentinel/commons/pkg/server"
"github.com/nvidia/nvsentinel/fault-remediation/pkg/initializer"
)

func init() {
utilruntime.Must(corev1.AddToScheme(scheme))
utilruntime.Must(batchv1.AddToScheme(scheme))
}

var (
scheme = runtime.NewScheme()

Expand All @@ -49,14 +54,12 @@ var (

// These variables are populated by parsing flags
enableLeaderElection bool
enableControllerRuntime bool
leaderElectionLeaseDuration time.Duration
leaderElectionRenewDeadline time.Duration
leaderElectionRetryPeriod time.Duration
leaderElectionNamespace string
metricsAddr string
healthAddr string
kubeconfigPath string
tomlConfigPath string
dryRun bool
enableLogCollector bool
Expand Down Expand Up @@ -88,96 +91,27 @@ func main() {
func run() error {
parseFlags()

if !enableControllerRuntime && enableLeaderElection {
return errors.New("leader-election requires controller-runtime")
}

ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
defer stop()

params := initializer.InitializationParams{
KubeconfigPath: kubeconfigPath,
TomlConfigPath: tomlConfigPath,
DryRun: dryRun,
EnableLogCollector: enableLogCollector,
}

components, err := initializer.InitializeAll(ctx, params)
err := setupCtrlRuntimeManagement(ctx)
if err != nil {
return fmt.Errorf("initialization failed: %w", err)
}

reconciler := components.FaultRemediationReconciler

defer func() {
if err := reconciler.CloseAll(ctx); err != nil {
slog.Error("failed to close datastore components", "error", err)
}
}()

if enableControllerRuntime {
err = setupCtrlRuntimeManagement(ctx, components)
if err != nil {
return err
}
} else {
err = setupNonCtrlRuntimeManaged(ctx, components)
if err != nil {
return err
}
return err
}

return nil
}

func setupNonCtrlRuntimeManaged(ctx context.Context, components *initializer.Components) error {
slog.Info("Running without controller runtime management")

metricsAddr = strings.TrimPrefix(metricsAddr, ":")

portInt, err := strconv.Atoi(metricsAddr)
if err != nil {
return fmt.Errorf("invalid metrics port: %w", err)
}

srv := server.NewServer(
server.WithPort(portInt),
server.WithPrometheusMetricsCtrlRuntime(),
server.WithSimpleHealth(),
)

g, gCtx := errgroup.WithContext(ctx)

g.Go(func() error {
slog.Info("Starting metrics server", "port", portInt)

if err := srv.Serve(gCtx); err != nil {
slog.Error("Metrics server failed - continuing without metrics", "error", err)
}

return nil
})

g.Go(func() error {
components.FaultRemediationReconciler.StartWatcherStream(gCtx)

slog.Info("Listening for events on the channel...")

for event := range components.FaultRemediationReconciler.Watcher.Events() {
slog.Info("Event received", "event", event)
_, _ = components.FaultRemediationReconciler.Reconcile(gCtx, &event)
}
func setupCtrlRuntimeManagement(ctx context.Context) error {
slog.Info("Running in controller runtime managed mode")

return nil
cfg := ctrl.GetConfigOrDie()
cfg.Wrap(func(rt http.RoundTripper) http.RoundTripper {
return auditlogger.NewAuditingRoundTripper(rt)
})

return g.Wait()
}

func setupCtrlRuntimeManagement(ctx context.Context, components *initializer.Components) error {
slog.Info("Running in controller runtime managed mode")

mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
//TODO: setup informers for node and job
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion | 🟠 Major

TODO comment should reference an issue.

Per coding guidelines, TODO comments should reference a GitHub issue for tracking.

Based on coding guidelines.

🤖 Prompt for AI Agents
In @fault-remediation/main.go at line 111, Replace the TODO comment "//TODO:
setup informers for node and job" with a TODO that references the tracking
GitHub issue (e.g., "// TODO(#<issue-number>): setup informers for node and
job") so the work is traceable; update the exact comment string in
fault-remediation/main.go to include the issue number and a short owner or
milestone if applicable.

mgr, err := ctrl.NewManager(cfg, ctrl.Options{
Scheme: scheme,
Metrics: metricsserver.Options{
BindAddress: metricsAddr,
Expand Down Expand Up @@ -205,6 +139,26 @@ func setupCtrlRuntimeManagement(ctx context.Context, components *initializer.Com
return err
}

params := initializer.InitializationParams{
TomlConfigPath: tomlConfigPath,
DryRun: dryRun,
EnableLogCollector: enableLogCollector,
Config: mgr.GetConfig(),
}

components, err := initializer.InitializeAll(ctx, params, mgr.GetClient())
if err != nil {
return fmt.Errorf("initialization failed: %w", err)
}

reconciler := components.FaultRemediationReconciler

defer func() {
if err := reconciler.CloseAll(ctx); err != nil {
slog.Error("failed to close datastore components", "error", err)
}
}()

err = components.FaultRemediationReconciler.SetupWithManager(ctx, mgr)
if err != nil {
return fmt.Errorf("SetupWithManager failed: %w", err)
Expand Down Expand Up @@ -235,20 +189,11 @@ func parseFlags() {
" (otherwise metrics and health are on same port).",
)

flag.StringVar(&kubeconfigPath, "kubeconfig-path", "", "path to kubeconfig file")

flag.StringVar(&tomlConfigPath, "config-path", "/etc/config/config.toml",
"path where the fault remediation config file is present")

flag.BoolVar(&dryRun, "dry-run", false, "flag to run fault remediation module in dry-run mode.")

flag.BoolVar(
&enableControllerRuntime,
"controller-runtime",
false,
"Enable controller runtime management of the reconciler.",
)

flag.BoolVar(
&enableLeaderElection,
"leader-elect",
Expand Down
Loading
Loading