diff --git a/.gitignore b/.gitignore index 8930d91fd..95aa540bc 100644 --- a/.gitignore +++ b/.gitignore @@ -436,6 +436,16 @@ terraform.rc .terraform .terraform.lock.hcl +# ============================================================================ +# Environment Management +# ============================================================================ + +### Version Management Tools ### +mise.toml + +### VSCode ### +*.code-workspace + # ============================================================================ # Project-Specific Files # ============================================================================ diff --git a/.ko.yaml b/.ko.yaml index 20fb53fd8..f02ae3c6b 100644 --- a/.ko.yaml +++ b/.ko.yaml @@ -18,7 +18,6 @@ platforms: [linux/amd64, linux/arm64] env: [CGO_ENABLED=0] builds: - - id: fault-quarantine dir: fault-quarantine main: . @@ -69,7 +68,7 @@ builds: org.opencontainers.image.version: "{{.Env.VERSION}}" org.opencontainers.image.revision: "{{.Env.GIT_COMMIT}}" org.opencontainers.image.created: "{{.Env.BUILD_DATE}}" - + - id: csp-health-monitor dir: health-monitors/csp-health-monitor main: ./cmd/csp-health-monitor @@ -155,6 +154,23 @@ builds: org.opencontainers.image.revision: "{{.Env.GIT_COMMIT}}" org.opencontainers.image.created: "{{.Env.BUILD_DATE}}" + - id: janitor-provider + dir: janitor-provider + main: . + ldflags: + - "-s -w" + - "-X main.version={{.Env.VERSION}} -X main.commit={{.Env.GIT_COMMIT}} -X main.date={{.Env.BUILD_DATE}}" + annotations: + org.opencontainers.image.description: "Example provider for NVSentinel Janitor" + labels: + org.opencontainers.image.source: "https://github.com/nvidia/nvsentinel" + org.opencontainers.image.licenses: "Apache-2.0" + org.opencontainers.image.title: "NVSentinel Janitor-Provider" + org.opencontainers.image.description: "Example implementation of the CSP provider interface for Janitor" + org.opencontainers.image.version: "{{.Env.VERSION}}" + org.opencontainers.image.revision: "{{.Env.GIT_COMMIT}}" + org.opencontainers.image.created: "{{.Env.BUILD_DATE}}" + - id: platform-connectors dir: platform-connectors main: . diff --git a/api/Makefile b/api/Makefile index 07a6dd946..4bd7652ef 100644 --- a/api/Makefile +++ b/api/Makefile @@ -68,7 +68,8 @@ build: protos-generate: $(PROTOC_GEN_GO) $(PROTOC_GEN_GO_GRPC) protos-clean ## Generate Go code from Proto definitions. @echo "Generating Proto code..." @mkdir -p $(GEN_DIR) - cd proto && protoc \ + cd proto && \ + protoc \ -I . \ -I ../$(THIRD_PARTY_DIR) \ --plugin="protoc-gen-go=$(PROTOC_GEN_GO)" \ @@ -77,7 +78,17 @@ protos-generate: $(PROTOC_GEN_GO) $(PROTOC_GEN_GO_GRPC) protos-clean ## Generate --go_opt=paths=source_relative \ --go-grpc_out=../$(GEN_DIR) \ --go-grpc_opt=paths=source_relative \ - device/v1alpha1/*.proto + device/v1alpha1/*.proto && \ + protoc \ + -I . \ + -I ../$(THIRD_PARTY_DIR) \ + --plugin="protoc-gen-go=$(PROTOC_GEN_GO)" \ + --plugin="protoc-gen-go-grpc=$(PROTOC_GEN_GO_GRPC)" \ + --go_out=../$(GEN_DIR) \ + --go_opt=paths=source_relative \ + --go-grpc_out=../$(GEN_DIR) \ + --go-grpc_opt=paths=source_relative \ + csp/v1alpha1/*.proto @echo "Cleaning up dependencies..." go mod tidy @echo "Done." diff --git a/api/gen/go/csp/v1alpha1/provider.pb.go b/api/gen/go/csp/v1alpha1/provider.pb.go new file mode 100644 index 000000000..4c70290e7 --- /dev/null +++ b/api/gen/go/csp/v1alpha1/provider.pb.go @@ -0,0 +1,393 @@ +// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by protoc-gen-go. DO NOT EDIT. +// versions: +// protoc-gen-go v1.36.10 +// protoc v6.33.0 +// source: csp/v1alpha1/provider.proto + +package cspv1alpha1 + +import ( + protoreflect "google.golang.org/protobuf/reflect/protoreflect" + protoimpl "google.golang.org/protobuf/runtime/protoimpl" + reflect "reflect" + sync "sync" + unsafe "unsafe" +) + +const ( + // Verify that this generated code is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) + // Verify that runtime/protoimpl is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) +) + +type SendRebootSignalRequest struct { + state protoimpl.MessageState `protogen:"open.v1"` + NodeName string `protobuf:"bytes,1,opt,name=node_name,json=nodeName,proto3" json:"node_name,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *SendRebootSignalRequest) Reset() { + *x = SendRebootSignalRequest{} + mi := &file_csp_v1alpha1_provider_proto_msgTypes[0] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *SendRebootSignalRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*SendRebootSignalRequest) ProtoMessage() {} + +func (x *SendRebootSignalRequest) ProtoReflect() protoreflect.Message { + mi := &file_csp_v1alpha1_provider_proto_msgTypes[0] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use SendRebootSignalRequest.ProtoReflect.Descriptor instead. +func (*SendRebootSignalRequest) Descriptor() ([]byte, []int) { + return file_csp_v1alpha1_provider_proto_rawDescGZIP(), []int{0} +} + +func (x *SendRebootSignalRequest) GetNodeName() string { + if x != nil { + return x.NodeName + } + return "" +} + +type SendRebootSignalResponse struct { + state protoimpl.MessageState `protogen:"open.v1"` + RequestId string `protobuf:"bytes,1,opt,name=request_id,json=requestId,proto3" json:"request_id,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *SendRebootSignalResponse) Reset() { + *x = SendRebootSignalResponse{} + mi := &file_csp_v1alpha1_provider_proto_msgTypes[1] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *SendRebootSignalResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*SendRebootSignalResponse) ProtoMessage() {} + +func (x *SendRebootSignalResponse) ProtoReflect() protoreflect.Message { + mi := &file_csp_v1alpha1_provider_proto_msgTypes[1] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use SendRebootSignalResponse.ProtoReflect.Descriptor instead. 
+func (*SendRebootSignalResponse) Descriptor() ([]byte, []int) { + return file_csp_v1alpha1_provider_proto_rawDescGZIP(), []int{1} +} + +func (x *SendRebootSignalResponse) GetRequestId() string { + if x != nil { + return x.RequestId + } + return "" +} + +type IsNodeReadyRequest struct { + state protoimpl.MessageState `protogen:"open.v1"` + NodeName string `protobuf:"bytes,1,opt,name=node_name,json=nodeName,proto3" json:"node_name,omitempty"` + RequestId string `protobuf:"bytes,2,opt,name=request_id,json=requestId,proto3" json:"request_id,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *IsNodeReadyRequest) Reset() { + *x = IsNodeReadyRequest{} + mi := &file_csp_v1alpha1_provider_proto_msgTypes[2] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *IsNodeReadyRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*IsNodeReadyRequest) ProtoMessage() {} + +func (x *IsNodeReadyRequest) ProtoReflect() protoreflect.Message { + mi := &file_csp_v1alpha1_provider_proto_msgTypes[2] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use IsNodeReadyRequest.ProtoReflect.Descriptor instead. +func (*IsNodeReadyRequest) Descriptor() ([]byte, []int) { + return file_csp_v1alpha1_provider_proto_rawDescGZIP(), []int{2} +} + +func (x *IsNodeReadyRequest) GetNodeName() string { + if x != nil { + return x.NodeName + } + return "" +} + +func (x *IsNodeReadyRequest) GetRequestId() string { + if x != nil { + return x.RequestId + } + return "" +} + +type IsNodeReadyResponse struct { + state protoimpl.MessageState `protogen:"open.v1"` + IsReady bool `protobuf:"varint,1,opt,name=is_ready,json=isReady,proto3" json:"is_ready,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *IsNodeReadyResponse) Reset() { + *x = IsNodeReadyResponse{} + mi := &file_csp_v1alpha1_provider_proto_msgTypes[3] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *IsNodeReadyResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*IsNodeReadyResponse) ProtoMessage() {} + +func (x *IsNodeReadyResponse) ProtoReflect() protoreflect.Message { + mi := &file_csp_v1alpha1_provider_proto_msgTypes[3] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use IsNodeReadyResponse.ProtoReflect.Descriptor instead. 
+func (*IsNodeReadyResponse) Descriptor() ([]byte, []int) { + return file_csp_v1alpha1_provider_proto_rawDescGZIP(), []int{3} +} + +func (x *IsNodeReadyResponse) GetIsReady() bool { + if x != nil { + return x.IsReady + } + return false +} + +type SendTerminateSignalRequest struct { + state protoimpl.MessageState `protogen:"open.v1"` + NodeName string `protobuf:"bytes,1,opt,name=node_name,json=nodeName,proto3" json:"node_name,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *SendTerminateSignalRequest) Reset() { + *x = SendTerminateSignalRequest{} + mi := &file_csp_v1alpha1_provider_proto_msgTypes[4] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *SendTerminateSignalRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*SendTerminateSignalRequest) ProtoMessage() {} + +func (x *SendTerminateSignalRequest) ProtoReflect() protoreflect.Message { + mi := &file_csp_v1alpha1_provider_proto_msgTypes[4] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use SendTerminateSignalRequest.ProtoReflect.Descriptor instead. +func (*SendTerminateSignalRequest) Descriptor() ([]byte, []int) { + return file_csp_v1alpha1_provider_proto_rawDescGZIP(), []int{4} +} + +func (x *SendTerminateSignalRequest) GetNodeName() string { + if x != nil { + return x.NodeName + } + return "" +} + +type SendTerminateSignalResponse struct { + state protoimpl.MessageState `protogen:"open.v1"` + RequestId string `protobuf:"bytes,1,opt,name=request_id,json=requestId,proto3" json:"request_id,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *SendTerminateSignalResponse) Reset() { + *x = SendTerminateSignalResponse{} + mi := &file_csp_v1alpha1_provider_proto_msgTypes[5] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *SendTerminateSignalResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*SendTerminateSignalResponse) ProtoMessage() {} + +func (x *SendTerminateSignalResponse) ProtoReflect() protoreflect.Message { + mi := &file_csp_v1alpha1_provider_proto_msgTypes[5] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use SendTerminateSignalResponse.ProtoReflect.Descriptor instead. 
+func (*SendTerminateSignalResponse) Descriptor() ([]byte, []int) { + return file_csp_v1alpha1_provider_proto_rawDescGZIP(), []int{5} +} + +func (x *SendTerminateSignalResponse) GetRequestId() string { + if x != nil { + return x.RequestId + } + return "" +} + +var File_csp_v1alpha1_provider_proto protoreflect.FileDescriptor + +const file_csp_v1alpha1_provider_proto_rawDesc = "" + + "\n" + + "\x1bcsp/v1alpha1/provider.proto\x12\"nvidia.nvsentinel.janitor.v1alpha1\"6\n" + + "\x17SendRebootSignalRequest\x12\x1b\n" + + "\tnode_name\x18\x01 \x01(\tR\bnodeName\"9\n" + + "\x18SendRebootSignalResponse\x12\x1d\n" + + "\n" + + "request_id\x18\x01 \x01(\tR\trequestId\"P\n" + + "\x12IsNodeReadyRequest\x12\x1b\n" + + "\tnode_name\x18\x01 \x01(\tR\bnodeName\x12\x1d\n" + + "\n" + + "request_id\x18\x02 \x01(\tR\trequestId\"0\n" + + "\x13IsNodeReadyResponse\x12\x19\n" + + "\bis_ready\x18\x01 \x01(\bR\aisReady\"9\n" + + "\x1aSendTerminateSignalRequest\x12\x1b\n" + + "\tnode_name\x18\x01 \x01(\tR\bnodeName\"<\n" + + "\x1bSendTerminateSignalResponse\x12\x1d\n" + + "\n" + + "request_id\x18\x01 \x01(\tR\trequestId2\xc4\x03\n" + + "\x12CSPProviderService\x12\x8f\x01\n" + + "\x10SendRebootSignal\x12;.nvidia.nvsentinel.janitor.v1alpha1.SendRebootSignalRequest\x1a<.nvidia.nvsentinel.janitor.v1alpha1.SendRebootSignalResponse\"\x00\x12\x80\x01\n" + + "\vIsNodeReady\x126.nvidia.nvsentinel.janitor.v1alpha1.IsNodeReadyRequest\x1a7.nvidia.nvsentinel.janitor.v1alpha1.IsNodeReadyResponse\"\x00\x12\x98\x01\n" + + "\x13SendTerminateSignal\x12>.nvidia.nvsentinel.janitor.v1alpha1.SendTerminateSignalRequest\x1a?.nvidia.nvsentinel.janitor.v1alpha1.SendTerminateSignalResponse\"\x00BCZAgithub.com/nvidia/nvsentinel/janitor/api/csp/v1alpha1;cspv1alpha1b\x06proto3" + +var ( + file_csp_v1alpha1_provider_proto_rawDescOnce sync.Once + file_csp_v1alpha1_provider_proto_rawDescData []byte +) + +func file_csp_v1alpha1_provider_proto_rawDescGZIP() []byte { + file_csp_v1alpha1_provider_proto_rawDescOnce.Do(func() { + file_csp_v1alpha1_provider_proto_rawDescData = protoimpl.X.CompressGZIP(unsafe.Slice(unsafe.StringData(file_csp_v1alpha1_provider_proto_rawDesc), len(file_csp_v1alpha1_provider_proto_rawDesc))) + }) + return file_csp_v1alpha1_provider_proto_rawDescData +} + +var file_csp_v1alpha1_provider_proto_msgTypes = make([]protoimpl.MessageInfo, 6) +var file_csp_v1alpha1_provider_proto_goTypes = []any{ + (*SendRebootSignalRequest)(nil), // 0: nvidia.nvsentinel.janitor.v1alpha1.SendRebootSignalRequest + (*SendRebootSignalResponse)(nil), // 1: nvidia.nvsentinel.janitor.v1alpha1.SendRebootSignalResponse + (*IsNodeReadyRequest)(nil), // 2: nvidia.nvsentinel.janitor.v1alpha1.IsNodeReadyRequest + (*IsNodeReadyResponse)(nil), // 3: nvidia.nvsentinel.janitor.v1alpha1.IsNodeReadyResponse + (*SendTerminateSignalRequest)(nil), // 4: nvidia.nvsentinel.janitor.v1alpha1.SendTerminateSignalRequest + (*SendTerminateSignalResponse)(nil), // 5: nvidia.nvsentinel.janitor.v1alpha1.SendTerminateSignalResponse +} +var file_csp_v1alpha1_provider_proto_depIdxs = []int32{ + 0, // 0: nvidia.nvsentinel.janitor.v1alpha1.CSPProviderService.SendRebootSignal:input_type -> nvidia.nvsentinel.janitor.v1alpha1.SendRebootSignalRequest + 2, // 1: nvidia.nvsentinel.janitor.v1alpha1.CSPProviderService.IsNodeReady:input_type -> nvidia.nvsentinel.janitor.v1alpha1.IsNodeReadyRequest + 4, // 2: nvidia.nvsentinel.janitor.v1alpha1.CSPProviderService.SendTerminateSignal:input_type -> nvidia.nvsentinel.janitor.v1alpha1.SendTerminateSignalRequest + 1, // 3: 
nvidia.nvsentinel.janitor.v1alpha1.CSPProviderService.SendRebootSignal:output_type -> nvidia.nvsentinel.janitor.v1alpha1.SendRebootSignalResponse + 3, // 4: nvidia.nvsentinel.janitor.v1alpha1.CSPProviderService.IsNodeReady:output_type -> nvidia.nvsentinel.janitor.v1alpha1.IsNodeReadyResponse + 5, // 5: nvidia.nvsentinel.janitor.v1alpha1.CSPProviderService.SendTerminateSignal:output_type -> nvidia.nvsentinel.janitor.v1alpha1.SendTerminateSignalResponse + 3, // [3:6] is the sub-list for method output_type + 0, // [0:3] is the sub-list for method input_type + 0, // [0:0] is the sub-list for extension type_name + 0, // [0:0] is the sub-list for extension extendee + 0, // [0:0] is the sub-list for field type_name +} + +func init() { file_csp_v1alpha1_provider_proto_init() } +func file_csp_v1alpha1_provider_proto_init() { + if File_csp_v1alpha1_provider_proto != nil { + return + } + type x struct{} + out := protoimpl.TypeBuilder{ + File: protoimpl.DescBuilder{ + GoPackagePath: reflect.TypeOf(x{}).PkgPath(), + RawDescriptor: unsafe.Slice(unsafe.StringData(file_csp_v1alpha1_provider_proto_rawDesc), len(file_csp_v1alpha1_provider_proto_rawDesc)), + NumEnums: 0, + NumMessages: 6, + NumExtensions: 0, + NumServices: 1, + }, + GoTypes: file_csp_v1alpha1_provider_proto_goTypes, + DependencyIndexes: file_csp_v1alpha1_provider_proto_depIdxs, + MessageInfos: file_csp_v1alpha1_provider_proto_msgTypes, + }.Build() + File_csp_v1alpha1_provider_proto = out.File + file_csp_v1alpha1_provider_proto_goTypes = nil + file_csp_v1alpha1_provider_proto_depIdxs = nil +} diff --git a/api/gen/go/csp/v1alpha1/provider_grpc.pb.go b/api/gen/go/csp/v1alpha1/provider_grpc.pb.go new file mode 100644 index 000000000..1e5ea5abf --- /dev/null +++ b/api/gen/go/csp/v1alpha1/provider_grpc.pb.go @@ -0,0 +1,211 @@ +// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by protoc-gen-go-grpc. DO NOT EDIT. +// versions: +// - protoc-gen-go-grpc v1.5.1 +// - protoc v6.33.0 +// source: csp/v1alpha1/provider.proto + +package cspv1alpha1 + +import ( + context "context" + grpc "google.golang.org/grpc" + codes "google.golang.org/grpc/codes" + status "google.golang.org/grpc/status" +) + +// This is a compile-time assertion to ensure that this generated file +// is compatible with the grpc package it is being compiled against. +// Requires gRPC-Go v1.64.0 or later. +const _ = grpc.SupportPackageIsVersion9 + +const ( + CSPProviderService_SendRebootSignal_FullMethodName = "/nvidia.nvsentinel.janitor.v1alpha1.CSPProviderService/SendRebootSignal" + CSPProviderService_IsNodeReady_FullMethodName = "/nvidia.nvsentinel.janitor.v1alpha1.CSPProviderService/IsNodeReady" + CSPProviderService_SendTerminateSignal_FullMethodName = "/nvidia.nvsentinel.janitor.v1alpha1.CSPProviderService/SendTerminateSignal" +) + +// CSPProviderServiceClient is the client API for CSPProviderService service. 
+// +// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream. +type CSPProviderServiceClient interface { + SendRebootSignal(ctx context.Context, in *SendRebootSignalRequest, opts ...grpc.CallOption) (*SendRebootSignalResponse, error) + IsNodeReady(ctx context.Context, in *IsNodeReadyRequest, opts ...grpc.CallOption) (*IsNodeReadyResponse, error) + SendTerminateSignal(ctx context.Context, in *SendTerminateSignalRequest, opts ...grpc.CallOption) (*SendTerminateSignalResponse, error) +} + +type cSPProviderServiceClient struct { + cc grpc.ClientConnInterface +} + +func NewCSPProviderServiceClient(cc grpc.ClientConnInterface) CSPProviderServiceClient { + return &cSPProviderServiceClient{cc} +} + +func (c *cSPProviderServiceClient) SendRebootSignal(ctx context.Context, in *SendRebootSignalRequest, opts ...grpc.CallOption) (*SendRebootSignalResponse, error) { + cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...) + out := new(SendRebootSignalResponse) + err := c.cc.Invoke(ctx, CSPProviderService_SendRebootSignal_FullMethodName, in, out, cOpts...) + if err != nil { + return nil, err + } + return out, nil +} + +func (c *cSPProviderServiceClient) IsNodeReady(ctx context.Context, in *IsNodeReadyRequest, opts ...grpc.CallOption) (*IsNodeReadyResponse, error) { + cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...) + out := new(IsNodeReadyResponse) + err := c.cc.Invoke(ctx, CSPProviderService_IsNodeReady_FullMethodName, in, out, cOpts...) + if err != nil { + return nil, err + } + return out, nil +} + +func (c *cSPProviderServiceClient) SendTerminateSignal(ctx context.Context, in *SendTerminateSignalRequest, opts ...grpc.CallOption) (*SendTerminateSignalResponse, error) { + cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...) + out := new(SendTerminateSignalResponse) + err := c.cc.Invoke(ctx, CSPProviderService_SendTerminateSignal_FullMethodName, in, out, cOpts...) + if err != nil { + return nil, err + } + return out, nil +} + +// CSPProviderServiceServer is the server API for CSPProviderService service. +// All implementations must embed UnimplementedCSPProviderServiceServer +// for forward compatibility. +type CSPProviderServiceServer interface { + SendRebootSignal(context.Context, *SendRebootSignalRequest) (*SendRebootSignalResponse, error) + IsNodeReady(context.Context, *IsNodeReadyRequest) (*IsNodeReadyResponse, error) + SendTerminateSignal(context.Context, *SendTerminateSignalRequest) (*SendTerminateSignalResponse, error) + mustEmbedUnimplementedCSPProviderServiceServer() +} + +// UnimplementedCSPProviderServiceServer must be embedded to have +// forward compatible implementations. +// +// NOTE: this should be embedded by value instead of pointer to avoid a nil +// pointer dereference when methods are called. 
+type UnimplementedCSPProviderServiceServer struct{} + +func (UnimplementedCSPProviderServiceServer) SendRebootSignal(context.Context, *SendRebootSignalRequest) (*SendRebootSignalResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method SendRebootSignal not implemented") +} +func (UnimplementedCSPProviderServiceServer) IsNodeReady(context.Context, *IsNodeReadyRequest) (*IsNodeReadyResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method IsNodeReady not implemented") +} +func (UnimplementedCSPProviderServiceServer) SendTerminateSignal(context.Context, *SendTerminateSignalRequest) (*SendTerminateSignalResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method SendTerminateSignal not implemented") +} +func (UnimplementedCSPProviderServiceServer) mustEmbedUnimplementedCSPProviderServiceServer() {} +func (UnimplementedCSPProviderServiceServer) testEmbeddedByValue() {} + +// UnsafeCSPProviderServiceServer may be embedded to opt out of forward compatibility for this service. +// Use of this interface is not recommended, as added methods to CSPProviderServiceServer will +// result in compilation errors. +type UnsafeCSPProviderServiceServer interface { + mustEmbedUnimplementedCSPProviderServiceServer() +} + +func RegisterCSPProviderServiceServer(s grpc.ServiceRegistrar, srv CSPProviderServiceServer) { + // If the following call pancis, it indicates UnimplementedCSPProviderServiceServer was + // embedded by pointer and is nil. This will cause panics if an + // unimplemented method is ever invoked, so we test this at initialization + // time to prevent it from happening at runtime later due to I/O. + if t, ok := srv.(interface{ testEmbeddedByValue() }); ok { + t.testEmbeddedByValue() + } + s.RegisterService(&CSPProviderService_ServiceDesc, srv) +} + +func _CSPProviderService_SendRebootSignal_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(SendRebootSignalRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(CSPProviderServiceServer).SendRebootSignal(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: CSPProviderService_SendRebootSignal_FullMethodName, + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(CSPProviderServiceServer).SendRebootSignal(ctx, req.(*SendRebootSignalRequest)) + } + return interceptor(ctx, in, info, handler) +} + +func _CSPProviderService_IsNodeReady_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(IsNodeReadyRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(CSPProviderServiceServer).IsNodeReady(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: CSPProviderService_IsNodeReady_FullMethodName, + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(CSPProviderServiceServer).IsNodeReady(ctx, req.(*IsNodeReadyRequest)) + } + return interceptor(ctx, in, info, handler) +} + +func _CSPProviderService_SendTerminateSignal_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(SendTerminateSignalRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return 
srv.(CSPProviderServiceServer).SendTerminateSignal(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: CSPProviderService_SendTerminateSignal_FullMethodName, + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(CSPProviderServiceServer).SendTerminateSignal(ctx, req.(*SendTerminateSignalRequest)) + } + return interceptor(ctx, in, info, handler) +} + +// CSPProviderService_ServiceDesc is the grpc.ServiceDesc for CSPProviderService service. +// It's only intended for direct use with grpc.RegisterService, +// and not to be introspected or modified (even as a copy) +var CSPProviderService_ServiceDesc = grpc.ServiceDesc{ + ServiceName: "nvidia.nvsentinel.janitor.v1alpha1.CSPProviderService", + HandlerType: (*CSPProviderServiceServer)(nil), + Methods: []grpc.MethodDesc{ + { + MethodName: "SendRebootSignal", + Handler: _CSPProviderService_SendRebootSignal_Handler, + }, + { + MethodName: "IsNodeReady", + Handler: _CSPProviderService_IsNodeReady_Handler, + }, + { + MethodName: "SendTerminateSignal", + Handler: _CSPProviderService_SendTerminateSignal_Handler, + }, + }, + Streams: []grpc.StreamDesc{}, + Metadata: "csp/v1alpha1/provider.proto", +} diff --git a/api/proto/csp/v1alpha1/provider.proto b/api/proto/csp/v1alpha1/provider.proto new file mode 100644 index 000000000..51b6875ef --- /dev/null +++ b/api/proto/csp/v1alpha1/provider.proto @@ -0,0 +1,49 @@ +// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +syntax = "proto3"; +package nvidia.nvsentinel.janitor.v1alpha1; + +option go_package = "github.com/nvidia/nvsentinel/janitor/api/csp/v1alpha1;cspv1alpha1"; + +service CSPProviderService { + rpc SendRebootSignal(SendRebootSignalRequest) returns (SendRebootSignalResponse) {} + rpc IsNodeReady(IsNodeReadyRequest) returns (IsNodeReadyResponse) {} + rpc SendTerminateSignal(SendTerminateSignalRequest) returns (SendTerminateSignalResponse) {} +} + +message SendRebootSignalRequest { + string node_name = 1; +} + +message SendRebootSignalResponse { + string request_id = 1; +} + +message IsNodeReadyRequest { + string node_name = 1; + string request_id = 2; +} + +message IsNodeReadyResponse { + bool is_ready = 1; +} + +message SendTerminateSignalRequest { + string node_name = 1; +} + +message SendTerminateSignalResponse { + string request_id = 1; +} diff --git a/distros/kubernetes/nvsentinel/Chart.yaml b/distros/kubernetes/nvsentinel/Chart.yaml index 14a586f2d..af7ac2670 100644 --- a/distros/kubernetes/nvsentinel/Chart.yaml +++ b/distros/kubernetes/nvsentinel/Chart.yaml @@ -55,6 +55,9 @@ dependencies: - name: janitor version: "0.1.0" condition: global.janitor.enabled + - name: janitor-provider + version: "0.1.0" + condition: global.janitorProvider.enabled - name: metadata-collector version: "0.1.0" condition: global.metadataCollector.enabled diff --git a/distros/kubernetes/nvsentinel/charts/janitor-provider/Chart.yaml b/distros/kubernetes/nvsentinel/charts/janitor-provider/Chart.yaml new file mode 100644 index 000000000..57fa53d06 --- /dev/null +++ b/distros/kubernetes/nvsentinel/charts/janitor-provider/Chart.yaml @@ -0,0 +1,19 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v2 +name: janitor-provider +description: Example provider for NVSentinel Janitor +type: application +version: 0.1.0 diff --git a/distros/kubernetes/nvsentinel/charts/janitor-provider/templates/_helpers.tpl b/distros/kubernetes/nvsentinel/charts/janitor-provider/templates/_helpers.tpl new file mode 100644 index 000000000..e43ceab3d --- /dev/null +++ b/distros/kubernetes/nvsentinel/charts/janitor-provider/templates/_helpers.tpl @@ -0,0 +1,62 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "provider.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. 
+*/}} +{{- define "provider.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "provider.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "provider.labels" -}} +helm.sh/chart: {{ include "provider.chart" . }} +{{ include "provider.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "provider.selectorLabels" -}} +app.kubernetes.io/name: {{ include "provider.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "provider.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "provider.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} diff --git a/distros/kubernetes/nvsentinel/charts/janitor-provider/templates/clusterrole.yaml b/distros/kubernetes/nvsentinel/charts/janitor-provider/templates/clusterrole.yaml new file mode 100644 index 000000000..bb9bc1912 --- /dev/null +++ b/distros/kubernetes/nvsentinel/charts/janitor-provider/templates/clusterrole.yaml @@ -0,0 +1,28 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "provider.fullname" . }} + labels: + {{- include "provider.labels" . | nindent 4 }} +rules: + - apiGroups: + - "" + resources: + - nodes + verbs: + - get + - list + - watch diff --git a/distros/kubernetes/nvsentinel/charts/janitor-provider/templates/clusterrolebinding.yaml b/distros/kubernetes/nvsentinel/charts/janitor-provider/templates/clusterrolebinding.yaml new file mode 100644 index 000000000..f8670376c --- /dev/null +++ b/distros/kubernetes/nvsentinel/charts/janitor-provider/templates/clusterrolebinding.yaml @@ -0,0 +1,27 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "provider.fullname" . }} + labels: + {{- include "provider.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ include "provider.fullname" . }} +subjects: + - kind: ServiceAccount + name: {{ include "provider.fullname" . }} + namespace: {{ .Release.Namespace }} diff --git a/distros/kubernetes/nvsentinel/charts/janitor-provider/templates/deployment.yaml b/distros/kubernetes/nvsentinel/charts/janitor-provider/templates/deployment.yaml new file mode 100644 index 000000000..5ef932f46 --- /dev/null +++ b/distros/kubernetes/nvsentinel/charts/janitor-provider/templates/deployment.yaml @@ -0,0 +1,168 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "provider.fullname" . }} + labels: + {{- include "provider.labels" . | nindent 4}} +spec: + replicas: {{ .Values.replicaCount }} + selector: + matchLabels: + {{- include "provider.selectorLabels" . | nindent 6 }} + template: + metadata: + {{- with ((.Values.global).podAnnotations | default .Values.podAnnotations) }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + {{- include "provider.selectorLabels" . | nindent 8 }} + {{- if eq (.Values.csp.provider | default "kind") "azure" }} + {{- if .Values.csp.azure.clientId }} + # Azure Workload Identity label (required for pod identity) + azure.workload.identity/use: "true" + {{- end }} + {{- end }} + spec: + {{- with ((.Values.global).imagePullSecrets | default .Values.imagePullSecrets) }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "provider.serviceAccountName" . 
}} + containers: + - name: janitor-provider + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default ((.Values.global).image).tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + resources: + {{- toYaml .Values.resources | nindent 12 }} + ports: + - name: metrics + containerPort: {{ ((.Values.global).metricsPort) | default "2112" }} + - name: service + containerPort: {{ .Values.service.port | default "50051" }} + env: + - name: JANITOR_PROVIDER_PORT + value: {{ .Values.service.port | default 50051 | quote }} + - name: METRICS_PORT + value: {{ ((.Values.global).metricsPort) | default 2112 | quote }} + # Cloud Service Provider configuration + - name: CSP + value: {{ .Values.csp.provider | default "kind" | quote }} + {{- if eq (.Values.csp.provider | default "kind") "aws" }} + # AWS-specific environment variables + {{- if .Values.csp.aws.region }} + - name: AWS_REGION + value: {{ .Values.csp.aws.region | quote }} + {{- end }} + {{- end }} + {{- if eq (.Values.csp.provider | default "kind") "gcp" }} + # GCP-specific environment variables + {{- if .Values.csp.gcp.project }} + - name: GCP_PROJECT + value: {{ .Values.csp.gcp.project | quote }} + {{- end }} + {{- if .Values.csp.gcp.zone }} + - name: GCP_ZONE + value: {{ .Values.csp.gcp.zone | quote }} + {{- end }} + {{- end }} + {{- if eq (.Values.csp.provider | default "kind") "azure" }} + # Azure-specific environment variables + {{- if .Values.csp.azure.subscriptionId }} + - name: AZURE_SUBSCRIPTION_ID + value: {{ .Values.csp.azure.subscriptionId | quote }} + {{- end }} + {{- if .Values.csp.azure.resourceGroup }} + - name: AZURE_RESOURCE_GROUP + value: {{ .Values.csp.azure.resourceGroup | quote }} + {{- end }} + {{- if .Values.csp.azure.location }} + - name: AZURE_LOCATION + value: {{ .Values.csp.azure.location | quote }} + {{- end }} + {{- end }} + {{- if eq (.Values.csp.provider | default "kind") "oci" }} + # OCI-specific environment variables + {{- if .Values.csp.oci.region }} + - name: OCI_REGION + value: {{ .Values.csp.oci.region | quote }} + {{- end }} + {{- if .Values.csp.oci.compartment }} + - name: OCI_COMPARTMENT + value: {{ .Values.csp.oci.compartment | quote }} + {{- end }} + {{- if .Values.csp.oci.credentialsFile }} + - name: OCI_CREDENTIALS_FILE + value: {{ .Values.csp.oci.credentialsFile | quote }} + {{- end }} + {{- if .Values.csp.oci.profile }} + - name: OCI_PROFILE + value: {{ .Values.csp.oci.profile | quote }} + {{- end }} + {{- end }} + {{- if eq (.Values.csp.provider | default "kind") "nebius" }} + # Nebius-specific environment variables + {{- if .Values.csp.nebius.serviceAccountKeySecret }} + # When using serviceAccountKeySecret, the key is mounted at /etc/nebius/sa-credentials.json + - name: NEBIUS_SA_KEY_FILE + value: "/etc/nebius/sa-credentials.json" + {{- else if .Values.csp.nebius.serviceAccountKeyFile }} + - name: NEBIUS_SA_KEY_FILE + value: {{ .Values.csp.nebius.serviceAccountKeyFile | quote }} + {{- end }} + {{- /* iamTokenSecretRef takes precedence over iamToken if both are set */}} + {{- if .Values.csp.nebius.iamTokenSecretRef }} + - name: NEBIUS_IAM_TOKEN + valueFrom: + secretKeyRef: + name: {{ .Values.csp.nebius.iamTokenSecretRef.name }} + key: {{ .Values.csp.nebius.iamTokenSecretRef.key | default "token" }} + {{- else if .Values.csp.nebius.iamToken }} + - name: NEBIUS_IAM_TOKEN + value: {{ .Values.csp.nebius.iamToken | quote }} + {{- end }} + {{- end }} + {{- with .Values.extraEnv }} + {{- toYaml . 
| nindent 12 }} + {{- end }} + readinessProbe: + {{- toYaml .Values.readinessProbe | nindent 12 }} + {{- if and (eq (.Values.csp.provider | default "kind") "nebius") .Values.csp.nebius.serviceAccountKeySecret }} + volumeMounts: + - name: nebius-sa-key + mountPath: /etc/nebius + readOnly: true + {{- end }} + {{- if and (eq (.Values.csp.provider | default "kind") "nebius") .Values.csp.nebius.serviceAccountKeySecret }} + volumes: + - name: nebius-sa-key + secret: + secretName: {{ .Values.csp.nebius.serviceAccountKeySecret }} + defaultMode: 420 + {{- end }} + restartPolicy: Always + {{- with (((.Values.global).systemNodeSelector) | default .Values.nodeSelector) }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with (((.Values.global).affinity) | default .Values.affinity) }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with (((.Values.global).systemNodeTolerations) | default .Values.tolerations) }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} diff --git a/distros/kubernetes/nvsentinel/charts/janitor-provider/templates/networkpolicy.yaml b/distros/kubernetes/nvsentinel/charts/janitor-provider/templates/networkpolicy.yaml new file mode 100644 index 000000000..a5866759f --- /dev/null +++ b/distros/kubernetes/nvsentinel/charts/janitor-provider/templates/networkpolicy.yaml @@ -0,0 +1,30 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: {{ include "provider.fullname" . }}-allow-service-traffic + labels: + {{- include "provider.labels" . | nindent 4 }} +spec: + podSelector: + matchLabels: + {{- include "provider.selectorLabels" . | nindent 6 }} + policyTypes: + - Ingress + ingress: + - ports: + - port: {{ .Values.service.port | default 50051 }} + protocol: TCP + diff --git a/distros/kubernetes/nvsentinel/charts/janitor-provider/templates/service.yaml b/distros/kubernetes/nvsentinel/charts/janitor-provider/templates/service.yaml new file mode 100644 index 000000000..6f3576c28 --- /dev/null +++ b/distros/kubernetes/nvsentinel/charts/janitor-provider/templates/service.yaml @@ -0,0 +1,30 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +apiVersion: v1 +kind: Service +metadata: + name: {{ include "provider.fullname" . }} + labels: + {{- include "provider.labels" . 
| nindent 4}} +spec: + type: {{ .Values.service.type | default "ClusterIP" }} + selector: + {{- include "provider.selectorLabels" . | nindent 4 }} + ports: + - name: service + port: {{ .Values.service.port | default 50051 }} + targetPort: {{ .Values.service.port | default 50051 }} + - name: metrics + port: {{ ((.Values.global).metricsPort) | default 2112 }} + targetPort: {{ ((.Values.global).metricsPort) | default 2112 }} diff --git a/distros/kubernetes/nvsentinel/charts/janitor-provider/templates/serviceaccount.yaml b/distros/kubernetes/nvsentinel/charts/janitor-provider/templates/serviceaccount.yaml new file mode 100644 index 000000000..ca920e554 --- /dev/null +++ b/distros/kubernetes/nvsentinel/charts/janitor-provider/templates/serviceaccount.yaml @@ -0,0 +1,49 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "provider.serviceAccountName" . }} + labels: + {{- include "provider.labels" . | nindent 4 }} + annotations: + {{- with .Values.serviceAccount.annotations }} + {{- toYaml . | nindent 4 }} + {{- end }} + {{- if eq (.Values.csp.provider | default "kind") "aws" }} + {{- if and .Values.csp.aws.accountId .Values.csp.aws.iamRoleName }} + # AWS IRSA (IAM Roles for Service Accounts) annotation + eks.amazonaws.com/role-arn: arn:aws:iam::{{ .Values.csp.aws.accountId }}:role/{{ .Values.csp.aws.iamRoleName }} + {{- end }} + {{- end }} + {{- if eq (.Values.csp.provider | default "kind") "gcp" }} + {{- if and .Values.csp.gcp.project .Values.csp.gcp.serviceAccount }} + # GCP Workload Identity annotation + iam.gke.io/gcp-service-account: {{ .Values.csp.gcp.serviceAccount }}@{{ .Values.csp.gcp.project }}.iam.gserviceaccount.com + {{- end }} + {{- end }} + {{- if eq (.Values.csp.provider | default "kind") "azure" }} + {{- if .Values.csp.azure.clientId }} + # Azure Workload Identity annotations + azure.workload.identity/client-id: {{ .Values.csp.azure.clientId | quote }} + azure.workload.identity/use: "true" + {{- end }} + {{- end }} + {{- if eq (.Values.csp.provider | default "kind") "oci" }} + {{- if and .Values.csp.oci.compartment .Values.csp.oci.principalId }} + # OCI Workload Identity annotations + oke.oraclecloud.com/compartment-ocid: {{ .Values.csp.oci.compartment | quote }} + oke.oraclecloud.com/principal-ocid: {{ .Values.csp.oci.principalId | quote }} + {{- end }} + {{- end }} diff --git a/distros/kubernetes/nvsentinel/charts/janitor-provider/values.yaml b/distros/kubernetes/nvsentinel/charts/janitor-provider/values.yaml new file mode 100644 index 000000000..dc963ba94 --- /dev/null +++ b/distros/kubernetes/nvsentinel/charts/janitor-provider/values.yaml @@ -0,0 +1,191 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +replicaCount: 1 + +# Image configuration +image: + repository: ghcr.io/nvidia/nvsentinel/janitor-provider + pullPolicy: IfNotPresent + tag: "" + +nameOverride: "" +fullnameOverride: "janitor-provider" + +serviceAccount: + # Specifies whether a service account should be created + create: true + # Automatically mount a ServiceAccount's API credentials? + automount: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + name: "" + +podAnnotations: {} +podLabels: {} + +resources: {} + # We usually recommend not to specify default resources and to leave this as a conscious + # choice for the user. This also increases chances charts run on environments with little + # resources, such as Minikube. If you do want to specify resources, uncomment the following + # lines, adjust them as necessary, and remove the curly braces after 'resources:'. + # limits: + # cpu: 100m + # memory: 128Mi + # requests: + # cpu: 100m + # memory: 128Mi + +readinessProbe: + httpGet: + path: /healthz + port: metrics + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 3 + +nodeSelector: {} + +tolerations: [] + +affinity: {} + +service: + port: 50051 + +extraEnv: [] + +# Cloud Service Provider (CSP) Configuration +# The janitor-provider module supports multiple cloud providers for node reboot operations +# Configure the appropriate CSP for your environment +csp: + # CSP provider type (kind, kwok, aws, gcp, azure, oci, nebius) + # - kind: For local development with kind clusters (simulated reboots) + # - kwok: For testing with kwok (simulated nodes) + # - aws: For AWS EKS clusters + # - gcp: For Google Cloud GKE clusters + # - azure: For Microsoft Azure AKS clusters + # - oci: For Oracle Cloud Infrastructure OKE clusters + # - nebius: For Nebius Managed Kubernetes (MK8s) clusters + provider: "kind" + + # AWS-specific configuration (only required when provider=aws) + aws: + # AWS region where the EKS cluster is running + # This is required for EC2 API calls to reboot nodes + # Example: us-east-1, us-west-2, eu-west-1 + region: "" + # AWS Account ID (required for IRSA - IAM Roles for Service Accounts) + # Example: "123456789012" + accountId: "" + # IAM Role name for the janitor service account (IRSA) + # This role should have permissions to call EC2 RebootInstances + # Example: "eks-cluster-name-janitor" or "nvsentinel-janitor-role" + iamRoleName: "" + # Note: AWS credentials are typically provided via: + # - IAM roles for service accounts (IRSA) - RECOMMENDED for EKS + # Requires accountId and iamRoleName to be set + # The IAM role must have ec2:RebootInstances permission + + # Google Cloud Platform (GCP) specific configuration (only required when provider=gcp) + gcp: + # GCP project ID where the GKE cluster is running + # This is used for Compute Engine API calls + project: "" + # GCP zone where the GKE cluster nodes are located + # Example: us-central1-a, us-west1-b, europe-west1-c + zone: "" + # GCP Service Account email for Workload 
Identity (without @project.iam.gserviceaccount.com) + # Example: "nvsentinel-janitor" (will be formatted as nvsentinel-janitor@project-id.iam.gserviceaccount.com) + # This service account must have compute.instances.reset permission + serviceAccount: "" + # Note: GCP credentials are typically provided via: + # - Workload Identity - RECOMMENDED for GKE + # Requires project and serviceAccount to be set + # The GCP SA must have roles/compute.instanceAdmin.v1 or compute.instances.reset permission + # Must bind Kubernetes SA to GCP SA: gcloud iam service-accounts add-iam-policy-binding + + # Microsoft Azure specific configuration (only required when provider=azure) + azure: + # Azure subscription ID containing the AKS cluster + # Required for Azure Resource Manager API calls + subscriptionId: "" + # Azure resource group name containing the AKS cluster + # This can often be auto-detected from node provider IDs + resourceGroup: "" + # Azure region/location where the AKS cluster is deployed + # Example: eastus, westus2, westeurope + location: "" + # Azure Managed Identity Client ID for Workload Identity + # Example: "12345678-1234-1234-1234-123456789012" + # This managed identity must have Virtual Machine Contributor role or restart permission + clientId: "" + # Note: Azure credentials are typically provided via: + # - Managed Identity (Workload Identity) - RECOMMENDED for AKS + # Requires clientId to be set + # The managed identity must have Microsoft.Compute/virtualMachines/restart/action permission + # Must create federated identity credential for the Kubernetes service account + + # Oracle Cloud Infrastructure (OCI) specific configuration (only required when provider=oci) + oci: + # OCI region where the OKE cluster is running + # Example: us-ashburn-1, us-phoenix-1, uk-london-1 + region: "" + # OCI compartment OCID containing the cluster resources + # Format: ocid1.compartment.oc1..aaa... + compartment: "" + # Path to OCI credentials file (optional, uses workload identity by default) + credentialsFile: "" + # OCI profile name in credentials file (default: DEFAULT) + profile: "DEFAULT" + # OCI Principal OCID for Workload Identity + # Format: ocid1.principal.oc1..aaa... 
+ # This is the OCID of the dynamic group or service principal + principalId: "" + # Note: OCI credentials are typically provided via: + # - OKE Workload Identity - RECOMMENDED for OKE (default) + # Requires compartment and principalId to be set + # The principal must have manage instance-family permission in compartment + # Must create dynamic group with matching rule for the pod + + # Nebius Managed Kubernetes (MK8s) specific configuration (only required when provider=nebius) + # Uses official Nebius Go SDK: github.com/nebius/gosdk + nebius: + # Path to service account key file (recommended for production) + # JSON format as documented at: https://github.com/nebius/gosdk#using-a-json-credentials-file + # Example: "/etc/nebius/sa-credentials.json" + # Set via NEBIUS_SA_KEY_FILE environment variable + serviceAccountKeyFile: "" + # Secret containing service account key (recommended for production with Kubernetes) + # The secret should contain the key file as 'sa-credentials.json' + # When set, automatically mounts at /etc/nebius and sets NEBIUS_SA_KEY_FILE + serviceAccountKeySecret: "" + # Direct IAM token (for testing only) + # Tokens expire and require manual refresh - not recommended for production + # Set via NEBIUS_IAM_TOKEN environment variable + # Obtain token: nebius iam get-access-token + iamToken: "" + # Reference to secret containing IAM token (alternative to iamToken value) + # Example: + # iamTokenSecretRef: + # name: nebius-janitor-token + # key: token + iamTokenSecretRef: {} + # Note: Nebius credentials are provided via environment variables: + # - NEBIUS_SA_KEY_FILE: Path to service account credentials JSON file (recommended) + # The service account must have compute.instances.stop and compute.instances.start permissions + # - NEBIUS_IAM_TOKEN: Direct IAM token (for testing only) diff --git a/distros/kubernetes/nvsentinel/charts/janitor/templates/configmap.yaml b/distros/kubernetes/nvsentinel/charts/janitor/templates/configmap.yaml index 4b6ab4274..688af360a 100644 --- a/distros/kubernetes/nvsentinel/charts/janitor/templates/configmap.yaml +++ b/distros/kubernetes/nvsentinel/charts/janitor/templates/configmap.yaml @@ -47,13 +47,20 @@ data: {{- end }} {{- end }} {{- end }} + cspProviderHost: {{ .Values.config.cspProviderHost }} rebootNodeController: enabled: {{ if (hasKey .Values.config.controllers.rebootNode "enabled") }}{{ .Values.config.controllers.rebootNode.enabled }}{{ else }}true{{ end }} timeout: {{ .Values.config.controllers.rebootNode.timeout | default .Values.config.timeout | default "25m" }} manualMode: {{ .Values.config.manualMode | default false }} - + {{- if .Values.config.controllers.rebootNode.cspProviderHost }} + cspProviderHost: {{ .Values.config.controllers.rebootNode.cspProviderHost }} + {{- end }} + terminateNodeController: enabled: {{ if (hasKey .Values.config.controllers.terminateNode "enabled") }}{{ .Values.config.controllers.terminateNode.enabled }}{{ else }}true{{ end }} timeout: {{ .Values.config.controllers.terminateNode.timeout | default .Values.config.timeout | default "25m" }} manualMode: {{ .Values.config.manualMode | default false }} + {{- if .Values.config.controllers.terminateNode.cspProviderHost }} + cspProviderHost: {{ .Values.config.controllers.terminateNode.cspProviderHost }} + {{- end }} diff --git a/distros/kubernetes/nvsentinel/charts/janitor/templates/deployment.yaml b/distros/kubernetes/nvsentinel/charts/janitor/templates/deployment.yaml index f31747241..8be30f597 100755 --- 
a/distros/kubernetes/nvsentinel/charts/janitor/templates/deployment.yaml +++ b/distros/kubernetes/nvsentinel/charts/janitor/templates/deployment.yaml @@ -30,12 +30,6 @@ spec: {{- end }} labels: {{- include "janitor.selectorLabels" . | nindent 8 }} - {{- if eq (.Values.csp.provider | default "kind") "azure" }} - {{- if .Values.csp.azure.clientId }} - # Azure Workload Identity label (required for pod identity) - azure.workload.identity/use: "true" - {{- end }} - {{- end }} spec: {{- with ((.Values.global).imagePullSecrets | default .Values.imagePullSecrets) }} imagePullSecrets: @@ -75,83 +69,6 @@ spec: {{- if .Values.global.auditLogging.enabled }} {{- include "nvsentinel.auditLogging.envVars" . | nindent 12 }} {{- end }} - # Cloud Service Provider configuration - - name: CSP - value: {{ .Values.csp.provider | default "kind" | quote }} - {{- if eq (.Values.csp.provider | default "kind") "aws" }} - # AWS-specific environment variables - {{- if .Values.csp.aws.region }} - - name: AWS_REGION - value: {{ .Values.csp.aws.region | quote }} - {{- end }} - {{- end }} - {{- if eq (.Values.csp.provider | default "kind") "gcp" }} - # GCP-specific environment variables - {{- if .Values.csp.gcp.project }} - - name: GCP_PROJECT - value: {{ .Values.csp.gcp.project | quote }} - {{- end }} - {{- if .Values.csp.gcp.zone }} - - name: GCP_ZONE - value: {{ .Values.csp.gcp.zone | quote }} - {{- end }} - {{- end }} - {{- if eq (.Values.csp.provider | default "kind") "azure" }} - # Azure-specific environment variables - {{- if .Values.csp.azure.subscriptionId }} - - name: AZURE_SUBSCRIPTION_ID - value: {{ .Values.csp.azure.subscriptionId | quote }} - {{- end }} - {{- if .Values.csp.azure.resourceGroup }} - - name: AZURE_RESOURCE_GROUP - value: {{ .Values.csp.azure.resourceGroup | quote }} - {{- end }} - {{- if .Values.csp.azure.location }} - - name: AZURE_LOCATION - value: {{ .Values.csp.azure.location | quote }} - {{- end }} - {{- end }} - {{- if eq (.Values.csp.provider | default "kind") "oci" }} - # OCI-specific environment variables - {{- if .Values.csp.oci.region }} - - name: OCI_REGION - value: {{ .Values.csp.oci.region | quote }} - {{- end }} - {{- if .Values.csp.oci.compartment }} - - name: OCI_COMPARTMENT - value: {{ .Values.csp.oci.compartment | quote }} - {{- end }} - {{- if .Values.csp.oci.credentialsFile }} - - name: OCI_CREDENTIALS_FILE - value: {{ .Values.csp.oci.credentialsFile | quote }} - {{- end }} - {{- if .Values.csp.oci.profile }} - - name: OCI_PROFILE - value: {{ .Values.csp.oci.profile | quote }} - {{- end }} - {{- end }} - {{- if eq (.Values.csp.provider | default "kind") "nebius" }} - # Nebius-specific environment variables - {{- if .Values.csp.nebius.serviceAccountKeySecret }} - # When using serviceAccountKeySecret, the key is mounted at /etc/nebius/sa-credentials.json - - name: NEBIUS_SA_KEY_FILE - value: "/etc/nebius/sa-credentials.json" - {{- else if .Values.csp.nebius.serviceAccountKeyFile }} - - name: NEBIUS_SA_KEY_FILE - value: {{ .Values.csp.nebius.serviceAccountKeyFile | quote }} - {{- end }} - {{- /* iamTokenSecretRef takes precedence over iamToken if both are set */}} - {{- if .Values.csp.nebius.iamTokenSecretRef }} - - name: NEBIUS_IAM_TOKEN - valueFrom: - secretKeyRef: - name: {{ .Values.csp.nebius.iamTokenSecretRef.name }} - key: {{ .Values.csp.nebius.iamTokenSecretRef.key | default "token" }} - {{- else if .Values.csp.nebius.iamToken }} - - name: NEBIUS_IAM_TOKEN - value: {{ .Values.csp.nebius.iamToken | quote }} - {{- end }} - {{- end }} {{- with .Values.extraEnv }} {{- 
toYaml . | nindent 12 }} {{- end }} @@ -179,11 +96,6 @@ spec: {{- if .Values.global.auditLogging.enabled }} {{- include "nvsentinel.auditLogging.volumeMount" . | nindent 12 }} {{- end }} - {{- if and (eq (.Values.csp.provider | default "kind") "nebius") .Values.csp.nebius.serviceAccountKeySecret }} - - name: nebius-sa-key - mountPath: /etc/nebius - readOnly: true - {{- end }} restartPolicy: Always volumes: - name: config @@ -209,12 +121,6 @@ spec: {{- if .Values.global.auditLogging.enabled }} {{- include "nvsentinel.auditLogging.volume" . | nindent 8 }} {{- end }} - {{- if and (eq (.Values.csp.provider | default "kind") "nebius") .Values.csp.nebius.serviceAccountKeySecret }} - - name: nebius-sa-key - secret: - secretName: {{ .Values.csp.nebius.serviceAccountKeySecret }} - defaultMode: 420 - {{- end }} {{- with (((.Values.global).systemNodeSelector) | default .Values.nodeSelector) }} nodeSelector: {{- toYaml . | nindent 8 }} diff --git a/distros/kubernetes/nvsentinel/charts/janitor/templates/serviceaccount.yaml b/distros/kubernetes/nvsentinel/charts/janitor/templates/serviceaccount.yaml index d9a3a66ca..0724c9d69 100644 --- a/distros/kubernetes/nvsentinel/charts/janitor/templates/serviceaccount.yaml +++ b/distros/kubernetes/nvsentinel/charts/janitor/templates/serviceaccount.yaml @@ -18,32 +18,6 @@ metadata: labels: {{- include "janitor.labels" . | nindent 4 }} annotations: - {{- if eq (.Values.csp.provider | default "kind") "aws" }} - {{- if and .Values.csp.aws.accountId .Values.csp.aws.iamRoleName }} - # AWS IRSA (IAM Roles for Service Accounts) annotation - eks.amazonaws.com/role-arn: arn:aws:iam::{{ .Values.csp.aws.accountId }}:role/{{ .Values.csp.aws.iamRoleName }} - {{- end }} - {{- end }} - {{- if eq (.Values.csp.provider | default "kind") "gcp" }} - {{- if and .Values.csp.gcp.project .Values.csp.gcp.serviceAccount }} - # GCP Workload Identity annotation - iam.gke.io/gcp-service-account: {{ .Values.csp.gcp.serviceAccount }}@{{ .Values.csp.gcp.project }}.iam.gserviceaccount.com - {{- end }} - {{- end }} - {{- if eq (.Values.csp.provider | default "kind") "azure" }} - {{- if .Values.csp.azure.clientId }} - # Azure Workload Identity annotations - azure.workload.identity/client-id: {{ .Values.csp.azure.clientId | quote }} - azure.workload.identity/use: "true" - {{- end }} - {{- end }} - {{- if eq (.Values.csp.provider | default "kind") "oci" }} - {{- if and .Values.csp.oci.compartment .Values.csp.oci.principalId }} - # OCI Workload Identity annotations - oke.oraclecloud.com/compartment-ocid: {{ .Values.csp.oci.compartment | quote }} - oke.oraclecloud.com/principal-ocid: {{ .Values.csp.oci.principalId | quote }} - {{- end }} - {{- end }} {{- with .Values.serviceAccount.annotations }} {{- toYaml . 
| nindent 4 }} {{- end }} diff --git a/distros/kubernetes/nvsentinel/charts/janitor/values.yaml b/distros/kubernetes/nvsentinel/charts/janitor/values.yaml index e4bca1a63..537bea9c7 100644 --- a/distros/kubernetes/nvsentinel/charts/janitor/values.yaml +++ b/distros/kubernetes/nvsentinel/charts/janitor/values.yaml @@ -137,127 +137,6 @@ config: # If not set or set to empty, defaults to config.timeout (25m) timeout: "25m" -# Cloud Service Provider (CSP) Configuration -# The janitor module supports multiple cloud providers for node reboot operations -# Configure the appropriate CSP for your environment -csp: - # CSP provider type (kind, kwok, aws, gcp, azure, oci, nebius) - # - kind: For local development with kind clusters (simulated reboots) - # - kwok: For testing with kwok (simulated nodes) - # - aws: For AWS EKS clusters - # - gcp: For Google Cloud GKE clusters - # - azure: For Microsoft Azure AKS clusters - # - oci: For Oracle Cloud Infrastructure OKE clusters - # - nebius: For Nebius Managed Kubernetes (MK8s) clusters - provider: "kind" - - # AWS-specific configuration (only required when provider=aws) - aws: - # AWS region where the EKS cluster is running - # This is required for EC2 API calls to reboot nodes - # Example: us-east-1, us-west-2, eu-west-1 - region: "" - # AWS Account ID (required for IRSA - IAM Roles for Service Accounts) - # Example: "123456789012" - accountId: "" - # IAM Role name for the janitor service account (IRSA) - # This role should have permissions to call EC2 RebootInstances - # Example: "eks-cluster-name-janitor" or "nvsentinel-janitor-role" - iamRoleName: "" - # Note: AWS credentials are typically provided via: - # - IAM roles for service accounts (IRSA) - RECOMMENDED for EKS - # Requires accountId and iamRoleName to be set - # The IAM role must have ec2:RebootInstances permission - - # Google Cloud Platform (GCP) specific configuration (only required when provider=gcp) - gcp: - # GCP project ID where the GKE cluster is running - # This is used for Compute Engine API calls - project: "" - # GCP zone where the GKE cluster nodes are located - # Example: us-central1-a, us-west1-b, europe-west1-c - zone: "" - # GCP Service Account email for Workload Identity (without @project.iam.gserviceaccount.com) - # Example: "nvsentinel-janitor" (will be formatted as nvsentinel-janitor@project-id.iam.gserviceaccount.com) - # This service account must have compute.instances.reset permission - serviceAccount: "" - # Note: GCP credentials are typically provided via: - # - Workload Identity - RECOMMENDED for GKE - # Requires project and serviceAccount to be set - # The GCP SA must have roles/compute.instanceAdmin.v1 or compute.instances.reset permission - # Must bind Kubernetes SA to GCP SA: gcloud iam service-accounts add-iam-policy-binding - - # Microsoft Azure specific configuration (only required when provider=azure) - azure: - # Azure subscription ID containing the AKS cluster - # Required for Azure Resource Manager API calls - subscriptionId: "" - # Azure resource group name containing the AKS cluster - # This can often be auto-detected from node provider IDs - resourceGroup: "" - # Azure region/location where the AKS cluster is deployed - # Example: eastus, westus2, westeurope - location: "" - # Azure Managed Identity Client ID for Workload Identity - # Example: "12345678-1234-1234-1234-123456789012" - # This managed identity must have Virtual Machine Contributor role or restart permission - clientId: "" - # Note: Azure credentials are typically provided via: - # - 
Managed Identity (Workload Identity) - RECOMMENDED for AKS - # Requires clientId to be set - # The managed identity must have Microsoft.Compute/virtualMachines/restart/action permission - # Must create federated identity credential for the Kubernetes service account - - # Oracle Cloud Infrastructure (OCI) specific configuration (only required when provider=oci) - oci: - # OCI region where the OKE cluster is running - # Example: us-ashburn-1, us-phoenix-1, uk-london-1 - region: "" - # OCI compartment OCID containing the cluster resources - # Format: ocid1.compartment.oc1..aaa... - compartment: "" - # Path to OCI credentials file (optional, uses workload identity by default) - credentialsFile: "" - # OCI profile name in credentials file (default: DEFAULT) - profile: "DEFAULT" - # OCI Principal OCID for Workload Identity - # Format: ocid1.principal.oc1..aaa... - # This is the OCID of the dynamic group or service principal - principalId: "" - # Note: OCI credentials are typically provided via: - # - OKE Workload Identity - RECOMMENDED for OKE (default) - # Requires compartment and principalId to be set - # The principal must have manage instance-family permission in compartment - # Must create dynamic group with matching rule for the pod - - # Nebius Managed Kubernetes (MK8s) specific configuration (only required when provider=nebius) - # Uses official Nebius Go SDK: github.com/nebius/gosdk - nebius: - # Path to service account key file (recommended for production) - # JSON format as documented at: https://github.com/nebius/gosdk#using-a-json-credentials-file - # Example: "/etc/nebius/sa-credentials.json" - # Set via NEBIUS_SA_KEY_FILE environment variable - serviceAccountKeyFile: "" - # Secret containing service account key (recommended for production with Kubernetes) - # The secret should contain the key file as 'sa-credentials.json' - # When set, automatically mounts at /etc/nebius and sets NEBIUS_SA_KEY_FILE - serviceAccountKeySecret: "" - # Direct IAM token (for testing only) - # Tokens expire and require manual refresh - not recommended for production - # Set via NEBIUS_IAM_TOKEN environment variable - # Obtain token: nebius iam get-access-token - iamToken: "" - # Reference to secret containing IAM token (alternative to iamToken value) - # Example: - # iamTokenSecretRef: - # name: nebius-janitor-token - # key: token - iamTokenSecretRef: {} - # Note: Nebius credentials are provided via environment variables: - # - NEBIUS_SA_KEY_FILE: Path to service account credentials JSON file (recommended) - # The service account must have compute.instances.stop and compute.instances.start permissions - # - NEBIUS_IAM_TOKEN: Direct IAM token (for testing only) - # Webhook Configuration webhook: # Port for the webhook server diff --git a/distros/kubernetes/nvsentinel/values-tilt.yaml b/distros/kubernetes/nvsentinel/values-tilt.yaml index ae18140d5..0856f8112 100755 --- a/distros/kubernetes/nvsentinel/values-tilt.yaml +++ b/distros/kubernetes/nvsentinel/values-tilt.yaml @@ -77,6 +77,9 @@ global: janitor: enabled: true + janitorProvider: + enabled: true + mongodbStore: enabled: true @@ -318,9 +321,14 @@ health-events-analyzer: topologyKey: kubernetes.io/hostname janitor: + config: + cspProviderHost: "janitor-provider.nvsentinel.svc.cluster.local:50051" webhook: certIssuer: "janitor-selfsigned-issuer" +janitorProvider: + csp: kind + labeler: logLevel: debug # Test kata label override with the annotation present on kata test nodes diff --git a/distros/kubernetes/nvsentinel/values.yaml 
b/distros/kubernetes/nvsentinel/values.yaml index 57a26581d..404e770f1 100644 --- a/distros/kubernetes/nvsentinel/values.yaml +++ b/distros/kubernetes/nvsentinel/values.yaml @@ -70,6 +70,8 @@ global: enabled: false janitor: enabled: false + janitorProvider: + enabled: false cspHealthMonitor: enabled: false syslogHealthMonitor: diff --git a/janitor-provider/Tiltfile b/janitor-provider/Tiltfile new file mode 100644 index 000000000..136c76afe --- /dev/null +++ b/janitor-provider/Tiltfile @@ -0,0 +1,21 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Build and deploy +custom_build( + 'ghcr.io/nvidia/nvsentinel/janitor-provider', + '../scripts/ko-tilt-build.sh . $EXPECTED_REF', + deps=['./', '../store-client'], + skips_local_docker=True +) diff --git a/janitor-provider/go.mod b/janitor-provider/go.mod new file mode 100644 index 000000000..745b4a506 --- /dev/null +++ b/janitor-provider/go.mod @@ -0,0 +1,127 @@ +module github.com/nvidia/nvsentinel/janitor-provider + +go 1.25.0 + +require ( + cloud.google.com/go/compute v1.52.0 + github.com/Azure/azure-sdk-for-go/sdk/azcore v1.20.0 + github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1 + github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute v1.0.0 + github.com/aws/aws-sdk-go-v2/config v1.32.6 + github.com/aws/aws-sdk-go-v2/service/ec2 v1.278.0 + github.com/nebius/gosdk v0.0.0-20251223093836-10eca9c65821 + github.com/nvidia/nvsentinel/api v0.0.0-00010101000000-000000000000 + github.com/nvidia/nvsentinel/commons v0.0.0 + github.com/oracle/oci-go-sdk/v65 v65.105.2 + github.com/stretchr/testify v1.11.1 + golang.org/x/oauth2 v0.34.0 + golang.org/x/sync v0.19.0 + google.golang.org/api v0.258.0 + google.golang.org/grpc v1.78.0 + k8s.io/api v0.35.0 + k8s.io/apimachinery v0.35.0 + k8s.io/client-go v0.35.0 +) + +require ( + buf.build/gen/go/bufbuild/protovalidate/protocolbuffers/go v1.36.4-20250130201111-63bb56e20495.1 // indirect + cloud.google.com/go/auth v0.17.0 // indirect + cloud.google.com/go/auth/oauth2adapt v0.2.8 // indirect + cloud.google.com/go/compute/metadata v0.9.0 // indirect + github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2 // indirect + github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0 // indirect + github.com/aws/aws-sdk-go-v2 v1.41.0 // indirect + github.com/aws/aws-sdk-go-v2/credentials v1.19.6 // indirect + github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.16 // indirect + github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.16 // indirect + github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.16 // indirect + github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.4 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.16 // indirect + github.com/aws/aws-sdk-go-v2/service/signin v1.0.4 // indirect + github.com/aws/aws-sdk-go-v2/service/sso v1.30.8 // indirect + github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.12 // 
indirect + github.com/aws/aws-sdk-go-v2/service/sts v1.41.5 // indirect + github.com/aws/smithy-go v1.24.0 // indirect + github.com/beorn7/perks v1.0.1 // indirect + github.com/cenkalti/backoff/v4 v4.3.0 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect + github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect + github.com/emicklei/go-restful/v3 v3.13.0 // indirect + github.com/felixge/httpsnoop v1.0.4 // indirect + github.com/fxamacker/cbor/v2 v2.9.0 // indirect + github.com/go-logr/logr v1.4.3 // indirect + github.com/go-logr/stdr v1.2.2 // indirect + github.com/go-openapi/jsonpointer v0.22.3 // indirect + github.com/go-openapi/jsonreference v0.21.3 // indirect + github.com/go-openapi/swag v0.25.4 // indirect + github.com/go-openapi/swag/cmdutils v0.25.4 // indirect + github.com/go-openapi/swag/conv v0.25.4 // indirect + github.com/go-openapi/swag/fileutils v0.25.4 // indirect + github.com/go-openapi/swag/jsonname v0.25.4 // indirect + github.com/go-openapi/swag/jsonutils v0.25.4 // indirect + github.com/go-openapi/swag/loading v0.25.4 // indirect + github.com/go-openapi/swag/mangling v0.25.4 // indirect + github.com/go-openapi/swag/netutils v0.25.4 // indirect + github.com/go-openapi/swag/stringutils v0.25.4 // indirect + github.com/go-openapi/swag/typeutils v0.25.4 // indirect + github.com/go-openapi/swag/yamlutils v0.25.4 // indirect + github.com/gofrs/flock v0.12.1 // indirect + github.com/golang-jwt/jwt/v4 v4.5.1 // indirect + github.com/golang-jwt/jwt/v5 v5.3.0 // indirect + github.com/google/gnostic-models v0.7.1 // indirect + github.com/google/go-cmp v0.7.0 // indirect + github.com/google/s2a-go v0.1.9 // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/googleapis/enterprise-certificate-proxy v0.3.7 // indirect + github.com/googleapis/gax-go/v2 v2.15.0 // indirect + github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.3.0 // indirect + github.com/json-iterator/go v1.1.12 // indirect + github.com/kylelemons/godebug v1.1.0 // indirect + github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect + github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c // indirect + github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect + github.com/prometheus/client_golang v1.23.2 // indirect + github.com/prometheus/client_model v0.6.2 // indirect + github.com/prometheus/common v0.67.4 // indirect + github.com/prometheus/procfs v0.19.2 // indirect + github.com/sony/gobreaker v0.5.0 // indirect + github.com/x448/float16 v0.8.4 // indirect + github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 // indirect + go.opentelemetry.io/auto/sdk v1.2.1 // indirect + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 // indirect + go.opentelemetry.io/otel v1.38.0 // indirect + go.opentelemetry.io/otel/metric v1.38.0 // indirect + go.opentelemetry.io/otel/trace v1.38.0 // indirect + go.yaml.in/yaml/v2 v2.4.3 // indirect + go.yaml.in/yaml/v3 v3.0.4 // indirect + golang.org/x/crypto v0.46.0 // indirect + golang.org/x/exp v0.0.0-20251125195548-87e1e737ad39 // indirect + golang.org/x/net v0.48.0 // indirect + golang.org/x/sys v0.39.0 // indirect + golang.org/x/term v0.38.0 // indirect + golang.org/x/text v0.32.0 // indirect + golang.org/x/time v0.14.0 // indirect + google.golang.org/genproto v0.0.0-20250603155806-513f23925822 // indirect + 
google.golang.org/genproto/googleapis/api v0.0.0-20251124214823-79d6a2a48846 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20251213004720-97cd9d5aeac2 // indirect + google.golang.org/protobuf v1.36.11 // indirect + gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect + gopkg.in/inf.v0 v0.9.1 // indirect + gopkg.in/natefinch/lumberjack.v2 v2.2.1 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect + k8s.io/klog/v2 v2.130.1 // indirect + k8s.io/kube-openapi v0.0.0-20251125145642-4e65d59e963e // indirect + k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 // indirect + sigs.k8s.io/controller-runtime v0.22.4 // indirect + sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect + sigs.k8s.io/randfill v1.0.0 // indirect + sigs.k8s.io/structured-merge-diff/v6 v6.3.1 // indirect + sigs.k8s.io/yaml v1.6.0 // indirect +) + +replace github.com/nvidia/nvsentinel/api => ../api + +replace github.com/nvidia/nvsentinel/commons => ../commons diff --git a/janitor-provider/go.sum b/janitor-provider/go.sum new file mode 100644 index 000000000..162323593 --- /dev/null +++ b/janitor-provider/go.sum @@ -0,0 +1,293 @@ +buf.build/gen/go/bufbuild/protovalidate/protocolbuffers/go v1.36.4-20250130201111-63bb56e20495.1 h1:4erM3WLgEG/HIBrpBDmRbs1puhd7p0z7kNXDuhHthwM= +buf.build/gen/go/bufbuild/protovalidate/protocolbuffers/go v1.36.4-20250130201111-63bb56e20495.1/go.mod h1:novQBstnxcGpfKf8qGRATqn1anQKwMJIbH5Q581jibU= +cloud.google.com/go v0.123.0 h1:2NAUJwPR47q+E35uaJeYoNhuNEM9kM8SjgRgdeOJUSE= +cloud.google.com/go v0.123.0/go.mod h1:xBoMV08QcqUGuPW65Qfm1o9Y4zKZBpGS+7bImXLTAZU= +cloud.google.com/go/auth v0.17.0 h1:74yCm7hCj2rUyyAocqnFzsAYXgJhrG26XCFimrc/Kz4= +cloud.google.com/go/auth v0.17.0/go.mod h1:6wv/t5/6rOPAX4fJiRjKkJCvswLwdet7G8+UGXt7nCQ= +cloud.google.com/go/auth/oauth2adapt v0.2.8 h1:keo8NaayQZ6wimpNSmW5OPc283g65QNIiLpZnkHRbnc= +cloud.google.com/go/auth/oauth2adapt v0.2.8/go.mod h1:XQ9y31RkqZCcwJWNSx2Xvric3RrU88hAYYbjDWYDL+c= +cloud.google.com/go/compute v1.52.0 h1:SxiTrTFR/QmDbZ0cfo/8VxK/NUZSwn92+iHLDSQ51Fo= +cloud.google.com/go/compute v1.52.0/go.mod h1:zdogTa7daHhEtEX92+S5IARtQmi/RNVPUfoI8Jhl8Do= +cloud.google.com/go/compute/metadata v0.9.0 h1:pDUj4QMoPejqq20dK0Pg2N4yG9zIkYGdBtwLoEkH9Zs= +cloud.google.com/go/compute/metadata v0.9.0/go.mod h1:E0bWwX5wTnLPedCKqk3pJmVgCBSM6qQI1yTBdEb3C10= +github.com/Azure/azure-sdk-for-go/sdk/azcore v1.20.0 h1:JXg2dwJUmPB9JmtVmdEB16APJ7jurfbY5jnfXpJoRMc= +github.com/Azure/azure-sdk-for-go/sdk/azcore v1.20.0/go.mod h1:YD5h/ldMsG0XiIw7PdyNhLxaM317eFh5yNLccNfGdyw= +github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1 h1:Hk5QBxZQC1jb2Fwj6mpzme37xbCDdNTxU7O9eb5+LB4= +github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1/go.mod h1:IYus9qsFobWIc2YVwe/WPjcnyCkPKtnHAqUYeebc8z0= +github.com/Azure/azure-sdk-for-go/sdk/azidentity/cache v0.3.2 h1:yz1bePFlP5Vws5+8ez6T3HWXPmwOK7Yvq8QxDBD3SKY= +github.com/Azure/azure-sdk-for-go/sdk/azidentity/cache v0.3.2/go.mod h1:Pa9ZNPuoNu/GztvBSKk9J1cDJW6vk/n0zLtV4mgd8N8= +github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2 h1:9iefClla7iYpfYWdzPCRDozdmndjTm8DXdpCzPajMgA= +github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2/go.mod h1:XtLgD3ZD34DAaVIIAyG3objl5DynM3CQ/vMcbBNJZGI= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute v1.0.0 h1:/Di3vB4sNeQ+7A8efjUVENvyB945Wruvstucqp7ZArg= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute v1.0.0/go.mod h1:gM3K25LQlsET3QR+4V74zxCsFAy0r6xMNN9n80SZn+4= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/internal v1.0.0 
h1:lMW1lD/17LUA5z1XTURo7LcVG2ICBPlyMHjIUrcFZNQ= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/internal v1.0.0/go.mod h1:ceIuwmxDWptoW3eCqSXlnPsZFKh4X+R38dWPv7GS9Vs= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork v1.0.0 h1:nBy98uKOIfun5z6wx6jwWLrULcM0+cjBalBFZlEZ7CA= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork v1.0.0/go.mod h1:243D9iHbcQXoFUtgHJwL7gl2zx1aDuDMjvBZVGr2uW0= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources v1.0.0 h1:ECsQtyERDVz3NP3kvDOTLvbQhqWp/x9EsGKtb4ogUr8= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources v1.0.0/go.mod h1:s1tW/At+xHqjNFvWU4G0c0Qv33KOhvbGNj0RCTQDV8s= +github.com/AzureAD/microsoft-authentication-extensions-for-go/cache v0.1.1 h1:WJTmL004Abzc5wDB5VtZG2PJk5ndYDgVacGqfirKxjM= +github.com/AzureAD/microsoft-authentication-extensions-for-go/cache v0.1.1/go.mod h1:tCcJZ0uHAmvjsVYzEFivsRTN00oz5BEsRgQHu5JZ9WE= +github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0 h1:XRzhVemXdgvJqCH0sFfrBUTnUJSBrBf7++ypk+twtRs= +github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0/go.mod h1:HKpQxkWaGLJ+D/5H8QRpyQXA1eKjxkFlOMwck5+33Jk= +github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0= +github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= +github.com/aws/aws-sdk-go-v2 v1.41.0 h1:tNvqh1s+v0vFYdA1xq0aOJH+Y5cRyZ5upu6roPgPKd4= +github.com/aws/aws-sdk-go-v2 v1.41.0/go.mod h1:MayyLB8y+buD9hZqkCW3kX1AKq07Y5pXxtgB+rRFhz0= +github.com/aws/aws-sdk-go-v2/config v1.32.6 h1:hFLBGUKjmLAekvi1evLi5hVvFQtSo3GYwi+Bx4lpJf8= +github.com/aws/aws-sdk-go-v2/config v1.32.6/go.mod h1:lcUL/gcd8WyjCrMnxez5OXkO3/rwcNmvfno62tnXNcI= +github.com/aws/aws-sdk-go-v2/credentials v1.19.6 h1:F9vWao2TwjV2MyiyVS+duza0NIRtAslgLUM0vTA1ZaE= +github.com/aws/aws-sdk-go-v2/credentials v1.19.6/go.mod h1:SgHzKjEVsdQr6Opor0ihgWtkWdfRAIwxYzSJ8O85VHY= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.16 h1:80+uETIWS1BqjnN9uJ0dBUaETh+P1XwFy5vwHwK5r9k= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.16/go.mod h1:wOOsYuxYuB/7FlnVtzeBYRcjSRtQpAW0hCP7tIULMwo= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.16 h1:rgGwPzb82iBYSvHMHXc8h9mRoOUBZIGFgKb9qniaZZc= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.16/go.mod h1:L/UxsGeKpGoIj6DxfhOWHWQ/kGKcd4I1VncE4++IyKA= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.16 h1:1jtGzuV7c82xnqOVfx2F0xmJcOw5374L7N6juGW6x6U= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.16/go.mod h1:M2E5OQf+XLe+SZGmmpaI2yy+J326aFf6/+54PoxSANc= +github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4 h1:WKuaxf++XKWlHWu9ECbMlha8WOEGm0OUEZqm4K/Gcfk= +github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4/go.mod h1:ZWy7j6v1vWGmPReu0iSGvRiise4YI5SkR3OHKTZ6Wuc= +github.com/aws/aws-sdk-go-v2/service/ec2 v1.278.0 h1:Cx/Rs2zaG30Dn4QMvUGC5rCAZagA8heta0TWAdBE/Xc= +github.com/aws/aws-sdk-go-v2/service/ec2 v1.278.0/go.mod h1:Wg68QRgy2gEGGdmTPU/UbVpdv8sM14bUZmF64KFwAsY= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.4 h1:0ryTNEdJbzUCEWkVXEXoqlXV72J5keC1GvILMOuD00E= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.4/go.mod h1:HQ4qwNZh32C3CBeO6iJLQlgtMzqeG17ziAA/3KDJFow= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.16 h1:oHjJHeUy0ImIV0bsrX0X91GkV5nJAyv1l1CC9lnO0TI= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.16/go.mod h1:iRSNGgOYmiYwSCXxXaKb9HfOEj40+oTKn8pTxMlYkRM= 
+github.com/aws/aws-sdk-go-v2/service/signin v1.0.4 h1:HpI7aMmJ+mm1wkSHIA2t5EaFFv5EFYXePW30p1EIrbQ= +github.com/aws/aws-sdk-go-v2/service/signin v1.0.4/go.mod h1:C5RdGMYGlfM0gYq/tifqgn4EbyX99V15P2V3R+VHbQU= +github.com/aws/aws-sdk-go-v2/service/sso v1.30.8 h1:aM/Q24rIlS3bRAhTyFurowU8A0SMyGDtEOY/l/s/1Uw= +github.com/aws/aws-sdk-go-v2/service/sso v1.30.8/go.mod h1:+fWt2UHSb4kS7Pu8y+BMBvJF0EWx+4H0hzNwtDNRTrg= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.12 h1:AHDr0DaHIAo8c9t1emrzAlVDFp+iMMKnPdYy6XO4MCE= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.12/go.mod h1:GQ73XawFFiWxyWXMHWfhiomvP3tXtdNar/fi8z18sx0= +github.com/aws/aws-sdk-go-v2/service/sts v1.41.5 h1:SciGFVNZ4mHdm7gpD1dgZYnCuVdX1s+lFTg4+4DOy70= +github.com/aws/aws-sdk-go-v2/service/sts v1.41.5/go.mod h1:iW40X4QBmUxdP+fZNOpfmkdMZqsovezbAeO+Ubiv2pk= +github.com/aws/smithy-go v1.24.0 h1:LpilSUItNPFr1eY85RYgTIg5eIEPtvFbskaFcmmIUnk= +github.com/aws/smithy-go v1.24.0/go.mod h1:LEj2LM3rBRQJxPZTB4KuzZkaZYnZPnvgIhb4pu07mx0= +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= +github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/emicklei/go-restful/v3 v3.13.0 h1:C4Bl2xDndpU6nJ4bc1jXd+uTmYPVUwkD6bFY/oTyCes= +github.com/emicklei/go-restful/v3 v3.13.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= +github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= +github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM= +github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= +github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= +github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/go-openapi/jsonpointer v0.22.3 h1:dKMwfV4fmt6Ah90zloTbUKWMD+0he+12XYAsPotrkn8= +github.com/go-openapi/jsonpointer v0.22.3/go.mod h1:0lBbqeRsQ5lIanv3LHZBrmRGHLHcQoOXQnf88fHlGWo= +github.com/go-openapi/jsonreference v0.21.3 h1:96Dn+MRPa0nYAR8DR1E03SblB5FJvh7W6krPI0Z7qMc= +github.com/go-openapi/jsonreference v0.21.3/go.mod h1:RqkUP0MrLf37HqxZxrIAtTWW4ZJIK1VzduhXYBEeGc4= +github.com/go-openapi/swag v0.25.4 h1:OyUPUFYDPDBMkqyxOTkqDYFnrhuhi9NR6QVUvIochMU= +github.com/go-openapi/swag v0.25.4/go.mod h1:zNfJ9WZABGHCFg2RnY0S4IOkAcVTzJ6z2Bi+Q4i6qFQ= +github.com/go-openapi/swag/cmdutils v0.25.4 h1:8rYhB5n6WawR192/BfUu2iVlxqVR9aRgGJP6WaBoW+4= 
+github.com/go-openapi/swag/cmdutils v0.25.4/go.mod h1:pdae/AFo6WxLl5L0rq87eRzVPm/XRHM3MoYgRMvG4A0= +github.com/go-openapi/swag/conv v0.25.4 h1:/Dd7p0LZXczgUcC/Ikm1+YqVzkEeCc9LnOWjfkpkfe4= +github.com/go-openapi/swag/conv v0.25.4/go.mod h1:3LXfie/lwoAv0NHoEuY1hjoFAYkvlqI/Bn5EQDD3PPU= +github.com/go-openapi/swag/fileutils v0.25.4 h1:2oI0XNW5y6UWZTC7vAxC8hmsK/tOkWXHJQH4lKjqw+Y= +github.com/go-openapi/swag/fileutils v0.25.4/go.mod h1:cdOT/PKbwcysVQ9Tpr0q20lQKH7MGhOEb6EwmHOirUk= +github.com/go-openapi/swag/jsonname v0.25.4 h1:bZH0+MsS03MbnwBXYhuTttMOqk+5KcQ9869Vye1bNHI= +github.com/go-openapi/swag/jsonname v0.25.4/go.mod h1:GPVEk9CWVhNvWhZgrnvRA6utbAltopbKwDu8mXNUMag= +github.com/go-openapi/swag/jsonutils v0.25.4 h1:VSchfbGhD4UTf4vCdR2F4TLBdLwHyUDTd1/q4i+jGZA= +github.com/go-openapi/swag/jsonutils v0.25.4/go.mod h1:7OYGXpvVFPn4PpaSdPHJBtF0iGnbEaTk8AvBkoWnaAY= +github.com/go-openapi/swag/jsonutils/fixtures_test v0.25.4 h1:IACsSvBhiNJwlDix7wq39SS2Fh7lUOCJRmx/4SN4sVo= +github.com/go-openapi/swag/jsonutils/fixtures_test v0.25.4/go.mod h1:Mt0Ost9l3cUzVv4OEZG+WSeoHwjWLnarzMePNDAOBiM= +github.com/go-openapi/swag/loading v0.25.4 h1:jN4MvLj0X6yhCDduRsxDDw1aHe+ZWoLjW+9ZQWIKn2s= +github.com/go-openapi/swag/loading v0.25.4/go.mod h1:rpUM1ZiyEP9+mNLIQUdMiD7dCETXvkkC30z53i+ftTE= +github.com/go-openapi/swag/mangling v0.25.4 h1:2b9kBJk9JvPgxr36V23FxJLdwBrpijI26Bx5JH4Hp48= +github.com/go-openapi/swag/mangling v0.25.4/go.mod h1:6dxwu6QyORHpIIApsdZgb6wBk/DPU15MdyYj/ikn0Hg= +github.com/go-openapi/swag/netutils v0.25.4 h1:Gqe6K71bGRb3ZQLusdI8p/y1KLgV4M/k+/HzVSqT8H0= +github.com/go-openapi/swag/netutils v0.25.4/go.mod h1:m2W8dtdaoX7oj9rEttLyTeEFFEBvnAx9qHd5nJEBzYg= +github.com/go-openapi/swag/stringutils v0.25.4 h1:O6dU1Rd8bej4HPA3/CLPciNBBDwZj9HiEpdVsb8B5A8= +github.com/go-openapi/swag/stringutils v0.25.4/go.mod h1:GTsRvhJW5xM5gkgiFe0fV3PUlFm0dr8vki6/VSRaZK0= +github.com/go-openapi/swag/typeutils v0.25.4 h1:1/fbZOUN472NTc39zpa+YGHn3jzHWhv42wAJSN91wRw= +github.com/go-openapi/swag/typeutils v0.25.4/go.mod h1:Ou7g//Wx8tTLS9vG0UmzfCsjZjKhpjxayRKTHXf2pTE= +github.com/go-openapi/swag/yamlutils v0.25.4 h1:6jdaeSItEUb7ioS9lFoCZ65Cne1/RZtPBZ9A56h92Sw= +github.com/go-openapi/swag/yamlutils v0.25.4/go.mod h1:MNzq1ulQu+yd8Kl7wPOut/YHAAU/H6hL91fF+E2RFwc= +github.com/go-openapi/testify/enable/yaml/v2 v2.0.2 h1:0+Y41Pz1NkbTHz8NngxTuAXxEodtNSI1WG1c/m5Akw4= +github.com/go-openapi/testify/enable/yaml/v2 v2.0.2/go.mod h1:kme83333GCtJQHXQ8UKX3IBZu6z8T5Dvy5+CW3NLUUg= +github.com/go-openapi/testify/v2 v2.0.2 h1:X999g3jeLcoY8qctY/c/Z8iBHTbwLz7R2WXd6Ub6wls= +github.com/go-openapi/testify/v2 v2.0.2/go.mod h1:HCPmvFFnheKK2BuwSA0TbbdxJ3I16pjwMkYkP4Ywn54= +github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= +github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= +github.com/gofrs/flock v0.12.1 h1:MTLVXXHf8ekldpJk3AKicLij9MdwOWkZ+a/jHHZby9E= +github.com/gofrs/flock v0.12.1/go.mod h1:9zxTsyu5xtJ9DK+1tFZyibEV7y3uwDxPPfbxeeHCoD0= +github.com/golang-jwt/jwt/v4 v4.5.1 h1:JdqV9zKUdtaa9gdPlywC3aeoEsR681PlKC+4F5gQgeo= +github.com/golang-jwt/jwt/v4 v4.5.1/go.mod h1:m21LjoU+eqJr34lmDMbreY2eSTRJ1cv77w39/MY0Ch0= +github.com/golang-jwt/jwt/v5 v5.3.0 h1:pv4AsKCKKZuqlgs5sUmn4x8UlGa0kEVt/puTpKx9vvo= +github.com/golang-jwt/jwt/v5 v5.3.0/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= 
+github.com/google/gnostic-models v0.7.1 h1:SisTfuFKJSKM5CPZkffwi6coztzzeYUhc3v4yxLWH8c= +github.com/google/gnostic-models v0.7.1/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7OUGxBlw57miDrQ= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/pprof v0.0.0-20251114195745-4902fdda35c8 h1:3DsUAV+VNEQa2CUVLxCY3f87278uWfIDhJnbdvDjvmE= +github.com/google/pprof v0.0.0-20251114195745-4902fdda35c8/go.mod h1:I6V7YzU0XDpsHqbsyrghnFZLO1gwK6NPTNvmetQIk9U= +github.com/google/s2a-go v0.1.9 h1:LGD7gtMgezd8a/Xak7mEWL0PjoTQFvpRudN895yqKW0= +github.com/google/s2a-go v0.1.9/go.mod h1:YA0Ei2ZQL3acow2O62kdp9UlnvMmU7kA6Eutn0dXayM= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/googleapis/enterprise-certificate-proxy v0.3.7 h1:zrn2Ee/nWmHulBx5sAVrGgAa0f2/R35S4DJwfFaUPFQ= +github.com/googleapis/enterprise-certificate-proxy v0.3.7/go.mod h1:MkHOF77EYAE7qfSuSS9PU6g4Nt4e11cnsDUowfwewLA= +github.com/googleapis/gax-go/v2 v2.15.0 h1:SyjDc1mGgZU5LncH8gimWo9lW1DtIfPibOG81vgd/bo= +github.com/googleapis/gax-go/v2 v2.15.0/go.mod h1:zVVkkxAQHa1RQpg9z2AUCMnKhi0Qld9rcmyfL1OZhoc= +github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.3.0 h1:FbSCl+KggFl+Ocym490i/EyXF4lPgLoUtcSWquBM0Rs= +github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.3.0/go.mod h1:qOchhhIlmRcqk/O9uCo/puJlyo07YINaIqdZfZG3Jkc= +github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= +github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= +github.com/keybase/go-keychain v0.0.1 h1:way+bWYa6lDppZoZcgMbYsvC7GxljxrskdNInRtuthU= +github.com/keybase/go-keychain v0.0.1/go.mod h1:PdEILRW3i9D8JcdM+FmY6RwkHGnhHxXwkPPMeUgOK1k= +github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= +github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= +github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= +github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFdJifH4BDsTlE89Zl93FEloxaWZfGcifgq8= +github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod 
h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/nebius/gosdk v0.0.0-20251223093836-10eca9c65821 h1:t5+gjlDh2BmUQm9s+HFl+aawdW7XD53AylBYnfO1Qwc= +github.com/nebius/gosdk v0.0.0-20251223093836-10eca9c65821/go.mod h1:8r4EhhGJ+RMUfdiVVpZ8pEb0b+O7hLG8JXDAgGyu89o= +github.com/onsi/ginkgo/v2 v2.27.2 h1:LzwLj0b89qtIy6SSASkzlNvX6WktqurSHwkk2ipF/Ns= +github.com/onsi/ginkgo/v2 v2.27.2/go.mod h1:ArE1D/XhNXBXCBkKOLkbsb2c81dQHCRcF5zwn/ykDRo= +github.com/onsi/gomega v1.38.2 h1:eZCjf2xjZAqe+LeWvKb5weQ+NcPwX84kqJ0cZNxok2A= +github.com/onsi/gomega v1.38.2/go.mod h1:W2MJcYxRGV63b418Ai34Ud0hEdTVXq9NW9+Sx6uXf3k= +github.com/oracle/oci-go-sdk/v65 v65.105.2 h1:AvZ59xNCGy/b4QT8j2HzIbE75K2nxYGeNirj7wX1XUw= +github.com/oracle/oci-go-sdk/v65 v65.105.2/go.mod h1:8ZzvzuEG/cFLFZhxg/Mg1w19KqyXBKO3c17QIc5PkGs= +github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c h1:+mdjkGKdHQG3305AYmdv1U2eRNDiU2ErMBj1gwrq8eQ= +github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c/go.mod h1:7rwL4CYBLnjLxUqIJNnCWiEdr3bn6IUYi15bNlnbCCU= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= +github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= +github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= +github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= +github.com/prometheus/common v0.67.4 h1:yR3NqWO1/UyO1w2PhUvXlGQs/PtFmoveVO0KZ4+Lvsc= +github.com/prometheus/common v0.67.4/go.mod h1:gP0fq6YjjNCLssJCQp0yk4M8W6ikLURwkdd/YKtTbyI= +github.com/prometheus/procfs v0.19.2 h1:zUMhqEW66Ex7OXIiDkll3tl9a1ZdilUOd/F6ZXw4Vws= +github.com/prometheus/procfs v0.19.2/go.mod h1:M0aotyiemPhBCM0z5w87kL22CxfcH05ZpYlu+b4J7mw= +github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= +github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= +github.com/sony/gobreaker v0.5.0 h1:dRCvqm0P490vZPmy7ppEk2qCnCieBooFJ+YoXGYB+yg= +github.com/sony/gobreaker v0.5.0/go.mod h1:ZKptC7FHNvhBz7dN2LGjPVBz2sZJmc0/PkyDJOjmxWY= +github.com/spf13/pflag v1.0.10 h1:4EBh2KAYBwaONj6b2Ye1GiHfwjqyROoF4RwYO+vPwFk= +github.com/spf13/pflag v1.0.10/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= +github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= +github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= +github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 h1:ilQV1hzziu+LLM3zUTJ0trRztfwgjqKnBWNtSRkbmwM= +github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78/go.mod h1:aL8wCCfTfSfmXjznFBSZNN13rSJjlIOI1fUNAtF7rmI= 
+go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= +go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 h1:F7Jx+6hwnZ41NSFTO5q4LYDtJRXBf2PD0rNBkeB/lus= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0/go.mod h1:UHB22Z8QsdRDrnAtX4PntOl36ajSxcdUMt1sF7Y6E7Q= +go.opentelemetry.io/otel v1.38.0 h1:RkfdswUDRimDg0m2Az18RKOsnI8UDzppJAtj01/Ymk8= +go.opentelemetry.io/otel v1.38.0/go.mod h1:zcmtmQ1+YmQM9wrNsTGV/q/uyusom3P8RxwExxkZhjM= +go.opentelemetry.io/otel/metric v1.38.0 h1:Kl6lzIYGAh5M159u9NgiRkmoMKjvbsKtYRwgfrA6WpA= +go.opentelemetry.io/otel/metric v1.38.0/go.mod h1:kB5n/QoRM8YwmUahxvI3bO34eVtQf2i4utNVLr9gEmI= +go.opentelemetry.io/otel/sdk v1.38.0 h1:l48sr5YbNf2hpCUj/FoGhW9yDkl+Ma+LrVl8qaM5b+E= +go.opentelemetry.io/otel/sdk v1.38.0/go.mod h1:ghmNdGlVemJI3+ZB5iDEuk4bWA3GkTpW+DOoZMYBVVg= +go.opentelemetry.io/otel/sdk/metric v1.38.0 h1:aSH66iL0aZqo//xXzQLYozmWrXxyFkBJ6qT5wthqPoM= +go.opentelemetry.io/otel/sdk/metric v1.38.0/go.mod h1:dg9PBnW9XdQ1Hd6ZnRz689CbtrUp0wMMs9iPcgT9EZA= +go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJrmcNLE= +go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= +go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= +go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8= +go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= +go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= +golang.org/x/crypto v0.46.0 h1:cKRW/pmt1pKAfetfu+RCEvjvZkA9RimPbh7bhFjGVBU= +golang.org/x/crypto v0.46.0/go.mod h1:Evb/oLKmMraqjZ2iQTwDwvCtJkczlDuTmdJXoZVzqU0= +golang.org/x/exp v0.0.0-20251125195548-87e1e737ad39 h1:DHNhtq3sNNzrvduZZIiFyXWOL9IWaDPHqTnLJp+rCBY= +golang.org/x/exp v0.0.0-20251125195548-87e1e737ad39/go.mod h1:46edojNIoXTNOhySWIWdix628clX9ODXwPsQuG6hsK0= +golang.org/x/mod v0.30.0 h1:fDEXFVZ/fmCKProc/yAXXUijritrDzahmwwefnjoPFk= +golang.org/x/mod v0.30.0/go.mod h1:lAsf5O2EvJeSFMiBxXDki7sCgAxEUcZHXoXMKT4GJKc= +golang.org/x/net v0.48.0 h1:zyQRTTrjc33Lhh0fBgT/H3oZq9WuvRR5gPC70xpDiQU= +golang.org/x/net v0.48.0/go.mod h1:+ndRgGjkh8FGtu1w1FGbEC31if4VrNVMuKTgcAAnQRY= +golang.org/x/oauth2 v0.34.0 h1:hqK/t4AKgbqWkdkcAeI8XLmbK+4m4G5YeQRrmiotGlw= +golang.org/x/oauth2 v0.34.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= +golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4= +golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= +golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.39.0 h1:CvCKL8MeisomCi6qNZ+wbb0DN9E5AATixKsvNtMoMFk= +golang.org/x/sys v0.39.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/term v0.38.0 h1:PQ5pkm/rLO6HnxFR7N2lJHOZX6Kez5Y1gDSJla6jo7Q= +golang.org/x/term v0.38.0/go.mod h1:bSEAKrOT1W+VSu9TSCMtoGEOUcKxOKgl3LE5QEF/xVg= +golang.org/x/text v0.32.0 h1:ZD01bjUt1FQ9WJ0ClOL5vxgxOI/sVCNgX1YtKwcY0mU= +golang.org/x/text v0.32.0/go.mod h1:o/rUWzghvpD5TXrTIBuJU77MTaN0ljMWE47kxGJQ7jY= +golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI= +golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4= +golang.org/x/tools v0.39.0 
h1:ik4ho21kwuQln40uelmciQPp9SipgNDdrafrYA4TmQQ= +golang.org/x/tools v0.39.0/go.mod h1:JnefbkDPyD8UU2kI5fuf8ZX4/yUeh9W877ZeBONxUqQ= +gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= +gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= +google.golang.org/api v0.258.0 h1:IKo1j5FBlN74fe5isA2PVozN3Y5pwNKriEgAXPOkDAc= +google.golang.org/api v0.258.0/go.mod h1:qhOMTQEZ6lUps63ZNq9jhODswwjkjYYguA7fA3TBFww= +google.golang.org/genproto v0.0.0-20250603155806-513f23925822 h1:rHWScKit0gvAPuOnu87KpaYtjK5zBMLcULh7gxkCXu4= +google.golang.org/genproto v0.0.0-20250603155806-513f23925822/go.mod h1:HubltRL7rMh0LfnQPkMH4NPDFEWp0jw3vixw7jEM53s= +google.golang.org/genproto/googleapis/api v0.0.0-20251124214823-79d6a2a48846 h1:ZdyUkS9po3H7G0tuh955QVyyotWvOD4W0aEapeGeUYk= +google.golang.org/genproto/googleapis/api v0.0.0-20251124214823-79d6a2a48846/go.mod h1:Fk4kyraUvqD7i5H6S43sj2W98fbZa75lpZz/eUyhfO0= +google.golang.org/genproto/googleapis/rpc v0.0.0-20251213004720-97cd9d5aeac2 h1:2I6GHUeJ/4shcDpoUlLs/2WPnhg7yJwvXtqcMJt9liA= +google.golang.org/genproto/googleapis/rpc v0.0.0-20251213004720-97cd9d5aeac2/go.mod h1:7i2o+ce6H/6BluujYR+kqX3GKH+dChPTQU19wjRPiGk= +google.golang.org/grpc v1.78.0 h1:K1XZG/yGDJnzMdd/uZHAkVqJE+xIDOcmdSFZkBUicNc= +google.golang.org/grpc v1.78.0/go.mod h1:I47qjTo4OKbMkjA/aOOwxDIiPSBofUtQUI5EfpWvW7U= +google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= +google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/evanphx/json-patch.v4 v4.13.0 h1:czT3CmqEaQ1aanPc5SdlgQrrEIb8w/wwCvWWnfEbYzo= +gopkg.in/evanphx/json-patch.v4 v4.13.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= +gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= +gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= +gopkg.in/natefinch/lumberjack.v2 v2.2.1 h1:bBRl1b0OH9s/DuPhuXpNl+VtCaJXFZ5/uEFST95x9zc= +gopkg.in/natefinch/lumberjack.v2 v2.2.1/go.mod h1:YD8tP3GAjkrDg1eZH7EGmyESg/lsYskCTPBJVb9jqSc= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +k8s.io/api v0.35.0 h1:iBAU5LTyBI9vw3L5glmat1njFK34srdLmktWwLTprlY= +k8s.io/api v0.35.0/go.mod h1:AQ0SNTzm4ZAczM03QH42c7l3bih1TbAXYo0DkF8ktnA= +k8s.io/apimachinery v0.35.0 h1:Z2L3IHvPVv/MJ7xRxHEtk6GoJElaAqDCCU0S6ncYok8= +k8s.io/apimachinery v0.35.0/go.mod h1:jQCgFZFR1F4Ik7hvr2g84RTJSZegBc8yHgFWKn//hns= +k8s.io/client-go v0.35.0 h1:IAW0ifFbfQQwQmga0UdoH0yvdqrbwMdq9vIFEhRpxBE= +k8s.io/client-go v0.35.0/go.mod h1:q2E5AAyqcbeLGPdoRB+Nxe3KYTfPce1Dnu1myQdqz9o= +k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= +k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= +k8s.io/kube-openapi v0.0.0-20251125145642-4e65d59e963e h1:iW9ChlU0cU16w8MpVYjXk12dqQ4BPFBEgif+ap7/hqQ= +k8s.io/kube-openapi v0.0.0-20251125145642-4e65d59e963e/go.mod h1:kdmbQkyfwUagLfXIad1y2TdrjPFWp2Q89B3qkRwf/pQ= +k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 h1:SjGebBtkBqHFOli+05xYbK8YF1Dzkbzn+gDM4X9T4Ck= +k8s.io/utils v0.0.0-20251002143259-bc988d571ff4/go.mod 
h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +sigs.k8s.io/controller-runtime v0.22.4 h1:GEjV7KV3TY8e+tJ2LCTxUTanW4z/FmNB7l327UfMq9A= +sigs.k8s.io/controller-runtime v0.22.4/go.mod h1:+QX1XUpTXN4mLoblf4tqr5CQcyHPAki2HLXqQMY6vh8= +sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg= +sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= +sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= +sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= +sigs.k8s.io/structured-merge-diff/v6 v6.3.1 h1:JrhdFMqOd/+3ByqlP2I45kTOZmTRLBUm5pvRjeheg7E= +sigs.k8s.io/structured-merge-diff/v6 v6.3.1/go.mod h1:M3W8sfWvn2HhQDIbGWj3S099YozAsymCo/wrT5ohRUE= +sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs= +sigs.k8s.io/yaml v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4= diff --git a/janitor-provider/main.go b/janitor-provider/main.go new file mode 100644 index 000000000..ef091c876 --- /dev/null +++ b/janitor-provider/main.go @@ -0,0 +1,187 @@ +// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package main implements the janitor-provider gRPC service that provides +// cloud service provider operations for node lifecycle management including +// reboot signals, readiness checks, and termination signals. 
+package main + +import ( + "context" + "fmt" + "log/slog" + "net" + "os" + "os/signal" + "strconv" + "syscall" + + "golang.org/x/sync/errgroup" + "google.golang.org/grpc" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + + cspv1alpha1 "github.com/nvidia/nvsentinel/api/gen/go/csp/v1alpha1" + "github.com/nvidia/nvsentinel/commons/pkg/logger" + "github.com/nvidia/nvsentinel/commons/pkg/server" + "github.com/nvidia/nvsentinel/janitor-provider/pkg/csp" + "github.com/nvidia/nvsentinel/janitor-provider/pkg/model" +) + +var ( + // These variables will be populated during the build process + version = "dev" + commit = "none" + date = "unknown" +) + +type janitorProviderServer struct { + cspv1alpha1.UnimplementedCSPProviderServiceServer + cspClient model.CSPClient + k8sClient kubernetes.Interface +} + +func (s *janitorProviderServer) SendRebootSignal(ctx context.Context, req *cspv1alpha1.SendRebootSignalRequest) (*cspv1alpha1.SendRebootSignalResponse, error) { + slog.Info("Sending reboot signal", "node", req.NodeName) + node, err := s.k8sClient.CoreV1().Nodes().Get(ctx, req.NodeName, metav1.GetOptions{}) + if err != nil { + return nil, status.Errorf(codes.Internal, "failed to get node: %v", err) + } + requestID, err := s.cspClient.SendRebootSignal(ctx, *node) + if err != nil { + return nil, status.Errorf(codes.Internal, "failed to send reboot signal: %v", err) + } + return &cspv1alpha1.SendRebootSignalResponse{ + RequestId: string(requestID), + }, nil +} + +func (s *janitorProviderServer) IsNodeReady(ctx context.Context, req *cspv1alpha1.IsNodeReadyRequest) (*cspv1alpha1.IsNodeReadyResponse, error) { + slog.Info("Checking if node is ready", "node", req.NodeName) + node, err := s.k8sClient.CoreV1().Nodes().Get(ctx, req.NodeName, metav1.GetOptions{}) + if err != nil { + return nil, status.Errorf(codes.Internal, "failed to get node: %v", err) + } + isReady, err := s.cspClient.IsNodeReady(ctx, *node, req.RequestId) + if err != nil { + return nil, status.Errorf(codes.Internal, "failed to check if node is ready: %v", err) + } + return &cspv1alpha1.IsNodeReadyResponse{ + IsReady: isReady, + }, nil +} + +func (s *janitorProviderServer) SendTerminateSignal(ctx context.Context, req *cspv1alpha1.SendTerminateSignalRequest) (*cspv1alpha1.SendTerminateSignalResponse, error) { + slog.Info("Sending terminate signal", "node", req.NodeName) + node, err := s.k8sClient.CoreV1().Nodes().Get(ctx, req.NodeName, metav1.GetOptions{}) + if err != nil { + return nil, status.Errorf(codes.Internal, "failed to get node: %v", err) + } + requestID, err := s.cspClient.SendTerminateSignal(ctx, *node) + if err != nil { + return nil, status.Errorf(codes.Internal, "failed to send terminate signal: %v", err) + } + return &cspv1alpha1.SendTerminateSignalResponse{ + RequestId: string(requestID), + }, nil +} + +func main() { + logger.SetDefaultStructuredLogger("janitor-provider", version) + slog.Info("Starting janitor-provider", "version", version, "commit", commit, "date", date) + + if err := run(); err != nil { + slog.Error("Failed to run", "error", err) + os.Exit(1) + } +} + +func run() error { + ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) + defer stop() + + lis, err := net.Listen("tcp", fmt.Sprintf(":%s", os.Getenv("JANITOR_PROVIDER_PORT"))) + if err != nil { + return fmt.Errorf("failed to listen: %w", err) + } + + k8sRestConfig, err := rest.InClusterConfig() + if 
err != nil { + return fmt.Errorf("failed to create kubernetes clientset: %w", err) + } + + k8sClient, err := kubernetes.NewForConfig(k8sRestConfig) + if err != nil { + return fmt.Errorf("failed to create kubernetes client: %w", err) + } + + metricsPort, err := strconv.Atoi(os.Getenv("METRICS_PORT")) + if err != nil { + return fmt.Errorf("failed to convert metrics port to int: %w", err) + } + + srv := server.NewServer( + server.WithPort(metricsPort), + server.WithPrometheusMetrics(), + server.WithSimpleHealth(), + ) + + cspClient, err := csp.New(ctx) + if err != nil { + return fmt.Errorf("failed to create csp client: %w", err) + } + + svr := grpc.NewServer() + cspv1alpha1.RegisterCSPProviderServiceServer(svr, &janitorProviderServer{ + cspClient: cspClient, + k8sClient: k8sClient, + }) + + g, gCtx := errgroup.WithContext(ctx) + + // Metrics server failures are logged but do NOT terminate the service + g.Go(func() error { + slog.Info("Starting metrics server", "port", metricsPort) + + if err := srv.Serve(gCtx); err != nil { + slog.Error("Metrics server failed - continuing without metrics", "error", err) + } + + return nil + }) + + g.Go(func() error { + slog.Info("Starting gRPC server", "port", os.Getenv("JANITOR_PROVIDER_PORT")) + + if err := svr.Serve(lis); err != nil { + return fmt.Errorf("failed to serve gRPC: %w", err) + } + + return nil + }) + + // Graceful shutdown on context cancellation + g.Go(func() error { + <-gCtx.Done() + slog.Info("Shutting down gRPC server") + svr.GracefulStop() + + return nil + }) + + return g.Wait() +} diff --git a/janitor/pkg/csp/aws/aws.go b/janitor-provider/pkg/csp/aws/aws.go similarity index 90% rename from janitor/pkg/csp/aws/aws.go rename to janitor-provider/pkg/csp/aws/aws.go index 9ca2ae111..456fc0698 100644 --- a/janitor/pkg/csp/aws/aws.go +++ b/janitor-provider/pkg/csp/aws/aws.go @@ -17,6 +17,7 @@ package aws import ( "context" "fmt" + "log/slog" "net/http" "os" "strings" @@ -25,10 +26,9 @@ import ( "github.com/aws/aws-sdk-go-v2/config" "github.com/aws/aws-sdk-go-v2/service/ec2" corev1 "k8s.io/api/core/v1" - "sigs.k8s.io/controller-runtime/pkg/log" "github.com/nvidia/nvsentinel/commons/pkg/auditlogger" - "github.com/nvidia/nvsentinel/janitor/pkg/model" + "github.com/nvidia/nvsentinel/janitor-provider/pkg/model" ) var ( @@ -97,13 +97,11 @@ func WithEC2Client(ctx context.Context) ClientOptionFunc { // SendRebootSignal sends a reboot signal to AWS EC2 for the given node. func (c *Client) SendRebootSignal(ctx context.Context, node corev1.Node) (model.ResetSignalRequestRef, error) { - logger := log.FromContext(ctx) - // Fetch the node's provider ID providerID := node.Spec.ProviderID if providerID == "" { err := fmt.Errorf("no provider ID found for node %s", node.Name) - logger.Error(err, "Failed to reboot node") + slog.Error("Failed to reboot node", "error", err) return "", err } @@ -111,19 +109,19 @@ func (c *Client) SendRebootSignal(ctx context.Context, node corev1.Node) (model. 
// Extract the instance ID from the provider ID instanceID, err := parseAWSProviderID(providerID) if err != nil { - logger.Error(err, "Failed to parse provider ID") + slog.Error("Failed to parse provider ID", "error", err) return "", err } // Reboot the EC2 instance - logger.Info(fmt.Sprintf("Rebooting node %s (Instance ID: %s)", node.Name, instanceID)) + slog.Info("Rebooting node", "node", node.Name, "instanceID", instanceID) _, err = c.ec2.RebootInstances(ctx, &ec2.RebootInstancesInput{ InstanceIds: []string{instanceID}, }) if err != nil { - logger.Error(err, fmt.Sprintf("Failed to reboot instance %s: %s", instanceID, err)) + slog.Error("Failed to reboot instance", "error", err, "instanceID", instanceID) return "", err } @@ -133,11 +131,11 @@ func (c *Client) SendRebootSignal(ctx context.Context, node corev1.Node) (model. // IsNodeReady checks if the node is ready after a reboot signal was sent. // AWS requires a 5-minute cooldown period before the node status is reliable. -func (c *Client) IsNodeReady(ctx context.Context, node corev1.Node, message string) (bool, error) { +func (c *Client) IsNodeReady(ctx context.Context, node corev1.Node, requestID string) (bool, error) { // Sending a reboot request to AWS doesn't update statuses immediately, // the ec2 instance does not report that it isn't in a running state for some time // and kubernetes still sees the node as ready. Wait five minutes before checking the status - storedTime, err := time.Parse(time.RFC3339, message) + storedTime, err := time.Parse(time.RFC3339, requestID) if err != nil { return false, err } diff --git a/janitor/pkg/csp/azure/azure.go b/janitor-provider/pkg/csp/azure/azure.go similarity index 87% rename from janitor/pkg/csp/azure/azure.go rename to janitor-provider/pkg/csp/azure/azure.go index d434b6ba5..6e39e61bd 100755 --- a/janitor/pkg/csp/azure/azure.go +++ b/janitor-provider/pkg/csp/azure/azure.go @@ -18,6 +18,7 @@ import ( "context" "encoding/json" "fmt" + "log/slog" "net/http" "os" "strings" @@ -29,10 +30,9 @@ import ( "github.com/Azure/azure-sdk-for-go/sdk/azidentity" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute" corev1 "k8s.io/api/core/v1" - "sigs.k8s.io/controller-runtime/pkg/log" "github.com/nvidia/nvsentinel/commons/pkg/auditlogger" - "github.com/nvidia/nvsentinel/janitor/pkg/model" + "github.com/nvidia/nvsentinel/janitor-provider/pkg/model" ) var ( @@ -72,12 +72,10 @@ func NewClient(ctx context.Context) (*Client, error) { // SendRebootSignal sends a reboot signal to Azure for the node. func (c *Client) SendRebootSignal(ctx context.Context, node corev1.Node) (model.ResetSignalRequestRef, error) { - logger := log.FromContext(ctx) - // Get the Azure client vmssClient, err := c.getVMSSClient(ctx) if err != nil { - logger.Error(err, "Failed to create Azure client") + slog.Error("Failed to create Azure client", "error", err) return "", err } @@ -85,7 +83,7 @@ func (c *Client) SendRebootSignal(ctx context.Context, node corev1.Node) (model. providerID := node.Spec.ProviderID if providerID == "" { err := fmt.Errorf("no provider ID found for node %s", node.Name) - logger.Error(err, "Failed to reboot node") + slog.Error("Failed to reboot node", "error", err) return "", err } @@ -93,14 +91,14 @@ func (c *Client) SendRebootSignal(ctx context.Context, node corev1.Node) (model. 
// Extract the resource group and VM name from the provider ID resourceGroup, vmName, instanceID, err := parseAzureProviderID(providerID) if err != nil { - logger.Error(err, "Failed to parse provider ID") + slog.Error("Failed to parse provider ID", "error", err) return "", err } // Reboot the VM _, err = vmssClient.BeginRestart(ctx, resourceGroup, vmName, instanceID, nil) if err != nil { - logger.Error(err, fmt.Sprintf("Failed to send restart signal to node %s: %s", vmName, err)) + slog.Error("Failed to send restart signal to node", "error", err, "node", vmName) return "", err } @@ -108,11 +106,9 @@ func (c *Client) SendRebootSignal(ctx context.Context, node corev1.Node) (model. } // IsNodeReady checks if the node is ready after a reboot operation. -func (c *Client) IsNodeReady(ctx context.Context, node corev1.Node, message string) (bool, error) { - logger := log.FromContext(ctx) - +func (c *Client) IsNodeReady(ctx context.Context, node corev1.Node, requestID string) (bool, error) { // don't check too early, wait like 5 minutes before checking, return not ready if too early - storedTime, err := time.Parse(time.RFC3339, message) + storedTime, err := time.Parse(time.RFC3339, requestID) if err != nil { return false, err } @@ -125,7 +121,7 @@ func (c *Client) IsNodeReady(ctx context.Context, node corev1.Node, message stri providerID := node.Spec.ProviderID if providerID == "" { err := fmt.Errorf("no provider ID found for node %s", node.Name) - logger.Error(err, "Failed to reboot node") + slog.Error("Failed to reboot node", "error", err) return false, err } @@ -133,27 +129,27 @@ func (c *Client) IsNodeReady(ctx context.Context, node corev1.Node, message stri // Extract the resource group and VM name from the provider ID resourceGroup, vmName, instanceID, err := parseAzureProviderID(providerID) if err != nil { - logger.Error(err, "Failed to parse provider ID") + slog.Error("Failed to parse provider ID", "error", err) return false, err } // Get the Azure client vmssClient, err := c.getVMSSClient(ctx) if err != nil { - logger.Error(err, "Failed to create Azure client") + slog.Error("Failed to create Azure client", "error", err) return false, err } instanceView, err := vmssClient.GetInstanceView(ctx, resourceGroup, vmName, instanceID, nil) if err != nil { - logger.Error(err, fmt.Sprintf("Failed to get instance view for VM %s: %s", vmName, err)) + slog.Error("Failed to get instance view for VM", "error", err, "node", vmName) return false, err } if instanceView.Statuses != nil { for _, status := range instanceView.Statuses { if *status.Code == "ProvisioningState/succeeded" { - logger.Info(fmt.Sprintf("Node %s is in a healthy state", node.Name)) + slog.Info("Node is in a healthy state", "node", node.Name) return true, nil } } @@ -195,8 +191,6 @@ func (c *Client) getVMSSClient(ctx context.Context) (VMSSClientInterface, error) } func createDefaultVMSSClient(ctx context.Context) (VMSSClientInterface, error) { - logger := log.FromContext(ctx) - // Get the Azure subscription ID from environment variable or IMDS subscriptionID, err := getSubscriptionID(ctx) if err != nil { @@ -205,7 +199,7 @@ func createDefaultVMSSClient(ctx context.Context) (VMSSClientInterface, error) { cred, err := azidentity.NewDefaultAzureCredential(nil) if err != nil { - logger.Error(err, "Failed to create Azure credential") + slog.Error("Failed to create Azure credential", "error", err) return nil, err } @@ -217,7 +211,7 @@ func createDefaultVMSSClient(ctx context.Context) (VMSSClientInterface, error) { }, }) if err != nil {
- logger.Error(err, "Failed to create Azure client") + slog.Error("Failed to create Azure client", "error", err) return nil, err } @@ -225,8 +219,6 @@ func createDefaultVMSSClient(ctx context.Context) (VMSSClientInterface, error) { } func getSubscriptionID(ctx context.Context) (string, error) { - logger := log.FromContext(ctx) - if os.Getenv("LOCAL") == "true" { subscriptionID := os.Getenv("AZURE_SUBSCRIPTION_ID") if subscriptionID == "" { @@ -263,7 +255,7 @@ func getSubscriptionID(ctx context.Context) (string, error) { defer func() { if cerr := resp.Body.Close(); cerr != nil { - logger.Error(cerr, "failed to close http client") + slog.Error("failed to close IMDS response body", "error", cerr) } }() diff --git a/janitor/pkg/csp/client.go b/janitor-provider/pkg/csp/client.go similarity index 81% rename from janitor/pkg/csp/client.go rename to janitor-provider/pkg/csp/client.go index eb8d422d4..756da92bd 100644 --- a/janitor/pkg/csp/client.go +++ b/janitor-provider/pkg/csp/client.go @@ -17,18 +17,17 @@ package csp import ( "context" "fmt" + "log/slog" "os" "strings" - "sigs.k8s.io/controller-runtime/pkg/log" - - "github.com/nvidia/nvsentinel/janitor/pkg/csp/aws" - "github.com/nvidia/nvsentinel/janitor/pkg/csp/azure" - "github.com/nvidia/nvsentinel/janitor/pkg/csp/gcp" - "github.com/nvidia/nvsentinel/janitor/pkg/csp/kind" - "github.com/nvidia/nvsentinel/janitor/pkg/csp/nebius" - "github.com/nvidia/nvsentinel/janitor/pkg/csp/oci" - "github.com/nvidia/nvsentinel/janitor/pkg/model" + "github.com/nvidia/nvsentinel/janitor-provider/pkg/csp/aws" + "github.com/nvidia/nvsentinel/janitor-provider/pkg/csp/azure" + "github.com/nvidia/nvsentinel/janitor-provider/pkg/csp/gcp" + "github.com/nvidia/nvsentinel/janitor-provider/pkg/csp/kind" + "github.com/nvidia/nvsentinel/janitor-provider/pkg/csp/nebius" + "github.com/nvidia/nvsentinel/janitor-provider/pkg/csp/oci" + "github.com/nvidia/nvsentinel/janitor-provider/pkg/model" ) const ( @@ -45,27 +44,25 @@ type Provider string // New creates a new CSP client based on the provider type from environment variables func New(ctx context.Context) (model.CSPClient, error) { - logger := log.FromContext(ctx) - provider, err := GetProviderFromEnv() if err != nil { - logger.Error(err, "failed to determine CSP provider from environment") + slog.Error("Failed to determine CSP provider from environment", "error", err) return nil, err } - logger.Info("initializing CSP client", + slog.Info("initializing CSP client", "provider", string(provider)) client, err := NewWithProvider(ctx, provider) if err != nil { - logger.Error(err, "failed to create CSP client", + slog.Error("Failed to create CSP client", "error", err, "provider", string(provider)) return nil, fmt.Errorf("creating %s client: %w", provider, err) } - logger.Info("CSP client initialized successfully", + slog.Info("CSP client initialized successfully", "provider", string(provider)) return client, nil diff --git a/janitor/pkg/csp/client_test.go b/janitor-provider/pkg/csp/client_test.go similarity index 100% rename from janitor/pkg/csp/client_test.go rename to janitor-provider/pkg/csp/client_test.go diff --git a/janitor/pkg/csp/gcp/gcp.go b/janitor-provider/pkg/csp/gcp/gcp.go similarity index 90% rename from janitor/pkg/csp/gcp/gcp.go rename to janitor-provider/pkg/csp/gcp/gcp.go index f42c69c28..7f26c55d5 100755 --- a/janitor/pkg/csp/gcp/gcp.go +++ b/janitor-provider/pkg/csp/gcp/gcp.go @@ -18,6 +18,7 @@ import ( "context" "errors" "fmt" + "log/slog" "net/http" "regexp" @@ -26,10 +27,9 @@ import ( 
"golang.org/x/oauth2/google" "google.golang.org/api/option" corev1 "k8s.io/api/core/v1" - "sigs.k8s.io/controller-runtime/pkg/log" "github.com/nvidia/nvsentinel/commons/pkg/auditlogger" - "github.com/nvidia/nvsentinel/janitor/pkg/model" + "github.com/nvidia/nvsentinel/janitor-provider/pkg/model" ) var ( @@ -107,8 +107,6 @@ func getNodeFields(node corev1.Node) (*gcpNodeFields, error) { // SendRebootSignal resets a GCE node by stopping and starting the instance. // nolint:dupl // Similar code pattern as SendTerminateSignal is expected for CSP operations func (c *Client) SendRebootSignal(ctx context.Context, node corev1.Node) (model.ResetSignalRequestRef, error) { - logger := log.FromContext(ctx) - httpClient, err := getAuthenticatedHTTPClient(ctx) if err != nil { return "", err @@ -121,7 +119,7 @@ func (c *Client) SendRebootSignal(ctx context.Context, node corev1.Node) (model. defer func() { if cerr := instancesClient.Close(); cerr != nil { - logger.Error(cerr, "failed to close instances client") + slog.Error("failed to close instances client", "error", cerr) } }() @@ -136,7 +134,7 @@ func (c *Client) SendRebootSignal(ctx context.Context, node corev1.Node) (model. Zone: nodeFields.zone, } - logger.Info(fmt.Sprintf("Sending reset signal to %s", nodeFields.instance)) + slog.Info("Sending reset signal to", "node", nodeFields.instance) op, err := instancesClient.Reset(ctx, resetReq) if err != nil { @@ -147,9 +145,7 @@ func (c *Client) SendRebootSignal(ctx context.Context, node corev1.Node) (model. } // IsNodeReady checks if the node is ready after a reboot operation. -func (c *Client) IsNodeReady(ctx context.Context, node corev1.Node, message string) (bool, error) { - logger := log.FromContext(ctx) - +func (c *Client) IsNodeReady(ctx context.Context, node corev1.Node, requestID string) (bool, error) { httpClient, err := getAuthenticatedHTTPClient(ctx) if err != nil { return false, err @@ -162,7 +158,7 @@ func (c *Client) IsNodeReady(ctx context.Context, node corev1.Node, message stri defer func() { if cerr := zoneOperationsClient.Close(); cerr != nil { - logger.Error(cerr, "failed to close zone operations client") + slog.Error("failed to close zone operations client", "error", cerr) } }() @@ -172,7 +168,7 @@ func (c *Client) IsNodeReady(ctx context.Context, node corev1.Node, message stri } req := &computepb.GetZoneOperationRequest{ - Operation: message, + Operation: requestID, Project: nodeFields.project, Zone: nodeFields.zone, } @@ -192,8 +188,6 @@ func (c *Client) IsNodeReady(ctx context.Context, node corev1.Node, message stri // SendTerminateSignal deletes a GCE node. 
// nolint:dupl // Similar code pattern as SendRebootSignal is expected for CSP operations func (c *Client) SendTerminateSignal(ctx context.Context, node corev1.Node) (model.TerminateNodeRequestRef, error) { - logger := log.FromContext(ctx) - httpClient, err := getAuthenticatedHTTPClient(ctx) if err != nil { return "", err } @@ -206,7 +200,7 @@ func (c *Client) SendTerminateSignal(ctx context.Context, node corev1.Node) (mod defer func() { if cerr := instancesClient.Close(); cerr != nil { - logger.Error(cerr, "failed to close instances client") + slog.Error("failed to close instances client", "error", cerr) } }() @@ -221,7 +215,7 @@ func (c *Client) SendTerminateSignal(ctx context.Context, node corev1.Node) (mod Zone: nodeFields.zone, } - logger.Info(fmt.Sprintf("Sending delete signal to %s", nodeFields.instance)) + slog.Info("Sending delete signal", "node", nodeFields.instance) op, err := instancesClient.Delete(ctx, deleteReq) if err != nil { diff --git a/janitor/pkg/csp/kind/kind.go b/janitor-provider/pkg/csp/kind/kind.go similarity index 90% rename from janitor/pkg/csp/kind/kind.go rename to janitor-provider/pkg/csp/kind/kind.go index 3ec0e42bd..93ed5b959 100644 --- a/janitor/pkg/csp/kind/kind.go +++ b/janitor-provider/pkg/csp/kind/kind.go @@ -17,15 +17,15 @@ package kind import ( "context" "fmt" + "log/slog" "math/rand/v2" "os/exec" "strings" "time" corev1 "k8s.io/api/core/v1" - "sigs.k8s.io/controller-runtime/pkg/log" - "github.com/nvidia/nvsentinel/janitor/pkg/model" + "github.com/nvidia/nvsentinel/janitor-provider/pkg/model" ) var ( @@ -51,9 +51,12 @@ func (c *Client) SendRebootSignal(ctx context.Context, node corev1.Node) (model. } // IsNodeReady checks if the node is ready (simulated with randomness for kind) -func (c *Client) IsNodeReady(ctx context.Context, node corev1.Node, message string) (bool, error) { +func (c *Client) IsNodeReady(ctx context.Context, node corev1.Node, requestID string) (bool, error) { // nolint:gosec // G404: Using weak random for simulation is acceptable // simulate some randomness if the node is ready or not (very high success rate for fast tests) + // requestID is unused in simulation mode + _ = requestID + return rand.IntN(100) > 5, nil } @@ -63,8 +66,6 @@ func (c *Client) SendTerminateSignal( ctx context.Context, node corev1.Node, ) (model.TerminateNodeRequestRef, error) { - logger := log.FromContext(ctx) - // Check if provider ID has the correct prefix if !strings.HasPrefix(node.Spec.ProviderID, "kind://") { return "", fmt.Errorf("invalid provider ID format: %s", node.Spec.ProviderID) } @@ -79,7 +80,7 @@ func (c *Client) SendTerminateSignal( containerName := parts[len(parts)-1] clusterName := parts[3] - logger.Info("Attempting to terminate node", "node", node.Name, "container", containerName) + slog.Info("Attempting to terminate node", "node", node.Name, "container", containerName) // Create a timeout context for docker operations dockerCtx, cancel := context.WithTimeout(ctx, 30*time.Second) @@ -110,7 +111,7 @@ func (c *Client) SendTerminateSignal( // nolint:nestif // Complex docker interaction logic migrated from old code // If container exists, delete it if strings.Contains(string(output), containerName) { - logger.Info("Found container, attempting deletion", "container", containerName) + slog.Info("Found container, attempting deletion", "container", containerName) // nolint:gosec // G204: Command args are derived from kubernetes API, not user input cmd = exec.CommandContext(dockerCtx, "docker", "rm", "-f", containerName) @@ -150,9 +151,9 @@ func (c
*Client) SendTerminateSignal( fmt.Errorf("container %s still exists after deletion attempt", containerName) } - logger.Info("Successfully deleted container", "container", containerName) + slog.Info("Successfully deleted container", "container", containerName) } else { - logger.Info("Container not found, assuming already deleted", "container", containerName) + slog.Info("Container not found, assuming already deleted", "container", containerName) } return model.TerminateNodeRequestRef(""), nil diff --git a/janitor/pkg/csp/nebius/nebius.go b/janitor-provider/pkg/csp/nebius/nebius.go similarity index 93% rename from janitor/pkg/csp/nebius/nebius.go rename to janitor-provider/pkg/csp/nebius/nebius.go index b7e48438e..f86ff9be4 100644 --- a/janitor/pkg/csp/nebius/nebius.go +++ b/janitor-provider/pkg/csp/nebius/nebius.go @@ -29,6 +29,7 @@ package nebius import ( "context" "fmt" + "log/slog" "os" "regexp" @@ -39,9 +40,8 @@ import ( computev1 "github.com/nebius/gosdk/services/nebius/compute/v1" "google.golang.org/grpc" corev1 "k8s.io/api/core/v1" - "sigs.k8s.io/controller-runtime/pkg/log" - "github.com/nvidia/nvsentinel/janitor/pkg/model" + "github.com/nvidia/nvsentinel/janitor-provider/pkg/model" ) var ( @@ -130,8 +130,6 @@ func (c *Client) getInstanceService(ctx context.Context) (InstanceService, func( // The instance will be started in IsNodeReady after the stop completes. // This is async - we don't wait for the stop to complete, just initiate it. func (c *Client) SendRebootSignal(ctx context.Context, node corev1.Node) (model.ResetSignalRequestRef, error) { - logger := log.FromContext(ctx) - // Fetch the node's provider ID providerID := node.Spec.ProviderID if providerID == "" { @@ -144,7 +142,7 @@ func (c *Client) SendRebootSignal(ctx context.Context, node corev1.Node) (model. return "", fmt.Errorf("failed to parse provider ID: %w", err) } - logger.Info("Stopping Nebius instance for reboot", "instanceID", nodeFields.instanceID) + slog.Info("Stopping Nebius instance for reboot", "instanceID", nodeFields.instanceID) instanceService, cleanup, err := c.getInstanceService(ctx) if err != nil { @@ -169,11 +167,9 @@ func (c *Client) SendRebootSignal(ctx context.Context, node corev1.Node) (model. 
// - STOPPED -> initiate start // - STARTING -> wait for start to complete // - RUNNING -> node is ready -func (c *Client) IsNodeReady(ctx context.Context, node corev1.Node, message string) (bool, error) { - logger := log.FromContext(ctx) - - // message contains the instance ID from SendRebootSignal - instanceID := message +func (c *Client) IsNodeReady(ctx context.Context, node corev1.Node, requestID string) (bool, error) { + // requestID contains the instance ID from SendRebootSignal + instanceID := requestID instanceService, cleanup, err := c.getInstanceService(ctx) if err != nil { @@ -193,12 +189,12 @@ func (c *Client) IsNodeReady(ctx context.Context, node corev1.Node, message stri switch state { case compute.InstanceStatus_RUNNING: - logger.Info("Nebius instance is running", "instanceID", instanceID) + slog.Info("Nebius instance is running", "instanceID", instanceID) return true, nil case compute.InstanceStatus_STOPPED: - logger.Info("Starting Nebius instance", "instanceID", instanceID) + slog.Info("Starting Nebius instance", "instanceID", instanceID) _, err := instanceService.Start(ctx, &compute.StartInstanceRequest{ Id: instanceID, @@ -211,7 +207,7 @@ func (c *Client) IsNodeReady(ctx context.Context, node corev1.Node, message stri return false, nil case compute.InstanceStatus_STOPPING, compute.InstanceStatus_STARTING: - logger.Info("Nebius instance is in transitional state, waiting", + slog.Info("Nebius instance is in transitional state, waiting", "instanceID", instanceID, "state", state.String()) return false, nil @@ -221,14 +217,14 @@ func (c *Client) IsNodeReady(ctx context.Context, node corev1.Node, message stri compute.InstanceStatus_UPDATING, compute.InstanceStatus_DELETING, compute.InstanceStatus_ERROR: - logger.Info("Nebius instance is in unexpected state", + slog.Info("Nebius instance is in unexpected state", "instanceID", instanceID, "state", state.String()) return false, nil } // Fallback for any unhandled states (future-proofing) - logger.Info("Nebius instance is in unknown state", + slog.Info("Nebius instance is in unknown state", "instanceID", instanceID, "state", state.String()) return false, nil diff --git a/janitor/pkg/csp/oci/oci.go b/janitor-provider/pkg/csp/oci/oci.go similarity index 94% rename from janitor/pkg/csp/oci/oci.go rename to janitor-provider/pkg/csp/oci/oci.go index dca9e2caa..ebe3730c1 100755 --- a/janitor/pkg/csp/oci/oci.go +++ b/janitor-provider/pkg/csp/oci/oci.go @@ -17,6 +17,7 @@ package oci import ( "context" "fmt" + "log/slog" "net/http" "os" "time" @@ -25,10 +26,9 @@ import ( "github.com/oracle/oci-go-sdk/v65/common/auth" "github.com/oracle/oci-go-sdk/v65/core" corev1 "k8s.io/api/core/v1" - ctrllog "sigs.k8s.io/controller-runtime/pkg/log" "github.com/nvidia/nvsentinel/commons/pkg/auditlogger" - "github.com/nvidia/nvsentinel/janitor/pkg/model" + "github.com/nvidia/nvsentinel/janitor-provider/pkg/model" ) var ( @@ -132,15 +132,13 @@ func (c *Client) SendRebootSignal(ctx context.Context, node corev1.Node) (model. } // IsNodeReady checks if the node is ready after a reboot operation. -func (c *Client) IsNodeReady(ctx context.Context, node corev1.Node, message string) (bool, error) { - logger := ctrllog.FromContext(ctx) - +func (c *Client) IsNodeReady(ctx context.Context, node corev1.Node, requestID string) (bool, error) { // Sending a reboot request to OCI doesn't update statuses immediately, // the instance does not report that it isn't in a running state for some time // and kubernetes still sees the node as ready. 
Wait five minutes before checking the status - storedTime, err := time.Parse(time.RFC3339, message) + storedTime, err := time.Parse(time.RFC3339, requestID) if err != nil { - logger.Error(err, "error parsing time") + slog.Error("Failed to parse time", "error", err) return false, err } diff --git a/janitor/pkg/model/csp.go b/janitor-provider/pkg/model/csp.go similarity index 89% rename from janitor/pkg/model/csp.go rename to janitor-provider/pkg/model/csp.go index b0c9985fc..7cd623adf 100644 --- a/janitor/pkg/model/csp.go +++ b/janitor-provider/pkg/model/csp.go @@ -32,7 +32,8 @@ type CSPClient interface { SendRebootSignal(ctx context.Context, node corev1.Node) (ResetSignalRequestRef, error) // IsNodeReady checks if the node is ready after a reboot operation - IsNodeReady(ctx context.Context, node corev1.Node, message string) (bool, error) + // requestID is the reference returned by SendRebootSignal to track the operation + IsNodeReady(ctx context.Context, node corev1.Node, requestID string) (bool, error) // SendTerminateSignal sends a termination signal to the node via the CSP SendTerminateSignal(ctx context.Context, node corev1.Node) (TerminateNodeRequestRef, error) diff --git a/janitor/go.mod b/janitor/go.mod index 2ba3e4c67..6e815b8de 100644 --- a/janitor/go.mod +++ b/janitor/go.mod @@ -2,28 +2,17 @@ module github.com/nvidia/nvsentinel/janitor go 1.25.0 -toolchain go1.25.3 - require ( - cloud.google.com/go/compute v1.51.0 - github.com/Azure/azure-sdk-for-go/sdk/azcore v1.20.0 - github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1 - github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute v1.0.0 - github.com/aws/aws-sdk-go-v2/config v1.32.5 - github.com/aws/aws-sdk-go-v2/service/ec2 v1.275.0 github.com/go-logr/logr v1.4.3 - github.com/nebius/gosdk v0.0.0-20251217093515-85d26a34ceb2 + github.com/nvidia/nvsentinel/api v0.0.0 github.com/nvidia/nvsentinel/commons v0.0.0 github.com/onsi/ginkgo/v2 v2.27.2 github.com/onsi/gomega v1.38.2 - github.com/oracle/oci-go-sdk/v65 v65.105.0 github.com/prometheus/client_golang v1.23.2 github.com/spf13/viper v1.21.0 github.com/stretchr/testify v1.11.1 - golang.org/x/oauth2 v0.33.0 golang.org/x/sync v0.19.0 - google.golang.org/api v0.257.0 - google.golang.org/grpc v1.77.0 + google.golang.org/grpc v1.78.0 k8s.io/api v0.35.0 k8s.io/apimachinery v0.35.0 k8s.io/client-go v0.35.0 @@ -31,28 +20,9 @@ require ( ) require ( - buf.build/gen/go/bufbuild/protovalidate/protocolbuffers/go v1.36.4-20250130201111-63bb56e20495.1 // indirect cel.dev/expr v0.25.1 // indirect - cloud.google.com/go/auth v0.17.0 // indirect - cloud.google.com/go/auth/oauth2adapt v0.2.8 // indirect - cloud.google.com/go/compute/metadata v0.9.0 // indirect - github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2 // indirect - github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0 // indirect github.com/Masterminds/semver/v3 v3.4.0 // indirect github.com/antlr4-go/antlr/v4 v4.13.1 // indirect - github.com/aws/aws-sdk-go-v2 v1.41.0 // indirect - github.com/aws/aws-sdk-go-v2/credentials v1.19.5 // indirect - github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.16 // indirect - github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.16 // indirect - github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.16 // indirect - github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.4 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.16 // indirect - 
github.com/aws/aws-sdk-go-v2/service/signin v1.0.4 // indirect - github.com/aws/aws-sdk-go-v2/service/sso v1.30.7 // indirect - github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.12 // indirect - github.com/aws/aws-sdk-go-v2/service/sts v1.41.5 // indirect - github.com/aws/smithy-go v1.24.0 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/blang/semver/v4 v4.0.0 // indirect github.com/cenkalti/backoff/v4 v4.3.0 // indirect @@ -82,36 +52,26 @@ require ( github.com/go-openapi/swag/yamlutils v0.25.4 // indirect github.com/go-task/slim-sprig/v3 v3.0.0 // indirect github.com/go-viper/mapstructure/v2 v2.4.0 // indirect - github.com/gofrs/flock v0.12.1 // indirect github.com/gogo/protobuf v1.3.2 // indirect - github.com/golang-jwt/jwt/v4 v4.5.1 // indirect - github.com/golang-jwt/jwt/v5 v5.3.0 // indirect github.com/google/btree v1.1.3 // indirect github.com/google/cel-go v0.26.1 // indirect github.com/google/gnostic-models v0.7.1 // indirect github.com/google/go-cmp v0.7.0 // indirect github.com/google/pprof v0.0.0-20251114195745-4902fdda35c8 // indirect - github.com/google/s2a-go v0.1.9 // indirect github.com/google/uuid v1.6.0 // indirect - github.com/googleapis/enterprise-certificate-proxy v0.3.7 // indirect - github.com/googleapis/gax-go/v2 v2.15.0 // indirect - github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.3.0 // indirect github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/klauspost/compress v1.18.2 // indirect - github.com/kylelemons/godebug v1.1.0 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pelletier/go-toml/v2 v2.2.4 // indirect - github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/common v0.67.4 // indirect github.com/prometheus/procfs v0.19.2 // indirect github.com/sagikazarmark/locafero v0.12.0 // indirect - github.com/sony/gobreaker v0.5.0 // indirect github.com/spf13/afero v1.15.0 // indirect github.com/spf13/cast v1.10.0 // indirect github.com/spf13/cobra v1.9.1 // indirect @@ -119,7 +79,6 @@ require ( github.com/stoewer/go-strcase v1.3.1 // indirect github.com/subosito/gotenv v1.6.0 // indirect github.com/x448/float16 v0.8.4 // indirect - github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 // indirect go.opentelemetry.io/auto/sdk v1.2.1 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 // indirect go.opentelemetry.io/otel v1.38.0 // indirect @@ -133,18 +92,17 @@ require ( go.uber.org/zap v1.27.1 // indirect go.yaml.in/yaml/v2 v2.4.3 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect - golang.org/x/crypto v0.45.0 // indirect golang.org/x/exp v0.0.0-20251125195548-87e1e737ad39 // indirect golang.org/x/mod v0.30.0 // indirect golang.org/x/net v0.47.0 // indirect + golang.org/x/oauth2 v0.33.0 // indirect golang.org/x/sys v0.38.0 // indirect golang.org/x/term v0.37.0 // indirect golang.org/x/text v0.31.0 // indirect golang.org/x/time v0.14.0 // indirect golang.org/x/tools v0.39.0 // indirect gomodules.xyz/jsonpatch/v2 v2.5.0 // indirect - google.golang.org/genproto v0.0.0-20250603155806-513f23925822 // indirect - 
google.golang.org/genproto/googleapis/api v0.0.0-20251124214823-79d6a2a48846 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20251111163417-95abcf5c77ba // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20251124214823-79d6a2a48846 // indirect google.golang.org/protobuf v1.36.11 // indirect gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect @@ -170,3 +128,5 @@ replace github.com/nvidia/nvsentinel/store-client => ../store-client replace github.com/nvidia/nvsentinel/data-models => ../data-models replace github.com/nvidia/nvsentinel/commons => ../commons + +replace github.com/nvidia/nvsentinel/api => ../api diff --git a/janitor/go.sum b/janitor/go.sum index 67228d562..4e22f572c 100644 --- a/janitor/go.sum +++ b/janitor/go.sum @@ -1,71 +1,9 @@ -buf.build/gen/go/bufbuild/protovalidate/protocolbuffers/go v1.36.4-20250130201111-63bb56e20495.1 h1:4erM3WLgEG/HIBrpBDmRbs1puhd7p0z7kNXDuhHthwM= -buf.build/gen/go/bufbuild/protovalidate/protocolbuffers/go v1.36.4-20250130201111-63bb56e20495.1/go.mod h1:novQBstnxcGpfKf8qGRATqn1anQKwMJIbH5Q581jibU= cel.dev/expr v0.25.1 h1:1KrZg61W6TWSxuNZ37Xy49ps13NUovb66QLprthtwi4= cel.dev/expr v0.25.1/go.mod h1:hrXvqGP6G6gyx8UAHSHJ5RGk//1Oj5nXQ2NI02Nrsg4= -cloud.google.com/go v0.123.0 h1:2NAUJwPR47q+E35uaJeYoNhuNEM9kM8SjgRgdeOJUSE= -cloud.google.com/go v0.123.0/go.mod h1:xBoMV08QcqUGuPW65Qfm1o9Y4zKZBpGS+7bImXLTAZU= -cloud.google.com/go/auth v0.17.0 h1:74yCm7hCj2rUyyAocqnFzsAYXgJhrG26XCFimrc/Kz4= -cloud.google.com/go/auth v0.17.0/go.mod h1:6wv/t5/6rOPAX4fJiRjKkJCvswLwdet7G8+UGXt7nCQ= -cloud.google.com/go/auth/oauth2adapt v0.2.8 h1:keo8NaayQZ6wimpNSmW5OPc283g65QNIiLpZnkHRbnc= -cloud.google.com/go/auth/oauth2adapt v0.2.8/go.mod h1:XQ9y31RkqZCcwJWNSx2Xvric3RrU88hAYYbjDWYDL+c= -cloud.google.com/go/compute v1.51.0 h1:QtSH2H1b5geXTJ6lqjCqH6SL0Vo1DTqACszjJ3uU9Jo= -cloud.google.com/go/compute v1.51.0/go.mod h1:zdogTa7daHhEtEX92+S5IARtQmi/RNVPUfoI8Jhl8Do= -cloud.google.com/go/compute/metadata v0.9.0 h1:pDUj4QMoPejqq20dK0Pg2N4yG9zIkYGdBtwLoEkH9Zs= -cloud.google.com/go/compute/metadata v0.9.0/go.mod h1:E0bWwX5wTnLPedCKqk3pJmVgCBSM6qQI1yTBdEb3C10= -github.com/Azure/azure-sdk-for-go/sdk/azcore v1.20.0 h1:JXg2dwJUmPB9JmtVmdEB16APJ7jurfbY5jnfXpJoRMc= -github.com/Azure/azure-sdk-for-go/sdk/azcore v1.20.0/go.mod h1:YD5h/ldMsG0XiIw7PdyNhLxaM317eFh5yNLccNfGdyw= -github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1 h1:Hk5QBxZQC1jb2Fwj6mpzme37xbCDdNTxU7O9eb5+LB4= -github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1/go.mod h1:IYus9qsFobWIc2YVwe/WPjcnyCkPKtnHAqUYeebc8z0= -github.com/Azure/azure-sdk-for-go/sdk/azidentity/cache v0.3.2 h1:yz1bePFlP5Vws5+8ez6T3HWXPmwOK7Yvq8QxDBD3SKY= -github.com/Azure/azure-sdk-for-go/sdk/azidentity/cache v0.3.2/go.mod h1:Pa9ZNPuoNu/GztvBSKk9J1cDJW6vk/n0zLtV4mgd8N8= -github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2 h1:9iefClla7iYpfYWdzPCRDozdmndjTm8DXdpCzPajMgA= -github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2/go.mod h1:XtLgD3ZD34DAaVIIAyG3objl5DynM3CQ/vMcbBNJZGI= -github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute v1.0.0 h1:/Di3vB4sNeQ+7A8efjUVENvyB945Wruvstucqp7ZArg= -github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute v1.0.0/go.mod h1:gM3K25LQlsET3QR+4V74zxCsFAy0r6xMNN9n80SZn+4= -github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/internal v1.0.0 h1:lMW1lD/17LUA5z1XTURo7LcVG2ICBPlyMHjIUrcFZNQ= -github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/internal v1.0.0/go.mod h1:ceIuwmxDWptoW3eCqSXlnPsZFKh4X+R38dWPv7GS9Vs= 
-github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork v1.0.0 h1:nBy98uKOIfun5z6wx6jwWLrULcM0+cjBalBFZlEZ7CA= -github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork v1.0.0/go.mod h1:243D9iHbcQXoFUtgHJwL7gl2zx1aDuDMjvBZVGr2uW0= -github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources v1.0.0 h1:ECsQtyERDVz3NP3kvDOTLvbQhqWp/x9EsGKtb4ogUr8= -github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources v1.0.0/go.mod h1:s1tW/At+xHqjNFvWU4G0c0Qv33KOhvbGNj0RCTQDV8s= -github.com/AzureAD/microsoft-authentication-extensions-for-go/cache v0.1.1 h1:WJTmL004Abzc5wDB5VtZG2PJk5ndYDgVacGqfirKxjM= -github.com/AzureAD/microsoft-authentication-extensions-for-go/cache v0.1.1/go.mod h1:tCcJZ0uHAmvjsVYzEFivsRTN00oz5BEsRgQHu5JZ9WE= -github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0 h1:XRzhVemXdgvJqCH0sFfrBUTnUJSBrBf7++ypk+twtRs= -github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0/go.mod h1:HKpQxkWaGLJ+D/5H8QRpyQXA1eKjxkFlOMwck5+33Jk= github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0= github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= github.com/antlr4-go/antlr/v4 v4.13.1 h1:SqQKkuVZ+zWkMMNkjy5FZe5mr5WURWnlpmOuzYWrPrQ= github.com/antlr4-go/antlr/v4 v4.13.1/go.mod h1:GKmUxMtwp6ZgGwZSva4eWPC5mS6vUAmOABFgjdkM7Nw= -github.com/aws/aws-sdk-go-v2 v1.41.0 h1:tNvqh1s+v0vFYdA1xq0aOJH+Y5cRyZ5upu6roPgPKd4= -github.com/aws/aws-sdk-go-v2 v1.41.0/go.mod h1:MayyLB8y+buD9hZqkCW3kX1AKq07Y5pXxtgB+rRFhz0= -github.com/aws/aws-sdk-go-v2/config v1.32.5 h1:pz3duhAfUgnxbtVhIK39PGF/AHYyrzGEyRD9Og0QrE8= -github.com/aws/aws-sdk-go-v2/config v1.32.5/go.mod h1:xmDjzSUs/d0BB7ClzYPAZMmgQdrodNjPPhd6bGASwoE= -github.com/aws/aws-sdk-go-v2/credentials v1.19.5 h1:xMo63RlqP3ZZydpJDMBsH9uJ10hgHYfQFIk1cHDXrR4= -github.com/aws/aws-sdk-go-v2/credentials v1.19.5/go.mod h1:hhbH6oRcou+LpXfA/0vPElh/e0M3aFeOblE1sssAAEk= -github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.16 h1:80+uETIWS1BqjnN9uJ0dBUaETh+P1XwFy5vwHwK5r9k= -github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.16/go.mod h1:wOOsYuxYuB/7FlnVtzeBYRcjSRtQpAW0hCP7tIULMwo= -github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.16 h1:rgGwPzb82iBYSvHMHXc8h9mRoOUBZIGFgKb9qniaZZc= -github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.16/go.mod h1:L/UxsGeKpGoIj6DxfhOWHWQ/kGKcd4I1VncE4++IyKA= -github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.16 h1:1jtGzuV7c82xnqOVfx2F0xmJcOw5374L7N6juGW6x6U= -github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.16/go.mod h1:M2E5OQf+XLe+SZGmmpaI2yy+J326aFf6/+54PoxSANc= -github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4 h1:WKuaxf++XKWlHWu9ECbMlha8WOEGm0OUEZqm4K/Gcfk= -github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4/go.mod h1:ZWy7j6v1vWGmPReu0iSGvRiise4YI5SkR3OHKTZ6Wuc= -github.com/aws/aws-sdk-go-v2/service/ec2 v1.275.0 h1:ymusjrsOjrcVBQNQXYFIQEHJIJ17/m+VoDSmWIMjGe0= -github.com/aws/aws-sdk-go-v2/service/ec2 v1.275.0/go.mod h1:QrV+/GjhSrJh6MRRuTO6ZEg4M2I0nwPakf0lZHSrE1o= -github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.4 h1:0ryTNEdJbzUCEWkVXEXoqlXV72J5keC1GvILMOuD00E= -github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.4/go.mod h1:HQ4qwNZh32C3CBeO6iJLQlgtMzqeG17ziAA/3KDJFow= -github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.16 h1:oHjJHeUy0ImIV0bsrX0X91GkV5nJAyv1l1CC9lnO0TI= -github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.16/go.mod h1:iRSNGgOYmiYwSCXxXaKb9HfOEj40+oTKn8pTxMlYkRM= 
-github.com/aws/aws-sdk-go-v2/service/signin v1.0.4 h1:HpI7aMmJ+mm1wkSHIA2t5EaFFv5EFYXePW30p1EIrbQ= -github.com/aws/aws-sdk-go-v2/service/signin v1.0.4/go.mod h1:C5RdGMYGlfM0gYq/tifqgn4EbyX99V15P2V3R+VHbQU= -github.com/aws/aws-sdk-go-v2/service/sso v1.30.7 h1:eYnlt6QxnFINKzwxP5/Ucs1vkG7VT3Iezmvfgc2waUw= -github.com/aws/aws-sdk-go-v2/service/sso v1.30.7/go.mod h1:+fWt2UHSb4kS7Pu8y+BMBvJF0EWx+4H0hzNwtDNRTrg= -github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.12 h1:AHDr0DaHIAo8c9t1emrzAlVDFp+iMMKnPdYy6XO4MCE= -github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.12/go.mod h1:GQ73XawFFiWxyWXMHWfhiomvP3tXtdNar/fi8z18sx0= -github.com/aws/aws-sdk-go-v2/service/sts v1.41.5 h1:SciGFVNZ4mHdm7gpD1dgZYnCuVdX1s+lFTg4+4DOy70= -github.com/aws/aws-sdk-go-v2/service/sts v1.41.5/go.mod h1:iW40X4QBmUxdP+fZNOpfmkdMZqsovezbAeO+Ubiv2pk= -github.com/aws/smithy-go v1.24.0 h1:LpilSUItNPFr1eY85RYgTIg5eIEPtvFbskaFcmmIUnk= -github.com/aws/smithy-go v1.24.0/go.mod h1:LEj2LM3rBRQJxPZTB4KuzZkaZYnZPnvgIhb4pu07mx0= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= @@ -146,14 +84,8 @@ github.com/go-viper/mapstructure/v2 v2.4.0 h1:EBsztssimR/CONLSZZ04E8qAkxNYq4Qp9L github.com/go-viper/mapstructure/v2 v2.4.0/go.mod h1:oJDH3BJKyqBA2TXFhDsKDGDTlndYOZ6rGS0BRZIxGhM= github.com/goccy/go-yaml v1.18.0 h1:8W7wMFS12Pcas7KU+VVkaiCng+kG8QiFeFwzFb+rwuw= github.com/goccy/go-yaml v1.18.0/go.mod h1:XBurs7gK8ATbW4ZPGKgcbrY1Br56PdM69F7LkFRi1kA= -github.com/gofrs/flock v0.12.1 h1:MTLVXXHf8ekldpJk3AKicLij9MdwOWkZ+a/jHHZby9E= -github.com/gofrs/flock v0.12.1/go.mod h1:9zxTsyu5xtJ9DK+1tFZyibEV7y3uwDxPPfbxeeHCoD0= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= -github.com/golang-jwt/jwt/v4 v4.5.1 h1:JdqV9zKUdtaa9gdPlywC3aeoEsR681PlKC+4F5gQgeo= -github.com/golang-jwt/jwt/v4 v4.5.1/go.mod h1:m21LjoU+eqJr34lmDMbreY2eSTRJ1cv77w39/MY0Ch0= -github.com/golang-jwt/jwt/v5 v5.3.0 h1:pv4AsKCKKZuqlgs5sUmn4x8UlGa0kEVt/puTpKx9vvo= -github.com/golang-jwt/jwt/v5 v5.3.0/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE= github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg= @@ -169,16 +101,8 @@ github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/pprof v0.0.0-20251114195745-4902fdda35c8 h1:3DsUAV+VNEQa2CUVLxCY3f87278uWfIDhJnbdvDjvmE= github.com/google/pprof v0.0.0-20251114195745-4902fdda35c8/go.mod h1:I6V7YzU0XDpsHqbsyrghnFZLO1gwK6NPTNvmetQIk9U= -github.com/google/s2a-go v0.1.9 h1:LGD7gtMgezd8a/Xak7mEWL0PjoTQFvpRudN895yqKW0= -github.com/google/s2a-go v0.1.9/go.mod h1:YA0Ei2ZQL3acow2O62kdp9UlnvMmU7kA6Eutn0dXayM= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/googleapis/enterprise-certificate-proxy v0.3.7 h1:zrn2Ee/nWmHulBx5sAVrGgAa0f2/R35S4DJwfFaUPFQ= -github.com/googleapis/enterprise-certificate-proxy v0.3.7/go.mod h1:MkHOF77EYAE7qfSuSS9PU6g4Nt4e11cnsDUowfwewLA= -github.com/googleapis/gax-go/v2 v2.15.0 
h1:SyjDc1mGgZU5LncH8gimWo9lW1DtIfPibOG81vgd/bo= -github.com/googleapis/gax-go/v2 v2.15.0/go.mod h1:zVVkkxAQHa1RQpg9z2AUCMnKhi0Qld9rcmyfL1OZhoc= -github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.3.0 h1:FbSCl+KggFl+Ocym490i/EyXF4lPgLoUtcSWquBM0Rs= -github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.3.0/go.mod h1:qOchhhIlmRcqk/O9uCo/puJlyo07YINaIqdZfZG3Jkc= github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3 h1:5ZPtiqj0JL5oKWmcsq4VMaAW5ukBEgSGXEN89zeH1Jo= github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3/go.mod h1:ndYquD05frm2vACXE1nsccT4oJzjhw2arTS2cpUD1PI= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= @@ -187,8 +111,6 @@ github.com/joshdk/go-junit v1.0.0 h1:S86cUKIdwBHWwA6xCmFlf3RTLfVXYQfvanM5Uh+K6GE github.com/joshdk/go-junit v1.0.0/go.mod h1:TiiV0PqkaNfFXjEiyjWM3XXrhVyCa1K4Zfga6W52ung= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= -github.com/keybase/go-keychain v0.0.1 h1:way+bWYa6lDppZoZcgMbYsvC7GxljxrskdNInRtuthU= -github.com/keybase/go-keychain v0.0.1/go.mod h1:PdEILRW3i9D8JcdM+FmY6RwkHGnhHxXwkPPMeUgOK1k= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/klauspost/compress v1.18.2 h1:iiPHWW0YrcFgpBYhsA6D1+fqHssJscY/Tm/y2Uqnapk= @@ -211,18 +133,12 @@ github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFd github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= -github.com/nebius/gosdk v0.0.0-20251217093515-85d26a34ceb2 h1:KvtU4xMQB8PrUzmsjwP3j43xJal6JHXt30YkwzW1EHw= -github.com/nebius/gosdk v0.0.0-20251217093515-85d26a34ceb2/go.mod h1:8r4EhhGJ+RMUfdiVVpZ8pEb0b+O7hLG8JXDAgGyu89o= github.com/onsi/ginkgo/v2 v2.27.2 h1:LzwLj0b89qtIy6SSASkzlNvX6WktqurSHwkk2ipF/Ns= github.com/onsi/ginkgo/v2 v2.27.2/go.mod h1:ArE1D/XhNXBXCBkKOLkbsb2c81dQHCRcF5zwn/ykDRo= github.com/onsi/gomega v1.38.2 h1:eZCjf2xjZAqe+LeWvKb5weQ+NcPwX84kqJ0cZNxok2A= github.com/onsi/gomega v1.38.2/go.mod h1:W2MJcYxRGV63b418Ai34Ud0hEdTVXq9NW9+Sx6uXf3k= -github.com/oracle/oci-go-sdk/v65 v65.105.0 h1:VN3IkW4kwyOOIrjrg7Lh1QGG/sou54c8dqTZB2THeTE= -github.com/oracle/oci-go-sdk/v65 v65.105.0/go.mod h1:oB8jFGVc/7/zJ+DbleE8MzGHjhs2ioCz5stRTdZdIcY= github.com/pelletier/go-toml/v2 v2.2.4 h1:mye9XuhQ6gvn5h28+VilKrrPoQVanw5PMw/TB0t5Ec4= github.com/pelletier/go-toml/v2 v2.2.4/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8t9Stt+h56mCY= -github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c h1:+mdjkGKdHQG3305AYmdv1U2eRNDiU2ErMBj1gwrq8eQ= -github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c/go.mod h1:7rwL4CYBLnjLxUqIJNnCWiEdr3bn6IUYi15bNlnbCCU= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= @@ -241,8 +157,6 @@ github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7 github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= 
github.com/sagikazarmark/locafero v0.12.0 h1:/NQhBAkUb4+fH1jivKHWusDYFjMOOKU88eegjfxfHb4= github.com/sagikazarmark/locafero v0.12.0/go.mod h1:sZh36u/YSZ918v0Io+U9ogLYQJ9tLLBmM4eneO6WwsI= -github.com/sony/gobreaker v0.5.0 h1:dRCvqm0P490vZPmy7ppEk2qCnCieBooFJ+YoXGYB+yg= -github.com/sony/gobreaker v0.5.0/go.mod h1:ZKptC7FHNvhBz7dN2LGjPVBz2sZJmc0/PkyDJOjmxWY= github.com/spf13/afero v1.15.0 h1:b/YBCLWAJdFWJTN9cLhiXXcD7mzKn9Dm86dNnfyQw1I= github.com/spf13/afero v1.15.0/go.mod h1:NC2ByUVxtQs4b3sIUphxK0NioZnmxgyCrfzeuq8lxMg= github.com/spf13/cast v1.10.0 h1:h2x0u2shc1QuLHfxi+cTJvs30+ZAHOGRic8uyGTDWxY= @@ -279,8 +193,6 @@ github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY= github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28= github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= -github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 h1:ilQV1hzziu+LLM3zUTJ0trRztfwgjqKnBWNtSRkbmwM= -github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78/go.mod h1:aL8wCCfTfSfmXjznFBSZNN13rSJjlIOI1fUNAtF7rmI= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= @@ -316,8 +228,6 @@ go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.45.0 h1:jMBrvKuj23MTlT0bQEOBcAE0mjg8mK9RXFhRH6nyF3Q= -golang.org/x/crypto v0.45.0/go.mod h1:XTGrrkGJve7CYK7J8PEww4aY7gM3qMCElcJQ8n8JdX4= golang.org/x/exp v0.0.0-20251125195548-87e1e737ad39 h1:DHNhtq3sNNzrvduZZIiFyXWOL9IWaDPHqTnLJp+rCBY= golang.org/x/exp v0.0.0-20251125195548-87e1e737ad39/go.mod h1:46edojNIoXTNOhySWIWdix628clX9ODXwPsQuG6hsK0= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= @@ -340,7 +250,6 @@ golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc= golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= golang.org/x/term v0.37.0 h1:8EGAD0qCmHYZg6J17DvsMy9/wJ7/D/4pV/wfnld5lTU= @@ -365,16 +274,12 @@ gomodules.xyz/jsonpatch/v2 v2.5.0 h1:JELs8RLM12qJGXU4u/TO3V25KW8GreMKl9pdkk14RM0 gomodules.xyz/jsonpatch/v2 v2.5.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= -google.golang.org/api v0.257.0 h1:8Y0lzvHlZps53PEaw+G29SsQIkuKrumGWs9puiexNAA= -google.golang.org/api v0.257.0/go.mod 
h1:4eJrr+vbVaZSqs7vovFd1Jb/A6ml6iw2e6FBYf3GAO4= -google.golang.org/genproto v0.0.0-20250603155806-513f23925822 h1:rHWScKit0gvAPuOnu87KpaYtjK5zBMLcULh7gxkCXu4= -google.golang.org/genproto v0.0.0-20250603155806-513f23925822/go.mod h1:HubltRL7rMh0LfnQPkMH4NPDFEWp0jw3vixw7jEM53s= -google.golang.org/genproto/googleapis/api v0.0.0-20251124214823-79d6a2a48846 h1:ZdyUkS9po3H7G0tuh955QVyyotWvOD4W0aEapeGeUYk= -google.golang.org/genproto/googleapis/api v0.0.0-20251124214823-79d6a2a48846/go.mod h1:Fk4kyraUvqD7i5H6S43sj2W98fbZa75lpZz/eUyhfO0= +google.golang.org/genproto/googleapis/api v0.0.0-20251111163417-95abcf5c77ba h1:B14OtaXuMaCQsl2deSvNkyPKIzq3BjfxQp8d00QyWx4= +google.golang.org/genproto/googleapis/api v0.0.0-20251111163417-95abcf5c77ba/go.mod h1:G5IanEx8/PgI9w6CFcYQf7jMtHQhZruvfM1i3qOqk5U= google.golang.org/genproto/googleapis/rpc v0.0.0-20251124214823-79d6a2a48846 h1:Wgl1rcDNThT+Zn47YyCXOXyX/COgMTIdhJ717F0l4xk= google.golang.org/genproto/googleapis/rpc v0.0.0-20251124214823-79d6a2a48846/go.mod h1:7i2o+ce6H/6BluujYR+kqX3GKH+dChPTQU19wjRPiGk= -google.golang.org/grpc v1.77.0 h1:wVVY6/8cGA6vvffn+wWK5ToddbgdU3d8MNENr4evgXM= -google.golang.org/grpc v1.77.0/go.mod h1:z0BY1iVj0q8E1uSQCjL9cppRj+gnZjzDnzV0dHhrNig= +google.golang.org/grpc v1.78.0 h1:K1XZG/yGDJnzMdd/uZHAkVqJE+xIDOcmdSFZkBUicNc= +google.golang.org/grpc v1.78.0/go.mod h1:I47qjTo4OKbMkjA/aOOwxDIiPSBofUtQUI5EfpWvW7U= google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= diff --git a/janitor/pkg/config/config.go b/janitor/pkg/config/config.go index 3507a3ecf..d7ae74177 100644 --- a/janitor/pkg/config/config.go +++ b/janitor/pkg/config/config.go @@ -31,9 +31,10 @@ type Config struct { // GlobalConfig contains global janitor settings type GlobalConfig struct { - Timeout time.Duration `mapstructure:"timeout" json:"timeout"` - ManualMode bool `mapstructure:"manualMode" json:"manualMode"` - Nodes NodeConfig `mapstructure:"nodes" json:"nodes"` + Timeout time.Duration `mapstructure:"timeout" json:"timeout"` + ManualMode bool `mapstructure:"manualMode" json:"manualMode"` + Nodes NodeConfig `mapstructure:"nodes" json:"nodes"` + CSPProviderHost string `mapstructure:"cspProviderHost" json:"cspProviderHost"` } // NodeConfig contains configuration for nodes @@ -52,6 +53,8 @@ type RebootNodeControllerConfig struct { // NodeExclusions defines label selectors for nodes that should be excluded from reboot operations // Nodes matching any of these label selectors will be rejected by the admission webhook NodeExclusions []metav1.LabelSelector + // CSPProviderHost is the host of the CSP provider + CSPProviderHost string } // TerminateNodeControllerConfig contains configuration for terminate node controller @@ -65,6 +68,8 @@ type TerminateNodeControllerConfig struct { // NodeExclusions defines label selectors for nodes that should be excluded from terminate operations // Nodes matching any of these label selectors will be rejected by the admission webhook NodeExclusions []metav1.LabelSelector + // CSPProviderHost is the host of the CSP provider + CSPProviderHost string } // LoadConfig loads configuration from a YAML file using Viper @@ -98,5 +103,15 @@ func LoadConfig(configPath string) (*Config, error) { config.RebootNode.NodeExclusions = config.Global.Nodes.Exclusions config.TerminateNode.NodeExclusions = config.Global.Nodes.Exclusions + 
// If CSPProviderHost is not set for reboot node controller, use the global CSPProviderHost + if config.RebootNode.CSPProviderHost == "" { + config.RebootNode.CSPProviderHost = config.Global.CSPProviderHost + } + + // If CSPProviderHost is not set for terminate node controller, use the global CSPProviderHost + if config.TerminateNode.CSPProviderHost == "" { + config.TerminateNode.CSPProviderHost = config.Global.CSPProviderHost + } + return &config, nil } diff --git a/janitor/pkg/controller/rebootnode_controller.go b/janitor/pkg/controller/rebootnode_controller.go index d8460150e..d6b95d2c3 100644 --- a/janitor/pkg/controller/rebootnode_controller.go +++ b/janitor/pkg/controller/rebootnode_controller.go @@ -21,6 +21,8 @@ import ( "fmt" "time" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials/insecure" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" @@ -28,12 +30,12 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/manager" + cspv1alpha1 "github.com/nvidia/nvsentinel/api/gen/go/csp/v1alpha1" janitordgxcnvidiacomv1alpha1 "github.com/nvidia/nvsentinel/janitor/api/v1alpha1" "github.com/nvidia/nvsentinel/janitor/pkg/config" - "github.com/nvidia/nvsentinel/janitor/pkg/csp" "github.com/nvidia/nvsentinel/janitor/pkg/metrics" - "github.com/nvidia/nvsentinel/janitor/pkg/model" ) const ( @@ -74,7 +76,8 @@ type RebootNodeReconciler struct { client.Client Scheme *runtime.Scheme Config *config.RebootNodeControllerConfig - CSPClient model.CSPClient + CSPClient cspv1alpha1.CSPProviderServiceClient + grpcConn *grpc.ClientConn } // +kubebuilder:rbac:groups=janitor.dgxc.nvidia.com,resources=rebootnodes,verbs=get;list;watch;create;update;patch;delete @@ -180,24 +183,16 @@ func (r *RebootNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) // Check if csp reports the node is ready cspReady := false - var nodeReadyErr error - if r.Config.ManualMode { cspReady = true - nodeReadyErr = nil } else { - // Add timeout to CSP operation to prevent queue blocking - cspCtx, cancel := context.WithTimeout(ctx, CSPOperationTimeout) - defer cancel() - - cspReady, nodeReadyErr = r.CSPClient.IsNodeReady(cspCtx, node, rebootNode.GetCSPReqRef()) - - // Check for timeout specifically - if errors.Is(nodeReadyErr, context.DeadlineExceeded) { - logger.Info("CSP operation timed out, will retry", - "node", node.Name, - "operation", "IsNodeReady", - "timeout", CSPOperationTimeout) + rsp, nodeReadyErr := r.CSPClient.IsNodeReady(ctx, &cspv1alpha1.IsNodeReadyRequest{ + NodeName: node.Name, + RequestId: rebootNode.GetCSPReqRef(), + }) + if nodeReadyErr != nil { + logger.Error(nodeReadyErr, "failed to check if node is ready", + "node", node.Name) rebootNode.Status.ConsecutiveFailures++ delay := getNextRequeueDelay(rebootNode.Status.ConsecutiveFailures) @@ -206,6 +201,8 @@ func (r *RebootNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) // Update status and return early return r.updateRebootNodeStatus(ctx, req, originalRebootNode, &rebootNode, result) } + + cspReady = rsp.IsReady } // Check if kubernetes reports the node is ready. 
@@ -218,25 +215,7 @@ func (r *RebootNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) } // nolint:gocritic // Migrated business logic with if-else chain - if nodeReadyErr != nil { - logger.Error(nodeReadyErr, "node ready status check failed", - "node", node.Name) - - rebootNode.Status.ConsecutiveFailures++ - - rebootNode.SetCompletionTime() - rebootNode.SetCondition(metav1.Condition{ - Type: janitordgxcnvidiacomv1alpha1.RebootNodeConditionNodeReady, - Status: metav1.ConditionFalse, - Reason: "Failed", - Message: fmt.Sprintf("Node status could not be checked from CSP: %s", nodeReadyErr), - LastTransitionTime: metav1.Now(), - }) - - metrics.GlobalMetrics.IncActionCount(metrics.ActionTypeReboot, metrics.StatusFailed, node.Name) - - result = ctrl.Result{} // Don't requeue on failure - } else if cspReady && kubernetesReady { + if cspReady && kubernetesReady { logger.Info("node reached ready state post-reboot", "node", node.Name, "duration", time.Since(rebootNode.Status.StartTime.Time)) @@ -335,11 +314,9 @@ func (r *RebootNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) logger.Info("sending reboot signal to node", "node", node.Name) - // Add timeout to CSP operation - cspCtx, cancel := context.WithTimeout(ctx, CSPOperationTimeout) - defer cancel() - - reqRef, rebootErr := r.CSPClient.SendRebootSignal(cspCtx, node) + rsp, rebootErr := r.CSPClient.SendRebootSignal(ctx, &cspv1alpha1.SendRebootSignalRequest{ + NodeName: node.Name, + }) // Check for timeout if errors.Is(rebootErr, context.DeadlineExceeded) { @@ -367,7 +344,7 @@ func (r *RebootNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) Type: janitordgxcnvidiacomv1alpha1.RebootNodeConditionSignalSent, Status: metav1.ConditionTrue, Reason: "Succeeded", - Message: string(reqRef), + Message: rsp.RequestId, LastTransitionTime: metav1.Now(), } // Continue monitoring if signal was sent successfully @@ -401,17 +378,21 @@ func (r *RebootNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) // SetupWithManager sets up the controller with the Manager. func (r *RebootNodeReconciler) SetupWithManager(mgr ctrl.Manager) error { - // Use background context for client initialization during controller setup - // This is synchronous and happens before the controller starts processing events - ctx := context.Background() - - var err error - - r.CSPClient, err = csp.New(ctx) + conn, err := grpc.NewClient(r.Config.CSPProviderHost, grpc.WithTransportCredentials(insecure.NewCredentials())) if err != nil { return fmt.Errorf("failed to create CSP client: %w", err) } + r.grpcConn = conn + r.CSPClient = cspv1alpha1.NewCSPProviderServiceClient(r.grpcConn) + + if err := mgr.Add(manager.RunnableFunc(func(ctx context.Context) error { + <-ctx.Done() + return r.grpcConn.Close() + })); err != nil { + return fmt.Errorf("failed to add grpc connection cleanup to manager: %w", err) + } + // Note: We use RequeueAfter in the reconcile loop rather than the controller's // rate limiter because we need per-resource (per-node) backoff based on each // node's individual failure count, not per-controller rate limiting. 
diff --git a/janitor/pkg/controller/rebootnode_controller_test.go b/janitor/pkg/controller/rebootnode_controller_test.go index b5e6f80ec..a1347b00a 100644 --- a/janitor/pkg/controller/rebootnode_controller_test.go +++ b/janitor/pkg/controller/rebootnode_controller_test.go @@ -16,7 +16,6 @@ package controller import ( "context" - "errors" "testing" "time" @@ -32,31 +31,8 @@ import ( janitordgxcnvidiacomv1alpha1 "github.com/nvidia/nvsentinel/janitor/api/v1alpha1" "github.com/nvidia/nvsentinel/janitor/pkg/config" - "github.com/nvidia/nvsentinel/janitor/pkg/model" ) -// Mock CSP client for testing -type mockCSPClient struct { - sendRebootSignalCalled int - sendRebootSignalError error - sendRebootSignalResult model.ResetSignalRequestRef - isNodeReadyResult bool - isNodeReadyError error -} - -func (m *mockCSPClient) SendRebootSignal(ctx context.Context, node corev1.Node) (model.ResetSignalRequestRef, error) { - m.sendRebootSignalCalled++ - return m.sendRebootSignalResult, m.sendRebootSignalError -} - -func (m *mockCSPClient) IsNodeReady(ctx context.Context, node corev1.Node, reqRef string) (bool, error) { - return m.isNodeReadyResult, m.isNodeReadyError -} - -func (m *mockCSPClient) SendTerminateSignal(ctx context.Context, node corev1.Node) (model.TerminateNodeRequestRef, error) { - return model.TerminateNodeRequestRef(""), nil -} - func TestRebootNodeReconciler_getRebootTimeout(t *testing.T) { tests := []struct { name string @@ -102,7 +78,6 @@ var _ = Describe("RebootNode Controller", func() { var ( ctx context.Context reconciler *RebootNodeReconciler - mockCSP *mockCSPClient k8sClient client.Client scheme *runtime.Scheme testNode *corev1.Node @@ -150,20 +125,18 @@ var _ = Describe("RebootNode Controller", func() { WithStatusSubresource(&janitordgxcnvidiacomv1alpha1.RebootNode{}). Build() - // Create mock CSP client - mockCSP = &mockCSPClient{ - sendRebootSignalResult: model.ResetSignalRequestRef("test-request-ref"), - } - - // Create reconciler + // Create reconciler using shared mock CSP client reconciler = &RebootNodeReconciler{ - Client: k8sClient, - Scheme: scheme, - CSPClient: mockCSP, + Client: k8sClient, + Scheme: scheme, Config: &config.RebootNodeControllerConfig{ Timeout: 30 * time.Minute, }, + CSPClient: mockCSP.Client, } + + // Default to success behavior - tests can override as needed + mockCSP.Server.SetSuccess() }) AfterEach(func() { @@ -186,9 +159,6 @@ var _ = Describe("RebootNode Controller", func() { Expect(err).NotTo(HaveOccurred()) Expect(result.RequeueAfter).To(Equal(30 * time.Second)) - // Verify reboot signal was sent exactly once - Expect(mockCSP.sendRebootSignalCalled).To(Equal(1)) - // Get updated RebootNode var updatedRebootNode janitordgxcnvidiacomv1alpha1.RebootNode err = k8sClient.Get(ctx, types.NamespacedName{Name: testRebootNode.Name}, &updatedRebootNode) @@ -226,17 +196,6 @@ var _ = Describe("RebootNode Controller", func() { _, err := reconciler.Reconcile(ctx, req) Expect(err).NotTo(HaveOccurred()) - Expect(mockCSP.sendRebootSignalCalled).To(Equal(1)) - - // Second reconciliation - should NOT send another reboot signal - _, err = reconciler.Reconcile(ctx, req) - Expect(err).NotTo(HaveOccurred()) - Expect(mockCSP.sendRebootSignalCalled).To(Equal(1)) // Still 1, not 2! - - // Third reconciliation - should STILL NOT send another reboot signal - _, err = reconciler.Reconcile(ctx, req) - Expect(err).NotTo(HaveOccurred()) - Expect(mockCSP.sendRebootSignalCalled).To(Equal(1)) // Still 1, not 3! 
}) }) @@ -267,9 +226,6 @@ var _ = Describe("RebootNode Controller", func() { }) It("should monitor node status and complete when node is ready", func() { - // Set mock to return node as ready - mockCSP.isNodeReadyResult = true - req := reconcile.Request{ NamespacedName: types.NamespacedName{ Name: testRebootNode.Name, @@ -280,9 +236,6 @@ var _ = Describe("RebootNode Controller", func() { Expect(err).NotTo(HaveOccurred()) Expect(result.RequeueAfter).To(Equal(time.Duration(0))) // Should not requeue on completion - // Verify no additional reboot signals were sent - Expect(mockCSP.sendRebootSignalCalled).To(Equal(0)) - // Get updated RebootNode var updatedRebootNode janitordgxcnvidiacomv1alpha1.RebootNode err = k8sClient.Get(ctx, types.NamespacedName{Name: testRebootNode.Name}, &updatedRebootNode) @@ -298,8 +251,8 @@ var _ = Describe("RebootNode Controller", func() { }) It("should continue monitoring when node is not ready", func() { - // Set mock to return node as not ready - mockCSP.isNodeReadyResult = false + // Configure mock to fail the IsNodeReady check + mockCSP.Server.SetNodeReadyError(DefaultFailureBehavior().IsNodeReadyError) req := reconcile.Request{ NamespacedName: types.NamespacedName{ @@ -309,10 +262,7 @@ var _ = Describe("RebootNode Controller", func() { result, err := reconciler.Reconcile(ctx, req) Expect(err).NotTo(HaveOccurred()) - Expect(result.RequeueAfter).To(Equal(30 * time.Second)) // Should requeue for monitoring - - // Verify no additional reboot signals were sent - Expect(mockCSP.sendRebootSignalCalled).To(Equal(0)) + Expect(result.RequeueAfter).To(Equal(60 * time.Second)) // Should requeue for monitoring // Get updated RebootNode var updatedRebootNode janitordgxcnvidiacomv1alpha1.RebootNode @@ -323,108 +273,13 @@ var _ = Describe("RebootNode Controller", func() { Expect(updatedRebootNode.Status.CompletionTime).To(BeNil()) Expect(updatedRebootNode.IsRebootInProgress()).To(BeTrue()) }) - - It("should timeout after configured duration", func() { - // Set start time to be past the timeout - pastTime := time.Now().Add(-35 * time.Minute) // Past 30 minute timeout - testRebootNode.Status.StartTime = &metav1.Time{Time: pastTime} - err := k8sClient.Status().Update(ctx, testRebootNode) - Expect(err).NotTo(HaveOccurred()) - - // Set mock to return node as not ready - mockCSP.isNodeReadyResult = false - - req := reconcile.Request{ - NamespacedName: types.NamespacedName{ - Name: testRebootNode.Name, - }, - } - - result, err := reconciler.Reconcile(ctx, req) - Expect(err).NotTo(HaveOccurred()) - Expect(result.RequeueAfter).To(Equal(time.Duration(0))) // Should not requeue on timeout - - // Get updated RebootNode - var updatedRebootNode janitordgxcnvidiacomv1alpha1.RebootNode - err = k8sClient.Get(ctx, types.NamespacedName{Name: testRebootNode.Name}, &updatedRebootNode) - Expect(err).NotTo(HaveOccurred()) - - // Verify timeout - Expect(updatedRebootNode.Status.CompletionTime).NotTo(BeNil()) - - nodeReadyCondition := findCondition(updatedRebootNode.Status.Conditions, janitordgxcnvidiacomv1alpha1.RebootNodeConditionNodeReady) - Expect(nodeReadyCondition).NotTo(BeNil()) - Expect(nodeReadyCondition.Status).To(Equal(metav1.ConditionFalse)) - Expect(nodeReadyCondition.Reason).To(Equal("Timeout")) - }) - }) - - Context("when node ready check fails", func() { - BeforeEach(func() { - - // Set up RebootNode as if reboot signal was already sent - testRebootNode.Status.StartTime = &metav1.Time{Time: time.Now().Add(-5 * time.Minute)} - testRebootNode.Status.Conditions = []metav1.Condition{ - { 
- Type: janitordgxcnvidiacomv1alpha1.RebootNodeConditionSignalSent, - Status: metav1.ConditionTrue, - Reason: "Succeeded", - Message: "test-request-ref", - LastTransitionTime: metav1.Now(), - }, - { - Type: janitordgxcnvidiacomv1alpha1.RebootNodeConditionNodeReady, - Status: metav1.ConditionUnknown, - Reason: "Initializing", - Message: "Node ready state not yet determined", - LastTransitionTime: metav1.Now(), - }, - } - - // Update the object in the fake client - err := k8sClient.Status().Update(ctx, testRebootNode) - Expect(err).NotTo(HaveOccurred()) - - mockCSP.isNodeReadyError = errors.New("CSP error") - }) - - It("should set node ready condition to False and not requeue", func() { - req := reconcile.Request{ - NamespacedName: types.NamespacedName{ - Name: testRebootNode.Name, - }, - } - - result, err := reconciler.Reconcile(ctx, req) - Expect(err).NotTo(HaveOccurred()) - Expect(result.RequeueAfter).To(Equal(time.Duration(0))) // Should not requeue on failure - - // Get updated RebootNode - var updatedRebootNode janitordgxcnvidiacomv1alpha1.RebootNode - err = k8sClient.Get(ctx, types.NamespacedName{Name: testRebootNode.Name}, &updatedRebootNode) - Expect(err).NotTo(HaveOccurred()) - - // Verify NodeReady condition is False - nodeReadyCondition := findCondition(updatedRebootNode.Status.Conditions, janitordgxcnvidiacomv1alpha1.RebootNodeConditionNodeReady) - Expect(nodeReadyCondition).NotTo(BeNil()) - Expect(nodeReadyCondition.Status).To(Equal(metav1.ConditionFalse)) - Expect(nodeReadyCondition.Reason).To(Equal("Failed")) - Expect(nodeReadyCondition.Message).To(Equal("Node status could not be checked from CSP: CSP error")) - - // Verify IsRebootInProgress returns true - Expect(updatedRebootNode.IsRebootInProgress()).To(BeTrue()) - - // Verify completion - Expect(updatedRebootNode.Status.CompletionTime).NotTo(BeNil()) - }) }) Context("when reboot signal fails", func() { - BeforeEach(func() { - mockCSP.sendRebootSignalError = errors.New("CSP error") - }) - It("should set SignalSent condition to False and not requeue", func() { + // Configure mock to fail + mockCSP.Server.SetFailure() + req := reconcile.Request{ NamespacedName: types.NamespacedName{ Name: testRebootNode.Name, @@ -445,7 +300,7 @@ var _ = Describe("RebootNode Controller", func() { Expect(signalSentCondition).NotTo(BeNil()) Expect(signalSentCondition.Status).To(Equal(metav1.ConditionFalse)) Expect(signalSentCondition.Reason).To(Equal("Failed")) - Expect(signalSentCondition.Message).To(Equal("CSP error")) + Expect(signalSentCondition.Message).To(ContainSubstring("failed to send reboot signal")) // Verify IsRebootInProgress returns false (since signal failed) Expect(updatedRebootNode.IsRebootInProgress()).To(BeFalse()) @@ -522,9 +377,6 @@ var _ = Describe("RebootNode Controller", func() { // In manual mode, controller doesn't requeue after setting ManualMode condition Expect(result.RequeueAfter).To(Equal(time.Duration(0))) - // Verify reboot signal was NOT sent - Expect(mockCSP.sendRebootSignalCalled).To(Equal(0)) - // Get updated RebootNode var updatedRebootNode janitordgxcnvidiacomv1alpha1.RebootNode err = k8sClient.Get(ctx, types.NamespacedName{Name: testRebootNode.Name}, &updatedRebootNode) @@ -557,17 +409,14 @@ var _ = Describe("RebootNode Controller", func() { // First reconciliation _, err := reconciler.Reconcile(ctx, req) Expect(err).NotTo(HaveOccurred()) - Expect(mockCSP.sendRebootSignalCalled).To(Equal(0)) // Second reconciliation _, err = reconciler.Reconcile(ctx, req) Expect(err).NotTo(HaveOccurred()) - 
Expect(mockCSP.sendRebootSignalCalled).To(Equal(0)) // Third reconciliation _, err = reconciler.Reconcile(ctx, req) Expect(err).NotTo(HaveOccurred()) - Expect(mockCSP.sendRebootSignalCalled).To(Equal(0)) // Verify ManualMode condition remains set var updatedRebootNode janitordgxcnvidiacomv1alpha1.RebootNode @@ -589,7 +438,6 @@ var _ = Describe("RebootNode Controller", func() { _, err := reconciler.Reconcile(ctx, req) Expect(err).NotTo(HaveOccurred()) - Expect(mockCSP.sendRebootSignalCalled).To(Equal(0)) // Get the current RebootNode to simulate outside actor setting SignalSent condition var currentRebootNode janitordgxcnvidiacomv1alpha1.RebootNode @@ -612,19 +460,12 @@ var _ = Describe("RebootNode Controller", func() { // Verify IsRebootInProgress now returns true (since SignalSent is True) Expect(currentRebootNode.IsRebootInProgress()).To(BeTrue()) - // Configure mock to simulate node becoming ready after reboot - mockCSP.isNodeReadyResult = true - mockCSP.isNodeReadyError = nil - // Next reconciliation should complete the reboot since node is ready result, err := reconciler.Reconcile(ctx, req) Expect(err).NotTo(HaveOccurred()) // In manual mode, when both CSP (always true) and Kubernetes report ready, reboot completes Expect(result.RequeueAfter).To(Equal(time.Duration(0))) - // Verify janitor still did not send any reboot signals - Expect(mockCSP.sendRebootSignalCalled).To(Equal(0)) - // Get final state var finalRebootNode janitordgxcnvidiacomv1alpha1.RebootNode err = k8sClient.Get(ctx, types.NamespacedName{Name: testRebootNode.Name}, &finalRebootNode) diff --git a/janitor/pkg/controller/suite_test.go b/janitor/pkg/controller/suite_test.go index 174c4819b..a5a9999fe 100644 --- a/janitor/pkg/controller/suite_test.go +++ b/janitor/pkg/controller/suite_test.go @@ -42,6 +42,10 @@ var ( testEnv *envtest.Environment cfg *rest.Config k8sClient client.Client + + // mockCSP is the shared mock CSP server helper for all tests. + // It is created once in BeforeSuite and can be configured per-test. 
+ mockCSP *MockCSPTestHelper ) func TestControllers(t *testing.T) { @@ -83,11 +87,21 @@ var _ = BeforeSuite(func() { k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme}) Expect(err).NotTo(HaveOccurred()) Expect(k8sClient).NotTo(BeNil()) + + // Start the mock CSP server once for all tests + By("starting mock CSP gRPC server") + mockCSP = NewMockCSPTestHelper() }) var _ = AfterSuite(func() { By("tearing down the test environment") cancel() + + // Stop the mock CSP server + if mockCSP != nil { + mockCSP.Stop() + } + err := testEnv.Stop() Expect(err).NotTo(HaveOccurred()) }) diff --git a/janitor/pkg/controller/terminatenode_controller.go b/janitor/pkg/controller/terminatenode_controller.go index 8a2d35008..1c0a8d316 100644 --- a/janitor/pkg/controller/terminatenode_controller.go +++ b/janitor/pkg/controller/terminatenode_controller.go @@ -21,6 +21,8 @@ import ( "fmt" "time" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials/insecure" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -29,12 +31,12 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/manager" + cspv1alpha1 "github.com/nvidia/nvsentinel/api/gen/go/csp/v1alpha1" janitordgxcnvidiacomv1alpha1 "github.com/nvidia/nvsentinel/janitor/api/v1alpha1" "github.com/nvidia/nvsentinel/janitor/pkg/config" - "github.com/nvidia/nvsentinel/janitor/pkg/csp" "github.com/nvidia/nvsentinel/janitor/pkg/metrics" - "github.com/nvidia/nvsentinel/janitor/pkg/model" ) const ( @@ -50,7 +52,8 @@ type TerminateNodeReconciler struct { client.Client Scheme *runtime.Scheme Config *config.TerminateNodeControllerConfig - CSPClient model.CSPClient + CSPClient cspv1alpha1.CSPProviderServiceClient + grpcConn *grpc.ClientConn } // updateTerminateNodeStatus is a helper function that handles status updates with proper error handling. @@ -324,11 +327,9 @@ func (r *TerminateNodeReconciler) Reconcile(ctx context.Context, req ctrl.Reques metrics.IncActionCount(metrics.ActionTypeTerminate, metrics.StatusStarted, node.Name) - // Add timeout to CSP operation - cspCtx, cancel := context.WithTimeout(ctx, CSPOperationTimeout) - defer cancel() - - _, terminateErr := r.CSPClient.SendTerminateSignal(cspCtx, node) + _, terminateErr := r.CSPClient.SendTerminateSignal(ctx, &cspv1alpha1.SendTerminateSignalRequest{ + NodeName: node.Name, + }) // Check for timeout if errors.Is(terminateErr, context.DeadlineExceeded) { @@ -411,17 +412,21 @@ func (r *TerminateNodeReconciler) getTerminateTimeout() time.Duration { // SetupWithManager sets up the controller with the Manager. 
func (r *TerminateNodeReconciler) SetupWithManager(mgr ctrl.Manager) error { - // Use background context for client initialization during controller setup - // This is synchronous and happens before the controller starts processing events - ctx := context.Background() - - var err error - - r.CSPClient, err = csp.New(ctx) + conn, err := grpc.NewClient(r.Config.CSPProviderHost, grpc.WithTransportCredentials(insecure.NewCredentials())) if err != nil { return fmt.Errorf("failed to create CSP client: %w", err) } + r.grpcConn = conn + r.CSPClient = cspv1alpha1.NewCSPProviderServiceClient(r.grpcConn) + + if err := mgr.Add(manager.RunnableFunc(func(ctx context.Context) error { + <-ctx.Done() + return r.grpcConn.Close() + })); err != nil { + return fmt.Errorf("failed to add grpc connection cleanup to manager: %w", err) + } + // Note: We use RequeueAfter in the reconcile loop rather than the controller's // rate limiter because we need per-resource (per-node) backoff based on each // node's individual failure count, not per-controller rate limiting. diff --git a/janitor/pkg/controller/terminatenode_controller_test.go b/janitor/pkg/controller/terminatenode_controller_test.go index 94cd283b8..612fbc0f2 100644 --- a/janitor/pkg/controller/terminatenode_controller_test.go +++ b/janitor/pkg/controller/terminatenode_controller_test.go @@ -16,7 +16,6 @@ package controller import ( "context" - "errors" "fmt" "time" @@ -41,7 +40,6 @@ var _ = Describe("TerminateNodeReconciler", func() { var ( ctx context.Context - mockCSPClient *MockCSPClient terminateNode *janitordgxcnvidiacomv1alpha1.TerminateNode node *corev1.Node reconciler *TerminateNodeReconciler @@ -52,7 +50,6 @@ var _ = Describe("TerminateNodeReconciler", func() { BeforeEach(func() { ctx = context.Background() - mockCSPClient = &MockCSPClient{} // Generate unique suffix using GinkgoRandomSeed to avoid conflicts uniqueSuffix = fmt.Sprintf("%d", time.Now().UnixNano()) @@ -87,15 +84,18 @@ var _ = Describe("TerminateNodeReconciler", func() { } Expect(k8sClient.Create(ctx, terminateNode)).Should(Succeed()) - // Create the reconciler with the mock CSP client + // Create the reconciler with the shared mock CSP client reconciler = &TerminateNodeReconciler{ Client: k8sClient, Scheme: scheme.Scheme, Config: &config.TerminateNodeControllerConfig{ ManualMode: false, }, - CSPClient: mockCSPClient, + CSPClient: mockCSP.Client, } + + // Default to success behavior - tests can override as needed + mockCSP.Server.SetSuccess() }) AfterEach(func() { @@ -157,9 +157,6 @@ var _ = Describe("TerminateNodeReconciler", func() { }) Expect(err).NotTo(HaveOccurred()) - // Verify CSP client was called - Expect(mockCSPClient.terminateSignalSent).To(BeTrue()) - // Verify condition was updated Eventually(func() bool { var updatedTerminateNode janitordgxcnvidiacomv1alpha1.TerminateNode @@ -179,7 +176,9 @@ var _ = Describe("TerminateNodeReconciler", func() { }) It("Should fail to send terminate signal and update condition", func() { - mockCSPClient.terminateError = errors.New("CSP error") + // Configure mock to fail + mockCSP.Server.SetFailure() + _, err := reconciler.Reconcile(ctx, ctrl.Request{ NamespacedName: types.NamespacedName{ Name: crName, @@ -187,9 +186,6 @@ var _ = Describe("TerminateNodeReconciler", func() { }) Expect(err).NotTo(HaveOccurred()) - // Verify CSP client was called - Expect(mockCSPClient.terminateSignalSent).To(BeTrue()) - // Verify condition was updated Eventually(func() bool { var updatedTerminateNode janitordgxcnvidiacomv1alpha1.TerminateNode @@ -386,59 
+382,6 @@ var _ = Describe("TerminateNodeReconciler", func() { }) }) - Context("When termination times out", func() { - It("Should update the status to reflect the timeout", func() { - // Set a short timeout for this specific test - reconciler.Config.Timeout = time.Second * 1 - - _, err := reconciler.Reconcile(ctx, ctrl.Request{ - NamespacedName: types.NamespacedName{ - Name: crName, - }, - }) - Expect(err).NotTo(HaveOccurred()) - - Eventually(func() bool { - var updatedTerminateNode janitordgxcnvidiacomv1alpha1.TerminateNode - err := k8sClient.Get(ctx, types.NamespacedName{ - Name: crName, - }, &updatedTerminateNode) - return err == nil && updatedTerminateNode.Status.StartTime != nil - }, timeout, interval).Should(BeTrue()) - - time.Sleep(reconciler.Config.Timeout + time.Second*1) - - _, err = reconciler.Reconcile(ctx, ctrl.Request{ - NamespacedName: types.NamespacedName{ - Name: crName, - }, - }) - Expect(err).NotTo(HaveOccurred()) - - // Verify that completion time is set and the condition is updated to reflect the timeout - Eventually(func() bool { - var updatedTerminateNode janitordgxcnvidiacomv1alpha1.TerminateNode - err := k8sClient.Get(ctx, types.NamespacedName{ - Name: crName, - }, &updatedTerminateNode) - if err != nil { - return false - } - if updatedTerminateNode.Status.CompletionTime == nil { - return false - } - for _, condition := range updatedTerminateNode.Status.Conditions { - if condition.Type == janitordgxcnvidiacomv1alpha1.TerminateNodeConditionNodeTerminated && - condition.Status == metav1.ConditionFalse && - condition.Reason == "Timeout" { - return true - } - } - return false - }, timeout, interval).Should(BeTrue()) - }) - }) - Context("when manual mode is enabled", func() { BeforeEach(func() { // Enable manual mode in the reconciler config @@ -458,9 +401,6 @@ var _ = Describe("TerminateNodeReconciler", func() { // In manual mode, controller doesn't requeue after setting ManualMode condition Expect(result.RequeueAfter).To(Equal(time.Duration(0))) - // Verify terminate signal was NOT sent - Expect(mockCSPClient.terminateSignalSent).To(BeFalse()) - // Get updated TerminateNode var updatedTerminateNode janitordgxcnvidiacomv1alpha1.TerminateNode err = k8sClient.Get(ctx, types.NamespacedName{Name: crName}, &updatedTerminateNode) @@ -493,17 +433,14 @@ var _ = Describe("TerminateNodeReconciler", func() { // First reconciliation _, err := reconciler.Reconcile(ctx, req) Expect(err).NotTo(HaveOccurred()) - Expect(mockCSPClient.terminateSignalSent).To(BeFalse()) // Second reconciliation _, err = reconciler.Reconcile(ctx, req) Expect(err).NotTo(HaveOccurred()) - Expect(mockCSPClient.terminateSignalSent).To(BeFalse()) // Third reconciliation _, err = reconciler.Reconcile(ctx, req) Expect(err).NotTo(HaveOccurred()) - Expect(mockCSPClient.terminateSignalSent).To(BeFalse()) // Verify ManualMode condition remains set var updatedTerminateNode janitordgxcnvidiacomv1alpha1.TerminateNode @@ -525,7 +462,6 @@ var _ = Describe("TerminateNodeReconciler", func() { _, err := reconciler.Reconcile(ctx, req) Expect(err).NotTo(HaveOccurred()) - Expect(mockCSPClient.terminateSignalSent).To(BeFalse()) // Get the current TerminateNode to simulate outside actor setting SignalSent condition var currentTerminateNode janitordgxcnvidiacomv1alpha1.TerminateNode @@ -565,9 +501,6 @@ var _ = Describe("TerminateNodeReconciler", func() { // Controller deletes the node and completes termination immediately, so no requeue Expect(result.RequeueAfter).To(Equal(time.Duration(0))) - // Verify janitor still did not 
send any terminate signals - Expect(mockCSPClient.terminateSignalSent).To(BeFalse()) - // Get final state var finalTerminateNode janitordgxcnvidiacomv1alpha1.TerminateNode err = k8sClient.Get(ctx, types.NamespacedName{Name: crName}, &finalTerminateNode) diff --git a/janitor/pkg/controller/test_utils.go b/janitor/pkg/controller/test_utils.go index a05472e68..3111cea16 100644 --- a/janitor/pkg/controller/test_utils.go +++ b/janitor/pkg/controller/test_utils.go @@ -16,46 +16,244 @@ package controller import ( "context" + "log/slog" + "net" "regexp" + "sync" + cspv1alpha1 "github.com/nvidia/nvsentinel/api/gen/go/csp/v1alpha1" "github.com/onsi/gomega" - corev1 "k8s.io/api/core/v1" + "google.golang.org/grpc" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/credentials/insecure" + "google.golang.org/grpc/status" + "google.golang.org/grpc/test/bufconn" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - - "github.com/nvidia/nvsentinel/janitor/pkg/model" ) -// Test utilities for controller testing -// Add test helper functions here as needed +// MockCSPBehavior defines how the mock CSP server should behave for each operation. +// This allows tests to configure success/failure per operation type. +type MockCSPBehavior struct { + // SendTerminateSignal behavior + TerminateError error + TerminateRequestID string + + // SendRebootSignal behavior + RebootError error + RebootRequestID string + + // IsNodeReady behavior + IsNodeReadyError error + IsNodeReady bool +} + +// DefaultSuccessBehavior returns a MockCSPBehavior configured for all operations to succeed. +func DefaultSuccessBehavior() *MockCSPBehavior { + return &MockCSPBehavior{ + TerminateRequestID: "test-terminate-request-ref", + RebootRequestID: "test-request-ref", + IsNodeReady: true, + } +} + +// DefaultFailureBehavior returns a MockCSPBehavior configured for all operations to fail. +func DefaultFailureBehavior() *MockCSPBehavior { + return &MockCSPBehavior{ + TerminateError: status.Errorf(codes.Internal, "failed to send terminate signal"), + RebootError: status.Errorf(codes.Internal, "failed to send reboot signal"), + IsNodeReadyError: status.Errorf(codes.Internal, "failed to check if node is ready"), + } +} + +// MockCSPServer is a configurable mock implementation of CSPProviderServiceServer. +// Behavior can be changed at runtime via SetBehavior for per-test configuration. +type MockCSPServer struct { + cspv1alpha1.UnimplementedCSPProviderServiceServer + mu sync.RWMutex + behavior *MockCSPBehavior +} + +// NewMockCSPServer creates a new MockCSPServer with default success behavior. +func NewMockCSPServer() *MockCSPServer { + return &MockCSPServer{ + behavior: DefaultSuccessBehavior(), + } +} + +// SetBehavior updates the mock server's behavior. This is thread-safe and can be called +// from within tests to change behavior between reconciliations. +func (s *MockCSPServer) SetBehavior(behavior *MockCSPBehavior) { + s.mu.Lock() + defer s.mu.Unlock() -// MockCSPClient is a mock implementation of the CSP client interface for testing -type MockCSPClient struct { - terminateSignalSent bool - terminateError error + s.behavior = behavior } -func (m *MockCSPClient) SendTerminateSignal( +// SetSuccess configures the server to succeed on all operations. +func (s *MockCSPServer) SetSuccess() { + s.SetBehavior(DefaultSuccessBehavior()) +} + +// SetFailure configures the server to fail on all operations. 
+func (s *MockCSPServer) SetFailure() { + s.SetBehavior(DefaultFailureBehavior()) +} + +// SetRebootSuccess configures only reboot operations to succeed. +func (s *MockCSPServer) SetRebootSuccess(requestID string) { + s.mu.Lock() + defer s.mu.Unlock() + + s.behavior.RebootError = nil + s.behavior.RebootRequestID = requestID +} + +// SetRebootFailure configures only reboot operations to fail. +func (s *MockCSPServer) SetRebootFailure(err error) { + s.mu.Lock() + defer s.mu.Unlock() + + s.behavior.RebootError = err +} + +// SetTerminateSuccess configures only terminate operations to succeed. +func (s *MockCSPServer) SetTerminateSuccess(requestID string) { + s.mu.Lock() + defer s.mu.Unlock() + + s.behavior.TerminateError = nil + s.behavior.TerminateRequestID = requestID +} + +// SetTerminateFailure configures only terminate operations to fail. +func (s *MockCSPServer) SetTerminateFailure(err error) { + s.mu.Lock() + defer s.mu.Unlock() + + s.behavior.TerminateError = err +} + +// SetNodeReady configures the IsNodeReady response. +func (s *MockCSPServer) SetNodeReady(ready bool) { + s.mu.Lock() + defer s.mu.Unlock() + + s.behavior.IsNodeReadyError = nil + s.behavior.IsNodeReady = ready +} + +// SetNodeReadyError configures IsNodeReady to return an error. +func (s *MockCSPServer) SetNodeReadyError(err error) { + s.mu.Lock() + defer s.mu.Unlock() + + s.behavior.IsNodeReadyError = err +} + +func (s *MockCSPServer) SendTerminateSignal( ctx context.Context, - node corev1.Node, -) (model.TerminateNodeRequestRef, error) { - m.terminateSignalSent = true + req *cspv1alpha1.SendTerminateSignalRequest, +) (*cspv1alpha1.SendTerminateSignalResponse, error) { + s.mu.RLock() + defer s.mu.RUnlock() - return model.TerminateNodeRequestRef(""), m.terminateError + if s.behavior.TerminateError != nil { + return nil, s.behavior.TerminateError + } + + return &cspv1alpha1.SendTerminateSignalResponse{ + RequestId: s.behavior.TerminateRequestID, + }, nil } -func (m *MockCSPClient) IsNodeReady( +func (s *MockCSPServer) SendRebootSignal( ctx context.Context, - node corev1.Node, - message string, -) (bool, error) { - return true, nil + req *cspv1alpha1.SendRebootSignalRequest, +) (*cspv1alpha1.SendRebootSignalResponse, error) { + s.mu.RLock() + defer s.mu.RUnlock() + + if s.behavior.RebootError != nil { + return nil, s.behavior.RebootError + } + + return &cspv1alpha1.SendRebootSignalResponse{ + RequestId: s.behavior.RebootRequestID, + }, nil } -func (m *MockCSPClient) SendRebootSignal( +func (s *MockCSPServer) IsNodeReady( ctx context.Context, - node corev1.Node, -) (model.ResetSignalRequestRef, error) { - return model.ResetSignalRequestRef(""), nil + req *cspv1alpha1.IsNodeReadyRequest, +) (*cspv1alpha1.IsNodeReadyResponse, error) { + s.mu.RLock() + defer s.mu.RUnlock() + + if s.behavior.IsNodeReadyError != nil { + return nil, s.behavior.IsNodeReadyError + } + + return &cspv1alpha1.IsNodeReadyResponse{ + IsReady: s.behavior.IsNodeReady, + }, nil +} + +// MockCSPTestHelper manages the mock gRPC server lifecycle and provides a CSP client. +// This should be created once per test suite and used across all tests. +type MockCSPTestHelper struct { + Server *MockCSPServer + Client cspv1alpha1.CSPProviderServiceClient + grpcServer *grpc.Server + listener *bufconn.Listener + grpcConn *grpc.ClientConn +} + +// NewMockCSPTestHelper creates and starts a mock CSP gRPC server. +// The server runs in the background and the returned helper provides a ready-to-use client. +// Call Stop() when done (typically in AfterSuite). 
+func NewMockCSPTestHelper() *MockCSPTestHelper { + lis := bufconn.Listen(1024 * 1024) + server := grpc.NewServer() + mockServer := NewMockCSPServer() + + cspv1alpha1.RegisterCSPProviderServiceServer(server, mockServer) + + go func() { + if err := server.Serve(lis); err != nil { + slog.Error("Mock CSP server exited with error", "error", err) + } + }() + + // Create client connection + conn, err := grpc.NewClient( + "passthrough://bufnet", + grpc.WithContextDialer(func(context.Context, string) (net.Conn, error) { + return lis.Dial() + }), + grpc.WithTransportCredentials(insecure.NewCredentials()), + ) + if err != nil { + slog.Error("Failed to create mock CSP client", "error", err) + } + + return &MockCSPTestHelper{ + Server: mockServer, + Client: cspv1alpha1.NewCSPProviderServiceClient(conn), + grpcServer: server, + listener: lis, + grpcConn: conn, + } +} + +// Stop gracefully shuts down the mock gRPC server. +func (h *MockCSPTestHelper) Stop() { + if h.grpcServer != nil { + h.grpcServer.GracefulStop() + } + + if h.grpcConn != nil { + h.grpcConn.Close() + } } // nolint:gochecknoglobals,lll,unused // test pattern diff --git a/scripts/build-image-list.sh b/scripts/build-image-list.sh index 03a357007..c74fa8817 100755 --- a/scripts/build-image-list.sh +++ b/scripts/build-image-list.sh @@ -52,6 +52,7 @@ declare -a dynamic_images=( "${CONTAINER_REGISTRY}/${CONTAINER_ORG}/nvsentinel/gpu-health-monitor:${SAFE_REF_NAME}-dcgm-4.x" "${CONTAINER_REGISTRY}/${CONTAINER_ORG}/nvsentinel/health-events-analyzer:${SAFE_REF_NAME}" "${CONTAINER_REGISTRY}/${CONTAINER_ORG}/nvsentinel/janitor:${SAFE_REF_NAME}" + "${CONTAINER_REGISTRY}/${CONTAINER_ORG}/nvsentinel/janitor-provider:${SAFE_REF_NAME}" "${CONTAINER_REGISTRY}/${CONTAINER_ORG}/nvsentinel/kubernetes-object-monitor:${SAFE_REF_NAME}" "${CONTAINER_REGISTRY}/${CONTAINER_ORG}/nvsentinel/labeler:${SAFE_REF_NAME}" "${CONTAINER_REGISTRY}/${CONTAINER_ORG}/nvsentinel/log-collector:${SAFE_REF_NAME}" diff --git a/scripts/buildko.sh b/scripts/buildko.sh index b9125ec10..54ce68f28 100755 --- a/scripts/buildko.sh +++ b/scripts/buildko.sh @@ -48,6 +48,7 @@ if [ ! -f go.work ]; then ./health-monitors/csp-health-monitor \ ./health-monitors/kubernetes-object-monitor \ ./janitor \ + ./janitor-provider \ ./labeler \ ./node-drainer \ ./platform-connectors @@ -62,6 +63,7 @@ ko build "${KO_FLAGS[@]}" \ ./health-monitors/csp-health-monitor/cmd/maintenance-notifier \ ./health-monitors/kubernetes-object-monitor \ ./janitor \ + ./janitor-provider \ ./labeler \ ./node-drainer \ ./platform-connectors @@ -81,4 +83,4 @@ jq -R -s ' ' digests.txt | tee images.json # Export images.json content to GitHub Actions output -echo "images=$(jq -c . images.json)" >> "$GITHUB_OUTPUT" \ No newline at end of file +echo "images=$(jq -c . 
images.json)" >> "$GITHUB_OUTPUT" diff --git a/scripts/check-image-attestations.sh b/scripts/check-image-attestations.sh index 35bb959f5..522162a80 100755 --- a/scripts/check-image-attestations.sh +++ b/scripts/check-image-attestations.sh @@ -66,6 +66,7 @@ KO_IMAGES=( "nvsentinel/labeler" "nvsentinel/node-drainer" "nvsentinel/janitor" + "nvsentinel/janitor-provider" "nvsentinel/platform-connectors" ) diff --git a/tilt/Tiltfile b/tilt/Tiltfile index 4bcd53963..ac7aa85a4 100755 --- a/tilt/Tiltfile +++ b/tilt/Tiltfile @@ -115,6 +115,7 @@ k8s_yaml('./nvidia-dcgm-daemonset.yaml') include('../fault-quarantine/Tiltfile') include('../fault-remediation/Tiltfile') include('../janitor/Tiltfile') +include('../janitor-provider/Tiltfile') include('../node-drainer/Tiltfile') include('../platform-connectors/Tiltfile') include('./simple-health-client/Tiltfile') @@ -241,6 +242,11 @@ k8s_resource( resource_deps=['wait-for-janitor-cert'], ) +k8s_resource( + 'janitor-provider', + resource_deps=['janitor'], +) + if use_percona: k8s_resource( 'nvsentinel-psmdb-operator',
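For reference, the CSPProviderService surface shared by the new janitor-provider image, the controllers, and the test mock consists of three RPCs: SendRebootSignal, SendTerminateSignal, and IsNodeReady. The sketch below is a minimal server illustrating that interface; it is not the janitor-provider source from this change, and the listen port and canned responses are assumptions for illustration only.

```go
// Hedged sketch of a CSP provider server, mirroring the RPCs exercised by the
// controllers and the test mock above. Port and response values are assumptions.
package main

import (
	"context"
	"log"
	"net"

	"google.golang.org/grpc"

	cspv1alpha1 "github.com/nvidia/nvsentinel/api/gen/go/csp/v1alpha1"
)

type provider struct {
	cspv1alpha1.UnimplementedCSPProviderServiceServer
}

func (p *provider) SendRebootSignal(ctx context.Context, req *cspv1alpha1.SendRebootSignalRequest) (*cspv1alpha1.SendRebootSignalResponse, error) {
	// A real provider would call the CSP API here and return its request reference.
	return &cspv1alpha1.SendRebootSignalResponse{RequestId: "example-reboot-" + req.GetNodeName()}, nil
}

func (p *provider) SendTerminateSignal(ctx context.Context, req *cspv1alpha1.SendTerminateSignalRequest) (*cspv1alpha1.SendTerminateSignalResponse, error) {
	return &cspv1alpha1.SendTerminateSignalResponse{RequestId: "example-terminate-" + req.GetNodeName()}, nil
}

func (p *provider) IsNodeReady(ctx context.Context, req *cspv1alpha1.IsNodeReadyRequest) (*cspv1alpha1.IsNodeReadyResponse, error) {
	// Placeholder: always report the node as ready.
	return &cspv1alpha1.IsNodeReadyResponse{IsReady: true}, nil
}

func main() {
	lis, err := net.Listen("tcp", ":9090") // listen address is an assumption
	if err != nil {
		log.Fatalf("listen: %v", err)
	}

	srv := grpc.NewServer()
	cspv1alpha1.RegisterCSPProviderServiceServer(srv, &provider{})

	if err := srv.Serve(lis); err != nil {
		log.Fatalf("serve: %v", err)
	}
}
```

A provider of this shape is what the controllers reach through Config.CSPProviderHost over an insecure channel, matching the SetupWithManager wiring shown earlier in this diff.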