Commit

remaining updates
jmdeal committed Feb 10, 2025
1 parent ec25007 commit f4ba729
Showing 8 changed files with 36 additions and 22 deletions.
2 changes: 1 addition & 1 deletion kwok/apis/crds/karpenter.kwok.sh_kwoknodeclasses.yaml
@@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
-controller-gen.kubebuilder.io/version: v0.17.1
+controller-gen.kubebuilder.io/version: v0.17.2
name: kwoknodeclasses.karpenter.kwok.sh
spec:
group: karpenter.kwok.sh
2 changes: 1 addition & 1 deletion kwok/charts/crds/karpenter.sh_nodeclaims.yaml
@@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
-controller-gen.kubebuilder.io/version: v0.17.1
+controller-gen.kubebuilder.io/version: v0.17.2
name: nodeclaims.karpenter.sh
spec:
group: karpenter.sh
2 changes: 1 addition & 1 deletion kwok/charts/crds/karpenter.sh_nodepools.yaml
@@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
-controller-gen.kubebuilder.io/version: v0.17.1
+controller-gen.kubebuilder.io/version: v0.17.2
name: nodepools.karpenter.sh
spec:
group: karpenter.sh
2 changes: 1 addition & 1 deletion pkg/apis/crds/karpenter.sh_nodeclaims.yaml
@@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
-controller-gen.kubebuilder.io/version: v0.17.1
+controller-gen.kubebuilder.io/version: v0.17.2
name: nodeclaims.karpenter.sh
spec:
group: karpenter.sh
2 changes: 1 addition & 1 deletion pkg/apis/crds/karpenter.sh_nodepools.yaml
@@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
-controller-gen.kubebuilder.io/version: v0.17.1
+controller-gen.kubebuilder.io/version: v0.17.2
name: nodepools.karpenter.sh
spec:
group: karpenter.sh
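The CRD manifests above are generated output: controller-gen reads kubebuilder markers on the Go API types and records its own version in the controller-gen.kubebuilder.io/version annotation, which is why bumping the tool from v0.17.1 to v0.17.2 touches every generated file while leaving the schemas untouched. A generic, hypothetical sketch of the kind of marked-up Go type that drives this generation (not the actual Karpenter API types):

```go
// Hypothetical API type illustrating how CRDs like the ones above are produced:
// controller-gen reads the kubebuilder markers below and renders a CRD manifest,
// stamping its version into the controller-gen.kubebuilder.io/version annotation.
package v1alpha1

import metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

// ExampleNodeClass is a placeholder resource, not a real Karpenter type.
// +kubebuilder:object:root=true
// +kubebuilder:resource:scope=Cluster
// +kubebuilder:subresource:status
type ExampleNodeClass struct {
	metav1.TypeMeta   `json:",inline"`
	metav1.ObjectMeta `json:"metadata,omitempty"`

	Spec   ExampleNodeClassSpec   `json:"spec,omitempty"`
	Status ExampleNodeClassStatus `json:"status,omitempty"`
}

// ExampleNodeClassSpec shows a marker-driven validation rule.
type ExampleNodeClassSpec struct {
	// +kubebuilder:validation:Required
	Role string `json:"role"`
}

// ExampleNodeClassStatus carries standard metav1 conditions.
type ExampleNodeClassStatus struct {
	Conditions []metav1.Condition `json:"conditions,omitempty"`
}
```

Because only the generator version changed, each regenerated manifest in this commit differs by exactly that one annotation line.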
35 changes: 24 additions & 11 deletions pkg/controllers/node/termination/controller.go
@@ -101,22 +101,22 @@ func (c *Controller) finalize(ctx context.Context, node *corev1.Node) (reconcile
// the nodeclaim does not exist, this indicates a customer induced error (e.g. removing finalizers or manually
// creating nodeclaims with matching provider IDs).
if nodeutils.IsDuplicateNodeClaimError(err) || nodeutils.IsNodeClaimNotFoundError(err) {
-return reconcile.Result{}, reconcile.TerminalError(fmt.Errorf("failed to terminate node, expected a single associated nodeclaim, %w", err))
+return reconcile.Result{}, c.associatedNodeClaimError(err)
}
return reconcile.Result{}, err
}
ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("NodeClaim", klog.KRef(nodeClaim.Namespace, nodeClaim.Name)))
if nodeClaim.DeletionTimestamp.IsZero() {
if err := c.kubeClient.Delete(ctx, nodeClaim); err != nil {
if errors.IsNotFound(err) {
-return reconcile.Result{}, reconcile.TerminalError(fmt.Errorf("failed to terminate node, expected a single associated nodeclaim, %w", err))
+return reconcile.Result{}, c.associatedNodeClaimError(err)
}
return reconcile.Result{}, fmt.Errorf("deleting nodeclaim, %w", err)
}
}

-// If the underlying NodeClaim no longer exists, we want to delete to avoid trying to gracefully drain nodes that are
-// no longer alive. We do a check on the Ready condition of the node since, even though the CloudProvider says the
+// If the underlying instance no longer exists, we want to delete to avoid trying to gracefully drain the
+// associated node. We do a check on the Ready condition of the node since, even though the CloudProvider says the
// instance is not around, we know that the kubelet process is still running if the Node Ready condition is true.
// Similar logic to: https://github.com/kubernetes/kubernetes/blob/3a75a8c8d9e6a1ebd98d8572132e675d4980f184/staging/src/k8s.io/cloud-provider/controllers/nodelifecycle/node_lifecycle_controller.go#L144
if nodeutils.GetCondition(node, corev1.NodeReady).Status != corev1.ConditionTrue {
@@ -144,13 +144,13 @@ func (c *Controller) finalize(ctx context.Context, node *corev1.Node) (reconcile
}
c.recorder.Publish(terminatorevents.NodeFailedToDrain(node, err))
stored := nodeClaim.DeepCopy()
-if modified := nodeClaim.StatusConditions().SetFalse(v1.ConditionTypeDrained, "Draining", "Draining"); modified {
+if modified := nodeClaim.StatusConditions().SetUnknownWithReason(v1.ConditionTypeDrained, "Draining", "Draining"); modified {
if err := c.kubeClient.Status().Patch(ctx, nodeClaim, client.MergeFromWithOptions(stored, client.MergeFromWithOptimisticLock{})); err != nil {
if errors.IsConflict(err) {
return reconcile.Result{Requeue: true}, nil
}
if errors.IsNotFound(err) {
-return reconcile.Result{}, reconcile.TerminalError(fmt.Errorf("failed to terminate node, expected a single associated nodeclaim, %w", err))
+return reconcile.Result{}, c.associatedNodeClaimError(err)
}
return reconcile.Result{}, err
}
@@ -165,7 +165,7 @@ func (c *Controller) finalize(ctx context.Context, node *corev1.Node) (reconcile
return reconcile.Result{Requeue: true}, nil
}
if errors.IsNotFound(err) {
-return reconcile.Result{}, reconcile.TerminalError(fmt.Errorf("failed to terminate node, expected a single associated nodeclaim, %w", err))
+return reconcile.Result{}, c.associatedNodeClaimError(err)
}
return reconcile.Result{}, err
}
@@ -185,14 +185,16 @@ func (c *Controller) finalize(ctx context.Context, node *corev1.Node) (reconcile
return reconcile.Result{}, fmt.Errorf("ensuring no volume attachments, %w", err)
}
if len(pendingVolumeAttachments) == 0 {
+// There are no remaining volume attachments blocking instance termination. If we've already updated the status
+// condition, fall through. Otherwise, update the status condition and requeue.
stored := nodeClaim.DeepCopy()
if modified := nodeClaim.StatusConditions().SetTrue(v1.ConditionTypeVolumesDetached); modified {
if err := c.kubeClient.Status().Patch(ctx, nodeClaim, client.MergeFromWithOptions(stored, client.MergeFromWithOptimisticLock{})); err != nil {
if errors.IsConflict(err) {
return reconcile.Result{Requeue: true}, nil
}
if errors.IsNotFound(err) {
-return reconcile.Result{}, reconcile.TerminalError(fmt.Errorf("failed to terminate node, expected a single associated nodeclaim, %w", err))
+return reconcile.Result{}, c.associatedNodeClaimError(err)
}
return reconcile.Result{}, err
}
@@ -201,29 +203,36 @@ func (c *Controller) finalize(ctx context.Context, node *corev1.Node) (reconcile
return reconcile.Result{RequeueAfter: 1 * time.Second}, nil
}
} else if !c.hasTerminationGracePeriodElapsed(nodeTerminationTime) {
+// There are still volume attachments blocking instance termination. We should set the status condition to
+// unknown (if not already) and requeue. This case should never fall through; to continue to instance termination,
+// one of two conditions must be met: all blocking volume attachment objects must be deleted or the nodeclaim's TGP
+// must have expired.
c.recorder.Publish(terminatorevents.NodeAwaitingVolumeDetachmentEvent(node, pendingVolumeAttachments...))
stored := nodeClaim.DeepCopy()
-if modified := nodeClaim.StatusConditions().SetFalse(v1.ConditionTypeVolumesDetached, "AwaitingVolumeDetachment", "AwaitingVolumeDetachment"); modified {
+if modified := nodeClaim.StatusConditions().SetUnknownWithReason(v1.ConditionTypeVolumesDetached, "AwaitingVolumeDetachment", "AwaitingVolumeDetachment"); modified {
if err := c.kubeClient.Status().Patch(ctx, nodeClaim, client.MergeFromWithOptions(stored, client.MergeFromWithOptimisticLock{})); err != nil {
if errors.IsConflict(err) {
return reconcile.Result{Requeue: true}, nil
}
if errors.IsNotFound(err) {
-return reconcile.Result{}, reconcile.TerminalError(fmt.Errorf("failed to terminate node, expected a single associated nodeclaim, %w", err))
+return reconcile.Result{}, c.associatedNodeClaimError(err)
}
return reconcile.Result{}, err
}
}
return reconcile.Result{RequeueAfter: 1 * time.Second}, nil
} else {
+// There are still volume attachments blocking instance termination, but the nodeclaim's TGP has expired. In this
+// case we should set the status condition to false (requeuing if it wasn't already) and then fall through to instance
+// termination.
stored := nodeClaim.DeepCopy()
if modified := nodeClaim.StatusConditions().SetFalse(v1.ConditionTypeVolumesDetached, "TerminationGracePeriodElapsed", "TerminationGracePeriodElapsed"); modified {
if err := c.kubeClient.Status().Patch(ctx, nodeClaim, client.MergeFromWithOptions(stored, client.MergeFromWithOptimisticLock{})); err != nil {
if errors.IsConflict(err) {
return reconcile.Result{Requeue: true}, nil
}
if errors.IsNotFound(err) {
-return reconcile.Result{}, reconcile.TerminalError(fmt.Errorf("failed to terminate node, expected a single associated nodeclaim, %w", err))
+return reconcile.Result{}, c.associatedNodeClaimError(err)
}
return reconcile.Result{}, err
}
@@ -250,6 +259,10 @@ func (c *Controller) finalize(ctx context.Context, node *corev1.Node) (reconcile
return reconcile.Result{}, nil
}

+func (*Controller) associatedNodeClaimError(err error) error {
+return reconcile.TerminalError(fmt.Errorf("failed to terminate node, expected a single associated nodeclaim, %w", err))
+}

func (c *Controller) hasTerminationGracePeriodElapsed(nodeTerminationTime *time.Time) bool {
if nodeTerminationTime == nil {
return false
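The branches above move the NodeClaim's Drained and VolumesDetached conditions through three states: Unknown with a reason while the work is still in progress, True once it completes, and False only when the termination grace period forces the node onward. A compressed, self-contained sketch of the volume-detachment decision, using hypothetical stand-in types rather than the real status-conditions API:

```go
package main

import (
	"fmt"
	"time"
)

// conditionUpdate is a stand-in for the real NodeClaim status-conditions API
// (SetTrue / SetUnknownWithReason / SetFalse); the names here are illustrative.
type conditionUpdate struct {
	Type   string
	Status string // "True", "Unknown", or "False"
	Reason string
}

// volumesDetachedUpdate mirrors the three branches in the controller above:
// no pending attachments, attachments pending within the grace period, and
// attachments still pending after the grace period has elapsed.
func volumesDetachedUpdate(pendingAttachments int, terminationTime *time.Time, now time.Time) conditionUpdate {
	switch {
	case pendingAttachments == 0:
		// Nothing blocks instance termination; mark the condition True and fall through.
		return conditionUpdate{Type: "VolumesDetached", Status: "True"}
	case terminationTime == nil || now.Before(*terminationTime):
		// Still waiting on volume detachment and the grace period has not elapsed;
		// mark Unknown and requeue.
		return conditionUpdate{Type: "VolumesDetached", Status: "Unknown", Reason: "AwaitingVolumeDetachment"}
	default:
		// Grace period elapsed with attachments remaining; mark False and proceed
		// to instance termination anyway.
		return conditionUpdate{Type: "VolumesDetached", Status: "False", Reason: "TerminationGracePeriodElapsed"}
	}
}

func main() {
	deadline := time.Now().Add(-time.Minute) // grace period already elapsed
	fmt.Println(volumesDetachedUpdate(0, nil, time.Now()))
	fmt.Println(volumesDetachedUpdate(2, nil, time.Now()))
	fmt.Println(volumesDetachedUpdate(2, &deadline, time.Now()))
}
```

The Drained condition in the hunks above follows the same convention: Unknown with reason Draining while pods are still being evicted.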
11 changes: 6 additions & 5 deletions pkg/controllers/node/termination/suite_test.go
@@ -549,7 +549,7 @@ var _ = Describe("Termination", func() {
ExpectSingletonReconciled(ctx, queue)

nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
-Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsFalse()).To(BeTrue())
+Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsUnknown()).To(BeTrue())
Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).Reason).To(Equal("Draining"))

// Expect the pods to be evicted
@@ -560,7 +560,7 @@ var _ = Describe("Termination", func() {
ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // DrainValidation
ExpectNodeWithNodeClaimDraining(env.Client, node.Name)
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
-Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsFalse()).To(BeTrue())
+Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).IsUnknown()).To(BeTrue())
Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrained).Reason).To(Equal("Draining"))

ExpectDeleted(ctx, env.Client, pods[1])
@@ -795,7 +795,7 @@ var _ = Describe("Termination", func() {
ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // VolumeDetachmentInitiation
ExpectExists(ctx, env.Client, node)
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
-Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsFalse()).To(BeTrue())
+Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsUnknown()).To(BeTrue())
Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).Reason).To(Equal("AwaitingVolumeDetachment"))

ExpectDeleted(ctx, env.Client, va)
@@ -837,7 +837,7 @@ var _ = Describe("Termination", func() {
ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // VolumeDetachment
ExpectExists(ctx, env.Client, node)
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
-Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsFalse()).To(BeTrue())
+Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsUnknown()).To(BeTrue())
Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).Reason).To(Equal("AwaitingVolumeDetachment"))

ExpectDeleted(ctx, env.Client, vaDrainable)
@@ -865,7 +865,8 @@ var _ = Describe("Termination", func() {
ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // VolumeDetachment
ExpectExists(ctx, env.Client, node)
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
-Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsFalse()).To(BeTrue())
+Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).IsUnknown()).To(BeTrue())
+Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeVolumesDetached).Reason).To(Equal("AwaitingVolumeDetachment"))

fakeClock.Step(5 * time.Minute)
ExpectRequeued(ExpectObjectReconciled(ctx, env.Client, terminationController, node)) // VolumeDetachment
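The updated assertions all encode the same rule: while termination work is still in progress, the relevant condition reports Unknown with a descriptive reason rather than False. A tiny standalone sketch of that expectation table (condition types and reasons taken from the diff; the Gomega helpers themselves are Karpenter's own and unchanged):

```go
package main

import "fmt"

// expectation mirrors what the updated tests assert about a NodeClaim condition
// at a given point in termination; purely illustrative, not Karpenter code.
type expectation struct {
	Condition string
	Status    string
	Reason    string
}

func main() {
	inProgress := []expectation{
		{"Drained", "Unknown", "Draining"},                         // pods still evicting
		{"VolumesDetached", "Unknown", "AwaitingVolumeDetachment"}, // attachments still present
	}
	terminal := []expectation{
		{"VolumesDetached", "False", "TerminationGracePeriodElapsed"}, // TGP expired with attachments left
	}
	for _, e := range append(inProgress, terminal...) {
		fmt.Printf("%-16s status=%-8s reason=%s\n", e.Condition, e.Status, e.Reason)
	}
}
```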
2 changes: 1 addition & 1 deletion
@@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
-controller-gen.kubebuilder.io/version: v0.17.1
+controller-gen.kubebuilder.io/version: v0.17.2
name: testnodeclasses.karpenter.test.sh
spec:
group: karpenter.test.sh