Skip to content

Commit c06e7f1

Browse files
authored
Add enable-out-of-service-taint flag (#1132)
* Add enable-out-of-service-taint flag
1 parent e07cd85 commit c06e7f1

File tree

10 files changed

+110
-9
lines changed

10 files changed

+110
-9
lines changed

config/helm/aws-node-termination-handler/README.md

+1
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ The configuration in this table applies to all AWS Node Termination Handler mode
9595
| `webhookTemplateConfigMapName` | Pass the webhook template file as a configmap. | `""` |
9696
| `webhookTemplateConfigMapKey` | Name of the Configmap key storing the template file. | `""` |
9797
| `enableSqsTerminationDraining` | If `true`, this turns on queue-processor mode which drains nodes when an SQS termination event is received. | `false` |
98+
| `enableOutOfServiceTaint` | If `true`, this adds the out-of-service taint to the node after the cordon/drain process, which forcefully evicts pods without matching tolerations and detaches persistent volumes. | `false` |
9899

99100
### Queue-Processor Mode Configuration
100101

config/helm/aws-node-termination-handler/templates/daemonset.linux.yaml

+2
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,8 @@ spec:
9999
value: {{ .Values.cordonOnly | quote }}
100100
- name: TAINT_NODE
101101
value: {{ .Values.taintNode | quote }}
102+
- name: ENABLE_OUT_OF_SERVICE_TAINT
103+
value: {{ .Values.enableOutOfServiceTaint | quote }}
102104
- name: EXCLUDE_FROM_LOAD_BALANCERS
103105
value: {{ .Values.excludeFromLoadBalancers | quote }}
104106
- name: DELETE_LOCAL_DATA

config/helm/aws-node-termination-handler/templates/daemonset.windows.yaml

+2
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,8 @@ spec:
9999
value: {{ .Values.cordonOnly | quote }}
100100
- name: TAINT_NODE
101101
value: {{ .Values.taintNode | quote }}
102+
- name: ENABLE_OUT_OF_SERVICE_TAINT
103+
value: {{ .Values.enableOutOfServiceTaint | quote }}
102104
- name: EXCLUDE_FROM_LOAD_BALANCERS
103105
value: {{ .Values.excludeFromLoadBalancers | quote }}
104106
- name: DELETE_LOCAL_DATA

config/helm/aws-node-termination-handler/templates/deployment.yaml

+2
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,8 @@ spec:
102102
value: {{ .Values.cordonOnly | quote }}
103103
- name: TAINT_NODE
104104
value: {{ .Values.taintNode | quote }}
105+
- name: ENABLE_OUT_OF_SERVICE_TAINT
106+
value: {{ .Values.enableOutOfServiceTaint | quote }}
105107
- name: EXCLUDE_FROM_LOAD_BALANCERS
106108
value: {{ .Values.excludeFromLoadBalancers | quote }}
107109
- name: DELETE_LOCAL_DATA

config/helm/aws-node-termination-handler/values.yaml

+3
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,9 @@ cordonOnly: false
8686
# Taint node upon spot interruption termination notice.
8787
taintNode: false
8888

89+
# Add the out-of-service taint to the node after the cordon/drain process; this forcefully evicts pods without matching tolerations and detaches persistent volumes.
90+
enableOutOfServiceTaint: false
91+
8992
# Exclude node from load balancer before cordoning via the ServiceNodeExclusion feature gate.
9093
excludeFromLoadBalancers: false
9194

pkg/config/config.go

+7
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,8 @@ const (
7777
taintNode = "TAINT_NODE"
7878
taintEffectDefault = "NoSchedule"
7979
taintEffect = "TAINT_EFFECT"
80+
enableOutOfServiceTaintConfigKey = "ENABLE_OUT_OF_SERVICE_TAINT"
81+
enableOutOfServiceTaintDefault = false
8082
excludeFromLoadBalancers = "EXCLUDE_FROM_LOAD_BALANCERS"
8183
jsonLoggingConfigKey = "JSON_LOGGING"
8284
jsonLoggingDefault = false
@@ -149,6 +151,7 @@ type Config struct {
149151
CordonOnly bool
150152
TaintNode bool
151153
TaintEffect string
154+
EnableOutOfServiceTaint bool
152155
ExcludeFromLoadBalancers bool
153156
JsonLogging bool
154157
LogLevel string
@@ -215,6 +218,7 @@ func ParseCliArgs() (config Config, err error) {
215218
flag.BoolVar(&config.CordonOnly, "cordon-only", getBoolEnv(cordonOnly, false), "If true, nodes will be cordoned but not drained when an interruption event occurs.")
216219
flag.BoolVar(&config.TaintNode, "taint-node", getBoolEnv(taintNode, false), "If true, nodes will be tainted when an interruption event occurs.")
217220
flag.StringVar(&config.TaintEffect, "taint-effect", getEnv(taintEffect, taintEffectDefault), "Sets the effect when a node is tainted.")
221+
flag.BoolVar(&config.EnableOutOfServiceTaint, "enable-out-of-service-taint", getBoolEnv(enableOutOfServiceTaintConfigKey, enableOutOfServiceTaintDefault), "If true, nodes will be tainted as out-of-service after we cordon/drain the nodes when an interruption event occurs.")
218222
flag.BoolVar(&config.ExcludeFromLoadBalancers, "exclude-from-load-balancers", getBoolEnv(excludeFromLoadBalancers, false), "If true, nodes will be marked for exclusion from load balancers when an interruption event occurs.")
219223
flag.BoolVar(&config.JsonLogging, "json-logging", getBoolEnv(jsonLoggingConfigKey, jsonLoggingDefault), "If true, use JSON-formatted logs instead of human readable logs.")
220224
flag.StringVar(&config.LogLevel, "log-level", getEnv(logLevelConfigKey, logLevelDefault), "Sets the log level (INFO, DEBUG, or ERROR)")
@@ -344,6 +348,7 @@ func (c Config) PrintJsonConfigArgs() {
344348
Bool("cordon_only", c.CordonOnly).
345349
Bool("taint_node", c.TaintNode).
346350
Str("taint_effect", c.TaintEffect).
351+
Bool("enable_out_of_service_taint", c.EnableOutOfServiceTaint).
347352
Bool("exclude_from_load_balancers", c.ExcludeFromLoadBalancers).
348353
Bool("json_logging", c.JsonLogging).
349354
Str("log_level", c.LogLevel).
@@ -395,6 +400,7 @@ func (c Config) PrintHumanConfigArgs() {
395400
"\tcordon-only: %t,\n"+
396401
"\ttaint-node: %t,\n"+
397402
"\ttaint-effect: %s,\n"+
403+
"\tenable-out-of-service-taint: %t,\n"+
398404
"\texclude-from-load-balancers: %t,\n"+
399405
"\tjson-logging: %t,\n"+
400406
"\tlog-level: %s,\n"+
@@ -437,6 +443,7 @@ func (c Config) PrintHumanConfigArgs() {
437443
c.CordonOnly,
438444
c.TaintNode,
439445
c.TaintEffect,
446+
c.EnableOutOfServiceTaint,
440447
c.ExcludeFromLoadBalancers,
441448
c.JsonLogging,
442449
c.LogLevel,

pkg/interruptionevent/draincordon/handler.go

+9
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,15 @@ func (h *Handler) HandleEvent(drainEvent *monitor.InterruptionEvent) error {
119119
if (err == nil || (!nodeFound && h.commonHandler.NthConfig.DeleteSqsMsgIfNodeNotFound)) && drainEvent.PostDrainTask != nil {
120120
h.commonHandler.RunPostDrainTask(nodeName, drainEvent)
121121
}
122+
123+
// Only add out-of-service taint if ENABLE_OUT_OF_SERVICE_TAINT flag is true, and CORDON_ONLY flag is false
124+
if err == nil && h.commonHandler.NthConfig.EnableOutOfServiceTaint && !h.commonHandler.NthConfig.CordonOnly {
125+
err = h.commonHandler.Node.TaintOutOfService(nodeName)
126+
if err != nil {
127+
return fmt.Errorf("cannot add out-of-service taint on node %s: %w", nodeName, err)
128+
}
129+
}
130+
122131
return nil
123132
}
124133

pkg/node/node.go

+24-6
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,10 @@ const (
6060
ASGLifecycleTerminationTaint = "aws-node-termination-handler/asg-lifecycle-termination"
6161
// RebalanceRecommendationTaint is a taint used to make spot instance unschedulable
6262
RebalanceRecommendationTaint = "aws-node-termination-handler/rebalance-recommendation"
63+
// OutOfServiceTaintKey, OutOfServiceTaintValue and OutOfServiceTaintEffectType describe the out-of-service taint used to forcefully evict pods without matching tolerations and detach persistent volumes
64+
OutOfServiceTaintKey = "node.kubernetes.io/out-of-service"
65+
OutOfServiceTaintValue = "nodeshutdown"
66+
OutOfServiceTaintEffectType = "NoExecute"
6367

6468
maxTaintValueLength = 63
6569
daemonSet = "DaemonSet"
@@ -449,7 +453,7 @@ func (n Node) TaintSpotItn(nodeName string, eventID string) error {
449453
eventID = eventID[:maxTaintValueLength]
450454
}
451455

452-
return addTaint(k8sNode, n, SpotInterruptionTaint, eventID)
456+
return addTaint(k8sNode, n, SpotInterruptionTaint, eventID, n.nthConfig.TaintEffect)
453457
}
454458

455459
// TaintASGLifecycleTermination adds the ASG lifecycle termination taint onto a node
@@ -467,7 +471,7 @@ func (n Node) TaintASGLifecycleTermination(nodeName string, eventID string) erro
467471
eventID = eventID[:maxTaintValueLength]
468472
}
469473

470-
return addTaint(k8sNode, n, ASGLifecycleTerminationTaint, eventID)
474+
return addTaint(k8sNode, n, ASGLifecycleTerminationTaint, eventID, n.nthConfig.TaintEffect)
471475
}
472476

473477
// TaintRebalanceRecommendation adds the rebalance recommendation notice taint onto a node
@@ -485,7 +489,7 @@ func (n Node) TaintRebalanceRecommendation(nodeName string, eventID string) erro
485489
eventID = eventID[:maxTaintValueLength]
486490
}
487491

488-
return addTaint(k8sNode, n, RebalanceRecommendationTaint, eventID)
492+
return addTaint(k8sNode, n, RebalanceRecommendationTaint, eventID, n.nthConfig.TaintEffect)
489493
}
490494

491495
// LogPods logs all the pod names on a node
@@ -527,7 +531,21 @@ func (n Node) TaintScheduledMaintenance(nodeName string, eventID string) error {
527531
eventID = eventID[:maxTaintValueLength]
528532
}
529533

530-
return addTaint(k8sNode, n, ScheduledMaintenanceTaint, eventID)
534+
return addTaint(k8sNode, n, ScheduledMaintenanceTaint, eventID, n.nthConfig.TaintEffect)
535+
}
536+
537+
// TaintOutOfService adds the out-of-service taint (NoExecute) onto a node.
//
// The taint is applied through addTaint with the fixed OutOfServiceTaintKey,
// OutOfServiceTaintValue and OutOfServiceTaintEffectType constants instead of
// the configured TaintEffect used by the other Taint* methods.
//
// It returns nil without fetching the node when EnableOutOfServiceTaint is
// false or CordonOnly is true, and a wrapped error when the node cannot be
// fetched. Otherwise it returns whatever addTaint returns.
538+
func (n Node) TaintOutOfService(nodeName string) error {
539+
// Guard: the taint is only wanted when the feature is enabled and the node is actually drained (not cordon-only).
if !n.nthConfig.EnableOutOfServiceTaint || n.nthConfig.CordonOnly {
540+
return nil
541+
}
542+
543+
k8sNode, err := n.fetchKubernetesNode(nodeName)
544+
if err != nil {
545+
return fmt.Errorf("Unable to fetch kubernetes node from API: %w", err)
546+
}
547+
548+
// The effect is passed explicitly so this taint does not depend on the configured taint effect.
return addTaint(k8sNode, n, OutOfServiceTaintKey, OutOfServiceTaintValue, OutOfServiceTaintEffectType)
531549
}
532550

533551
// RemoveNTHTaints removes NTH-specific taints from a node
@@ -750,8 +768,8 @@ func getTaintEffect(effect string) corev1.TaintEffect {
750768
}
751769
}
752770

753-
func addTaint(node *corev1.Node, nth Node, taintKey string, taintValue string) error {
754-
effect := getTaintEffect(nth.nthConfig.TaintEffect)
771+
func addTaint(node *corev1.Node, nth Node, taintKey string, taintValue string, effectType string) error {
772+
effect := getTaintEffect(effectType)
755773
if nth.nthConfig.DryRun {
756774
log.Info().Msgf("Would have added taint (%s=%s:%s) to node %s, but dry-run flag was set", taintKey, taintValue, effect, nth.nthConfig.NodeName)
757775
return nil

pkg/node/node_test.go

+40
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@ const (
4242
instanceId2 = "i-0wxyz5678ijkl1234"
4343
)
4444

45+
const outOfServiceTaintKey = "node.kubernetes.io/out-of-service"
46+
const outOfServiceTaintValue = "nodeshutdown"
47+
4548
var nodeName = "NAME"
4649

4750
func getDrainHelper(client *fake.Clientset) *drain.Helper {
@@ -502,3 +505,40 @@ func TestFilterOutDaemonSetPods(t *testing.T) {
502505
filteredMockPodList := tNode.FilterOutDaemonSetPods(mockPodList)
503506
h.Equals(t, 2, len(filteredMockPodList.Items))
504507
}
508+
509+
// TestTaintOutOfService verifies that TaintOutOfService, with
// EnableOutOfServiceTaint set and CordonOnly left at its zero value, adds a
// taint with the expected out-of-service key, value and NoExecute effect to a
// node registered in a fake clientset.
func TestTaintOutOfService(t *testing.T) {
510+
// Register a bare node named nodeName in the fake API so it can be fetched and updated.
client := fake.NewSimpleClientset()
511+
_, err := client.CoreV1().Nodes().Create(
512+
context.Background(),
513+
&v1.Node{
514+
ObjectMeta: metav1.ObjectMeta{Name: nodeName},
515+
},
516+
metav1.CreateOptions{})
517+
h.Ok(t, err)
518+
519+
// Only EnableOutOfServiceTaint is set; the assertions below confirm the preconditions TaintOutOfService checks.
tNode, err := newNode(config.Config{EnableOutOfServiceTaint: true}, client)
520+
h.Ok(t, err)
521+
h.Equals(t, true, tNode.GetNthConfig().EnableOutOfServiceTaint)
522+
h.Equals(t, false, tNode.GetNthConfig().CordonOnly)
523+
524+
err = tNode.TaintOutOfService(nodeName)
525+
h.Ok(t, err)
526+
527+
// Re-read the node from the fake API and look for the exact key/value/effect triple.
updatedNode, err := client.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{})
528+
h.Ok(t, err)
529+
taintFound := false
530+
expectedTaint := v1.Taint{
531+
Key: outOfServiceTaintKey,
532+
Value: outOfServiceTaintValue,
533+
Effect: corev1.TaintEffectNoExecute,
534+
}
535+
for _, taint := range updatedNode.Spec.Taints {
536+
if taint.Key == expectedTaint.Key &&
537+
taint.Value == expectedTaint.Value &&
538+
taint.Effect == expectedTaint.Effect {
539+
taintFound = true
540+
break
541+
}
542+
}
543+
h.Equals(t, true, taintFound)
544+
}

test/e2e/spot-interruption-test

+20-3
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,12 @@ function fail_and_exit {
1717
exit "${1:-1}"
1818
}
1919

20+
# remove_out_of_service_taint NODE
# Removes the node.kubernetes.io/out-of-service:NoExecute taint from NODE so the
# test node is left clean after the test. Called on both the pass and fail paths.
function remove_out_of_service_taint {
21+
local node=$1
22+
echo "Removing out-of-service taint from node ${node}"
23+
# The trailing "-" asks kubectl to remove the taint; "|| true" keeps a failed removal from failing the caller.
kubectl taint nodes "${node}" node.kubernetes.io/out-of-service:NoExecute- || true
24+
}
25+
2026
echo "Starting Spot Interruption Test for Node Termination Handler"
2127

2228
SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
@@ -37,6 +43,7 @@ anth_helm_args=(
3743
--set enableScheduledEventDraining="false"
3844
--set enableSpotInterruptionDraining="true"
3945
--set taintNode="true"
46+
--set enableOutOfServiceTaint="true"
4047
--set daemonsetTolerations=""
4148
--wait
4249
--force
@@ -110,6 +117,7 @@ fi
110117

111118
cordoned=0
112119
tainted=0
120+
outOfServiceTainted=0
113121
test_node=${TEST_NODE:-$CLUSTER_NAME-worker}
114122
for i in $(seq 1 $TAINT_CHECK_CYCLES); do
115123
if [[ $cordoned -eq 0 ]] && kubectl get nodes "${test_node}" | grep SchedulingDisabled >/dev/null; then
@@ -118,13 +126,19 @@ for i in $(seq 1 $TAINT_CHECK_CYCLES); do
118126
fi
119127

120128
if [[ $cordoned -eq 1 && $tainted -eq 0 ]] && kubectl get nodes "${test_node}" -o json | grep -q "aws-node-termination-handler/spot-itn" >/dev/null; then
121-
echo "✅ Verified the worked node was tainted!"
122-
tainted=1
129+
echo "✅ Verified the worked node was tainted!"
130+
tainted=1
131+
fi
132+
133+
if [[ $cordoned -eq 1 && $tainted -eq 1 ]] && kubectl get nodes "${test_node}" -o json | grep -q "node.kubernetes.io/out-of-service" >/dev/null; then
134+
echo "✅ Verified the worked node was tainted as out-of-service!"
135+
outOfServiceTainted=1
123136
fi
124137

125-
if [[ $tainted -eq 1 && $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then
138+
if [[ $tainted -eq 1 && $outOfServiceTainted -eq 1 && $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then
126139
echo "✅ Verified the regular-pod-test pod was evicted!"
127140
echo "✅ Spot Interruption Test Passed $CLUSTER_NAME! ✅"
141+
remove_out_of_service_taint "${test_node}"
128142
exit 0
129143
fi
130144
echo "Assertion Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds"
@@ -135,8 +149,11 @@ if [[ $cordoned -eq 0 ]]; then
135149
echo "❌ Worker node was not cordoned"
136150
elif [[ $tainted -eq 0 ]]; then
137151
echo "❌ Worker node was not tainted"
152+
elif [[ $outOfServiceTainted -eq 0 ]]; then
153+
echo "❌ Worker node was not tainted as out-of-service"
138154
else
139155
echo "❌ regular-pod-test pod was not evicted"
140156
fi
141157

158+
remove_out_of_service_taint "${test_node}"
142159
fail_and_exit 1

0 commit comments

Comments
 (0)