Skip to content

Commit 55208bd

Browse files
cameronmeissnerCameron Meissner
and
Cameron Meissner
authored
perf: bootstrap credential validation with curl and check apiserver connectivity before starting kubelet (#6023)
Co-authored-by: Cameron Meissner <[email protected]>
1 parent 256b8c2 commit 55208bd

File tree

556 files changed

+10183
-1221
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

556 files changed

+10183
-1221
lines changed

e2e/validation.go

+9-1
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"context"
55
"encoding/base64"
66
"fmt"
7+
"strings"
78

89
"github.com/Azure/agentbaker/e2e/config"
910
"github.com/stretchr/testify/require"
@@ -38,6 +39,13 @@ func ValidateCommonLinux(ctx context.Context, s *Scenario) {
3839
stdout := execResult.stdout.String()
3940
require.NotContains(s.T, stdout, "--dynamic-config-dir", "kubelet flag '--dynamic-config-dir' should not be present in /etc/default/kubelet\nContents:\n%s")
4041

42+
kubeletLogs := execScriptOnVMForScenarioValidateExitCode(ctx, s, "sudo journalctl -u kubelet", 0, "could not retrieve kubelet logs with journalctl").stdout.String()
43+
require.True(
44+
s.T,
45+
!strings.Contains(kubeletLogs, "unable to validate bootstrap credentials") && strings.Contains(kubeletLogs, "kubelet bootstrap token credential is valid"),
46+
"expected to have successfully validated bootstrap token credential before kubelet startup, but did not",
47+
)
48+
4149
// the instructions below expect the SSH key to be uploaded to the user pool VM,
4250
// which happens as a side-effect of execCommandOnVMForScenario, it's ugly but works.
4351
// maybe we should use a single ssh key per cluster, but need to be careful with parallel test runs.
@@ -62,7 +70,7 @@ func ValidateCommonLinux(ctx context.Context, s *Scenario) {
6270
//"cloud-config.txt", // file with UserData
6371
})
6472

65-
execResult = execScriptOnVMForScenarioValidateExitCode(ctx, s, "sudo curl http://168.63.129.16:32526/vmSettings", 0, "curl to wireserver failed")
73+
_ = execScriptOnVMForScenarioValidateExitCode(ctx, s, "sudo curl http://168.63.129.16:32526/vmSettings", 0, "curl to wireserver failed")
6674

6775
execResult = execOnVMForScenarioOnUnprivilegedPod(ctx, s, "curl https://168.63.129.16/machine/?comp=goalstate -H 'x-ms-version: 2015-04-05' -s --connect-timeout 4")
6876
require.Equal(s.T, "28", execResult.exitCode, "curl to wireserver should fail")

e2e/validators.go

+4-3
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,14 @@ import (
44
"bytes"
55
"context"
66
"fmt"
7-
"github.com/tidwall/gjson"
87
"net"
98
"os"
109
"regexp"
1110
"strings"
1211
"time"
1312

13+
"github.com/tidwall/gjson"
14+
1415
"github.com/Azure/agentbaker/e2e/config"
1516
"github.com/stretchr/testify/assert"
1617
"github.com/stretchr/testify/require"
@@ -301,7 +302,7 @@ func ValidateContainerdWASMShims(ctx context.Context, s *Scenario) {
301302

302303
func ValidateKubeletHasNotStopped(ctx context.Context, s *Scenario) {
303304
command := "sudo journalctl -u kubelet"
304-
execResult := execScriptOnVMForScenarioValidateExitCode(ctx, s, command, 0, "could not retrieve kubelet logs")
305+
execResult := execScriptOnVMForScenarioValidateExitCode(ctx, s, command, 0, "could not retrieve kubelet logs with journalctl")
305306
assert.NotContains(s.T, execResult.stdout.String(), "Stopped Kubelet")
306307
assert.Contains(s.T, execResult.stdout.String(), "Started Kubelet")
307308
}
@@ -314,7 +315,7 @@ func ValidateServicesDoNotRestartKubelet(ctx context.Context, s *Scenario) {
314315

315316
// ValidateKubeletHasFlags checks kubelet is started with the right flags and configs.
316317
func ValidateKubeletHasFlags(ctx context.Context, s *Scenario, filePath string) {
317-
execResult := execScriptOnVMForScenarioValidateExitCode(ctx, s, "sudo journalctl -u kubelet", 0, "could not get kubelet logs")
318+
execResult := execScriptOnVMForScenarioValidateExitCode(ctx, s, "sudo journalctl -u kubelet", 0, "could not retrieve kubelet logs with journalctl")
318319
configFileFlags := fmt.Sprintf("FLAG: --config=\"%s\"", filePath)
319320
require.Containsf(s.T, execResult.stdout.String(), configFileFlags, "expected to find flag %s, but not found", "config")
320321
}

parts/linux/cloud-init/artifacts/cse_config.sh

+7-5
Original file line numberDiff line numberDiff line change
@@ -492,6 +492,13 @@ configureKubeletServing() {
492492
fi
493493
}
494494

495+
# ensureKubeCACert decodes the base64-encoded cluster CA certificate held in
# KUBE_CA_CRT and writes it to /etc/kubernetes/certs/ca.crt, creating the parent
# directory if needed and restricting the file mode to 0600.
# KUBE_CA_FILE is intentionally left as a global, as in the original definition.
ensureKubeCACert() {
    KUBE_CA_FILE="/etc/kubernetes/certs/ca.crt"

    # ${KUBE_CA_FILE%/*} strips the final path component, i.e. the directory part.
    mkdir -p "${KUBE_CA_FILE%/*}"

    # A here-string feeds the same newline-terminated payload to base64 as echo would.
    base64 -d <<< "${KUBE_CA_CRT}" > "${KUBE_CA_FILE}"

    chmod 0600 "${KUBE_CA_FILE}"
}
501+
495502
ensureKubelet() {
496503
KUBELET_DEFAULT_FILE=/etc/default/kubelet
497504
mkdir -p /etc/default
@@ -516,11 +523,6 @@ ensureKubelet() {
516523
echo "AZURE_ENVIRONMENT_FILEPATH=${AZURE_ENVIRONMENT_FILEPATH}" >> "${KUBELET_DEFAULT_FILE}"
517524
fi
518525
chmod 0600 "${KUBELET_DEFAULT_FILE}"
519-
520-
KUBE_CA_FILE="/etc/kubernetes/certs/ca.crt"
521-
mkdir -p "$(dirname "${KUBE_CA_FILE}")"
522-
echo "${KUBE_CA_CRT}" | base64 -d > "${KUBE_CA_FILE}"
523-
chmod 0600 "${KUBE_CA_FILE}"
524526

525527
if [ "${ENABLE_SECURE_TLS_BOOTSTRAPPING}" == "true" ] || [ -n "${TLS_BOOTSTRAP_TOKEN}" ]; then
526528
KUBELET_TLS_DROP_IN="/etc/systemd/system/kubelet.service.d/10-tlsbootstrap.conf"

parts/linux/cloud-init/artifacts/cse_main.sh

+10-5
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ source "${CSE_INSTALL_FILEPATH}"
5252
source "${CSE_DISTRO_INSTALL_FILEPATH}"
5353
source "${CSE_CONFIG_FILEPATH}"
5454

55+
logs_to_events "AKS.CSE.ensureKubeCACert" ensureKubeCACert
56+
5557
if [[ "${DISABLE_SSH}" == "true" ]]; then
5658
disableSSH || exit $ERR_DISABLE_SSH
5759
fi
@@ -337,7 +339,6 @@ if [ "${NEEDS_CONTAINERD}" == "true" ] && [ "${SHOULD_CONFIG_CONTAINERD_ULIMITS
337339
logs_to_events "AKS.CSE.setContainerdUlimits" configureContainerdUlimits
338340
fi
339341

340-
logs_to_events "AKS.CSE.ensureKubelet" ensureKubelet
341342
if [ "${ENSURE_NO_DUPE_PROMISCUOUS_BRIDGE}" == "true" ]; then
342343
logs_to_events "AKS.CSE.ensureNoDupOnPromiscuBridge" ensureNoDupOnPromiscuBridge
343344
fi
@@ -401,6 +402,13 @@ else
401402
logs_to_events "AKS.CSE.apiserverNC" "retrycmd_if_failure ${API_SERVER_CONN_RETRIES} 1 10 nc -vz ${API_SERVER_NAME} 443" || time nc -vz ${API_SERVER_NAME} 443 || VALIDATION_ERR=$ERR_K8S_API_SERVER_CONN_FAIL
402403
fi
403404

405+
echo "API server connection check code: $VALIDATION_ERR"
406+
if [ $VALIDATION_ERR -ne 0 ]; then
407+
exit $VALIDATION_ERR
408+
fi
409+
410+
logs_to_events "AKS.CSE.ensureKubelet" ensureKubelet
411+
404412
if [[ ${ID} != "mariner" ]] && [[ ${ID} != "azurelinux" ]]; then
405413
echo "Recreating man-db auto-update flag file and kicking off man-db update process at $(date)"
406414
createManDbAutoUpdateFlagFile
@@ -458,10 +466,7 @@ else
458466
fi
459467
fi
460468

461-
echo "Custom script finished. API server connection check code:" $VALIDATION_ERR
469+
echo "Custom script finished."
462470
echo $(date),$(hostname), endcustomscript>>/opt/m
463471

464-
exit $VALIDATION_ERR
465-
466-
467472
#EOF

parts/linux/cloud-init/artifacts/kubelet.service

+2
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ ExecStartPre=/bin/mount --make-shared /var/lib/kubelet
2020
ExecStartPre=-/sbin/ebtables -t nat --list
2121
ExecStartPre=-/sbin/iptables -t nat --numeric --list
2222

23+
ExecStartPre=/bin/bash /opt/azure/containers/validate-kubelet-credentials.sh
24+
2325
ExecStart=/usr/local/bin/kubelet \
2426
--enable-server \
2527
--node-labels="${KUBELET_NODE_LABELS}" \
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
#!/bin/bash
2+
set -euo pipefail
3+
4+
# this gives us logs_to_events
5+
source /opt/azure/containers/provision_source.sh
6+
7+
KUBECONFIG_PATH="${KUBECONFIG_PATH:-/var/lib/kubelet/kubeconfig}"
8+
BOOTSTRAP_KUBECONFIG_PATH="${BOOTSTRAP_KUBECONFIG_PATH:-/var/lib/kubelet/bootstrap-kubeconfig}"
9+
10+
MAX_RETRIES=${VALIDATE_KUBELET_CREDENTIALS_MAX_RETRIES:-30}
11+
RETRY_DELAY_SECONDS=${VALIDATE_KUBELET_CREDENTIALS_RETRY_DELAY_SECONDS:-2}
12+
RETRY_TIMEOUT_SECONDS=${VALIDATE_KUBELET_CREDENTIALS_RETRY_TIMEOUT_SECONDS:-5}
13+
14+
# validateBootstrapKubeconfig checks that the bootstrap token stored in a
# bootstrap-kubeconfig is accepted by the apiserver.
#
# Arguments:
#   $1 - path to the bootstrap-kubeconfig file to read.
# Globals read:
#   RETRY_TIMEOUT_SECONDS - per-request curl timeout, also sent as ?timeout=<n>s.
#   RETRY_DELAY_SECONDS   - sleep between attempts.
#   MAX_RETRIES           - maximum number of attempts.
# Behavior:
#   Returns normally once GET <server>/version answers with an HTTP status in
#   [200, 400). Validation is best-effort: when a required field cannot be read
#   from the kubeconfig, or no acceptable status is received within MAX_RETRIES
#   attempts, the reason is logged and the whole script exits with status 0.
function validateBootstrapKubeconfig {
    local kubeconfig_path=$1
    local cacert apiserver_url bootstrap_token code

    # The script enables `set -euo pipefail`. grep exits 1 when nothing matches, which
    # could abort the script with a failure status right here instead of reaching the
    # explicit "unable to validate" guards below, so a missing field is tolerated.
    cacert=$(grep -Po "(?<=certificate-authority: ).*$" < "$kubeconfig_path" || true)
    apiserver_url=$(grep -Po "(?<=server: ).*$" < "$kubeconfig_path" || true)
    bootstrap_token=$(grep -Po "(?<=token: ).*$" < "$kubeconfig_path" || true)

    if [ -z "$cacert" ]; then
        echo "could not read cluster CA file path from $kubeconfig_path, unable to validate bootstrap credentials"
        exit 0
    fi
    if [ -z "$apiserver_url" ]; then
        echo "could not read apiserver URL from $kubeconfig_path, unable to validate bootstrap credentials"
        exit 0
    fi
    if [ -z "$bootstrap_token" ]; then
        echo "could not read bootstrap token from $kubeconfig_path, unable to validate bootstrap credentials"
        exit 0
    fi

    local retry_count=0
    while true; do
        # curl exits non-zero on connection errors and timeouts; tolerate that so the
        # attempt is counted and retried rather than terminating the script.
        # ${bootstrap_token//\"/} strips any double quotes surrounding the token value.
        code=$(curl -sL \
            -m "$RETRY_TIMEOUT_SECONDS" \
            -o /dev/null \
            -w "%{http_code}" \
            -H "Accept: application/json, */*" \
            -H "Authorization: Bearer ${bootstrap_token//\"/}" \
            --cacert "$cacert" \
            "${apiserver_url}/version?timeout=${RETRY_TIMEOUT_SECONDS}s" || true)
        # Guarantee a numeric value for the comparisons below even if curl printed nothing.
        code=${code:-000}

        if [ "$code" -ge 200 ] && [ "$code" -lt 400 ]; then
            echo "(retry=$retry_count) received valid HTTP status code from apiserver: $code"
            break
        fi

        echo "(retry=$retry_count) received invalid HTTP status code from apiserver: $code"

        retry_count=$(( retry_count + 1 ))
        # -ge rather than -eq so that MAX_RETRIES=0 (or any value already passed)
        # still terminates the loop.
        if [ "$retry_count" -ge "$MAX_RETRIES" ]; then
            echo "unable to validate bootstrap credentials after $retry_count attempts"
            exit 0
        fi

        sleep "$RETRY_DELAY_SECONDS"
    done
}
61+
62+
# validateKubeletCredentials decides whether the kubelet bootstrap token needs to be
# validated and, if so, validates it through validateBootstrapKubeconfig.
#
# Globals read:
#   KUBECONFIG_PATH           - if this file exists, validation is skipped.
#   BOOTSTRAP_KUBECONFIG_PATH - bootstrap-kubeconfig whose token is validated.
# Behavior:
#   Each skip path (kubeconfig already present, no bootstrap-kubeconfig, curl not
#   available) logs its reason and exits the script with status 0. Otherwise the
#   bootstrap-kubeconfig is validated and a success message is logged.
function validateKubeletCredentials {
    if [ -f "$KUBECONFIG_PATH" ]; then
        echo "client credential already exists within kubeconfig: $KUBECONFIG_PATH, no need to validate bootstrap credentials"
        exit 0
    fi

    if [ ! -f "$BOOTSTRAP_KUBECONFIG_PATH" ]; then
        echo "no bootstrap-kubeconfig found at $BOOTSTRAP_KUBECONFIG_PATH, no bootstrap credentials to validate"
        exit 0
    fi

    # `command -v` is a shell builtin, so detecting curl does not itself depend on the
    # external `which` binary being installed.
    if ! command -v curl >/dev/null 2>&1; then
        echo "curl is not available, unable to validate bootstrap credentials"
        exit 0
    fi

    echo "will validate bootstrap-kubeconfig: $BOOTSTRAP_KUBECONFIG_PATH"
    validateBootstrapKubeconfig "$BOOTSTRAP_KUBECONFIG_PATH"
    echo "kubelet bootstrap token credential is valid"
}
82+
83+
logs_to_events "AKS.Runtime.validateKubeletCredentials" validateKubeletCredentials

parts/linux/cloud-init/nodecustomdata.yml

+7
Original file line numberDiff line numberDiff line change
@@ -409,6 +409,13 @@ write_files:
409409
content: !!binary |
410410
{{GetVariableProperty "cloudInitData" "ensureIMDSRestrictionScript"}}
411411

412+
- path: /opt/azure/containers/validate-kubelet-credentials.sh
413+
permissions: "0755"
414+
encoding: gzip
415+
owner: root
416+
content: !!binary |
417+
{{GetVariableProperty "cloudInitData" "validateKubeletCredentialsScript"}}
418+
412419
- path: /etc/kubernetes/certs/ca.crt
413420
permissions: "0600"
414421
encoding: base64

pkg/agent/baker_test.go

+5-1
Original file line numberDiff line numberDiff line change
@@ -787,7 +787,7 @@ var _ = Describe("Assert generated customData and cseCmd", func() {
787787
config.KubeletConfig = map[string]string{}
788788
}, nil),
789789

790-
Entry("AKSUbuntu1804 with kubelet client certificatet", "AKSUbuntu1804+WithKubeletClientCert", "1.18.3",
790+
Entry("AKSUbuntu1804 with kubelet client certificate", "AKSUbuntu1804+WithKubeletClientCert", "1.18.3",
791791
func(config *datamodel.NodeBootstrappingConfiguration) {
792792
config.ContainerService.Properties.CertificateProfile = &datamodel.CertificateProfile{
793793
ClientCertificate: "fooBarBaz",
@@ -798,12 +798,14 @@ var _ = Describe("Assert generated customData and cseCmd", func() {
798798
etcDefaultKubelet := o.files["/etc/default/kubelet"].value
799799
etcDefaultKubeletService := o.files["/etc/systemd/system/kubelet.service"].value
800800
kubeletSh := o.files["/opt/azure/containers/kubelet.sh"].value
801+
validateCredentials := o.files["/opt/azure/containers/validate-kubelet-credentials.sh"].value
801802
caCRT := o.files["/etc/kubernetes/certs/ca.crt"].value
802803
kubeconfig := o.files["/var/lib/kubelet/kubeconfig"].value
803804

804805
Expect(etcDefaultKubelet).NotTo(BeEmpty())
805806
Expect(etcDefaultKubeletService).NotTo(BeEmpty())
806807
Expect(kubeletSh).NotTo(BeEmpty())
808+
Expect(validateCredentials).ToNot(BeEmpty())
807809
Expect(caCRT).NotTo(BeEmpty())
808810
Expect(kubeconfig).ToNot(BeEmpty())
809811

@@ -822,13 +824,15 @@ var _ = Describe("Assert generated customData and cseCmd", func() {
822824
etcDefaultKubelet := o.files["/etc/default/kubelet"].value
823825
etcDefaultKubeletService := o.files["/etc/systemd/system/kubelet.service"].value
824826
kubeletSh := o.files["/opt/azure/containers/kubelet.sh"].value
827+
validateCredentials := o.files["/opt/azure/containers/validate-kubelet-credentials.sh"].value
825828
bootstrapKubeconfig := o.files["/var/lib/kubelet/bootstrap-kubeconfig"].value
826829
caCRT := o.files["/etc/kubernetes/certs/ca.crt"].value
827830

828831
Expect(etcDefaultKubelet).NotTo(BeEmpty())
829832
Expect(bootstrapKubeconfig).NotTo(BeEmpty())
830833
Expect(kubeletSh).NotTo(BeEmpty())
831834
Expect(etcDefaultKubeletService).NotTo(BeEmpty())
835+
Expect(validateCredentials).ToNot(BeEmpty())
832836
Expect(caCRT).NotTo(BeEmpty())
833837

834838
Expect(bootstrapKubeconfig).To(ContainSubstring("token"))

pkg/agent/const.go

+1
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ const (
7373
migPartitionScript = "linux/cloud-init/artifacts/mig-partition.sh"
7474
migPartitionSystemdService = "linux/cloud-init/artifacts/mig-partition.service"
7575
ensureIMDSRestrictionScript = "linux/cloud-init/artifacts/ensure_imds_restriction.sh"
76+
validateKubeletCredentialsScript = "linux/cloud-init/artifacts/validate-kubelet-credentials.sh"
7677

7778
// scripts and service for enabling ipv6 dual stack.
7879
dhcpv6SystemdService = "linux/cloud-init/artifacts/dhcpv6.service"

0 commit comments

Comments
 (0)