From 8151f1202390d531b27b716e6ea4b6895ed13343 Mon Sep 17 00:00:00 2001 From: Chun-Hung Tseng Date: Tue, 21 Jan 2025 21:41:06 +0000 Subject: [PATCH 1/5] Ground work for partially downgraded cluster cancellation Signed-off-by: Chun-Hung Tseng --- tests/e2e/cluster_downgrade_test.go | 36 ++++++++++++++++++++--------- tests/framework/e2e/downgrade.go | 11 ++++++--- 2 files changed, 33 insertions(+), 14 deletions(-) diff --git a/tests/e2e/cluster_downgrade_test.go b/tests/e2e/cluster_downgrade_test.go index 341f8528249..c5abfbef7dd 100644 --- a/tests/e2e/cluster_downgrade_test.go +++ b/tests/e2e/cluster_downgrade_test.go @@ -17,12 +17,14 @@ package e2e import ( "context" "fmt" + "math/rand" "testing" "time" "github.com/coreos/go-semver/semver" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "go.uber.org/zap" "go.etcd.io/etcd/api/v3/version" "go.etcd.io/etcd/client/pkg/v3/fileutil" @@ -43,41 +45,50 @@ const ( noCancellation CancellationState = iota cancelRightBeforeEnable cancelRightAfterEnable + cancelAfterDowngradingSome ) func TestDowngradeUpgradeClusterOf1(t *testing.T) { - testDowngradeUpgrade(t, 1, false, noCancellation) + testDowngradeUpgrade(t, 1, 1, false, noCancellation) } func TestDowngradeUpgradeClusterOf3(t *testing.T) { - testDowngradeUpgrade(t, 3, false, noCancellation) + testDowngradeUpgrade(t, 3, 3, false, noCancellation) } func TestDowngradeUpgradeClusterOf1WithSnapshot(t *testing.T) { - testDowngradeUpgrade(t, 1, true, noCancellation) + testDowngradeUpgrade(t, 1, 1, true, noCancellation) } func TestDowngradeUpgradeClusterOf3WithSnapshot(t *testing.T) { - testDowngradeUpgrade(t, 3, true, noCancellation) + testDowngradeUpgrade(t, 3, 3, true, noCancellation) } func TestDowngradeCancellationWithoutEnablingClusterOf1(t *testing.T) { - testDowngradeUpgrade(t, 1, false, cancelRightBeforeEnable) + testDowngradeUpgrade(t, 1, 1, false, cancelRightBeforeEnable) } func TestDowngradeCancellationRightAfterEnablingClusterOf1(t *testing.T) 
{ - testDowngradeUpgrade(t, 1, false, cancelRightAfterEnable) + testDowngradeUpgrade(t, 1, 1, false, cancelRightAfterEnable) } func TestDowngradeCancellationWithoutEnablingClusterOf3(t *testing.T) { - testDowngradeUpgrade(t, 3, false, cancelRightBeforeEnable) + testDowngradeUpgrade(t, 3, 3, false, cancelRightBeforeEnable) } func TestDowngradeCancellationRightAfterEnablingClusterOf3(t *testing.T) { - testDowngradeUpgrade(t, 3, false, cancelRightAfterEnable) + testDowngradeUpgrade(t, 3, 3, false, cancelRightAfterEnable) } -func testDowngradeUpgrade(t *testing.T, clusterSize int, triggerSnapshot bool, triggerCancellation CancellationState) { +// func TestDowngradeCancellationAfterDowngrading1InClusterOf3(t *testing.T) { +// testDowngradeUpgrade(t, 1, 3, false, cancelAfterDowngradingSome) +// } + +// func TestDowngradeCancellationAfterDowngrading2InClusterOf3(t *testing.T) { +// testDowngradeUpgrade(t, 2, 3, false, cancelAfterDowngradingSome) +// } + +func testDowngradeUpgrade(t *testing.T, cancellationSize int, clusterSize int, triggerSnapshot bool, triggerCancellation CancellationState) { currentEtcdBinary := e2e.BinPath.Etcd lastReleaseBinary := e2e.BinPath.EtcdLastRelease if !fileutil.Exist(lastReleaseBinary) { @@ -148,8 +159,11 @@ func testDowngradeUpgrade(t *testing.T, clusterSize int, triggerSnapshot bool, t return // No need to perform downgrading, end the test here } + membersToChange := rand.Perm(len(epc.Procs))[:cancellationSize] + t.Logf(fmt.Sprintf("Elect members for operations"), zap.Any("members", membersToChange)) + t.Logf("Starting downgrade process to %q", lastVersionStr) - err = e2e.DowngradeUpgradeMembers(t, nil, epc, len(epc.Procs), currentVersion, lastClusterVersion) + err = e2e.DowngradeUpgradeMembersByID(t, nil, epc, membersToChange, currentVersion, lastClusterVersion) require.NoError(t, err) e2e.AssertProcessLogs(t, leader(t, epc), "the cluster has been downgraded") @@ -176,7 +190,7 @@ func testDowngradeUpgrade(t *testing.T, clusterSize int, 
triggerSnapshot bool, t beforeMembers, beforeKV = getMembersAndKeys(t, cc) t.Logf("Starting upgrade process to %q", currentVersionStr) - err = e2e.DowngradeUpgradeMembers(t, nil, epc, len(epc.Procs), lastClusterVersion, currentVersion) + err = e2e.DowngradeUpgradeMembersByID(t, nil, epc, membersToChange, lastClusterVersion, currentVersion) require.NoError(t, err) t.Log("Upgrade complete") diff --git a/tests/framework/e2e/downgrade.go b/tests/framework/e2e/downgrade.go index 52a7c4c8744..0376aec9ebc 100644 --- a/tests/framework/e2e/downgrade.go +++ b/tests/framework/e2e/downgrade.go @@ -65,6 +65,13 @@ func DowngradeCancel(t *testing.T, epc *EtcdProcessCluster) { } func DowngradeUpgradeMembers(t *testing.T, lg *zap.Logger, clus *EtcdProcessCluster, numberOfMembersToChange int, currentVersion, targetVersion *semver.Version) error { + membersToChange := rand.Perm(len(clus.Procs))[:numberOfMembersToChange] + t.Logf(fmt.Sprintf("Elect members for operations"), zap.Any("members", membersToChange)) + + return DowngradeUpgradeMembersByID(t, lg, clus, membersToChange, currentVersion, targetVersion) +} + +func DowngradeUpgradeMembersByID(t *testing.T, lg *zap.Logger, clus *EtcdProcessCluster, membersToChange []int, currentVersion, targetVersion *semver.Version) error { if lg == nil { lg = clus.lg } @@ -75,8 +82,6 @@ func DowngradeUpgradeMembers(t *testing.T, lg *zap.Logger, clus *EtcdProcessClus opString = "downgrading" newExecPath = BinPath.EtcdLastRelease } - membersToChange := rand.Perm(len(clus.Procs))[:numberOfMembersToChange] - lg.Info(fmt.Sprintf("Test %s members", opString), zap.Any("members", membersToChange)) // Need to wait health interval for cluster to prepare for downgrade/upgrade time.Sleep(etcdserver.HealthInterval) @@ -100,7 +105,7 @@ func DowngradeUpgradeMembers(t *testing.T, lg *zap.Logger, clus *EtcdProcessClus lg.Info("Validating versions") for _, memberID := range membersToChange { member := clus.Procs[memberID] - if isDowngrade || 
numberOfMembersToChange == len(clus.Procs) { + if isDowngrade || len(membersToChange) == len(clus.Procs) { ValidateVersion(t, clus.Cfg, member, version.Versions{ Cluster: targetVersion.String(), Server: targetVersion.String(), From 412bb69ea15e08475bd9b11418a512b21b2161cb Mon Sep 17 00:00:00 2001 From: Chun-Hung Tseng Date: Tue, 21 Jan 2025 22:24:15 +0000 Subject: [PATCH 2/5] Complete partial downgraded cluster cancellation Signed-off-by: Chun-Hung Tseng --- tests/e2e/cluster_downgrade_test.go | 57 +++++++++++++++++++++-------- tests/framework/e2e/downgrade.go | 8 ++-- 2 files changed, 46 insertions(+), 19 deletions(-) diff --git a/tests/e2e/cluster_downgrade_test.go b/tests/e2e/cluster_downgrade_test.go index c5abfbef7dd..90beff206da 100644 --- a/tests/e2e/cluster_downgrade_test.go +++ b/tests/e2e/cluster_downgrade_test.go @@ -45,7 +45,7 @@ const ( noCancellation CancellationState = iota cancelRightBeforeEnable cancelRightAfterEnable - cancelAfterDowngradingSome + cancelAfterDowngrading ) func TestDowngradeUpgradeClusterOf1(t *testing.T) { @@ -80,13 +80,13 @@ func TestDowngradeCancellationRightAfterEnablingClusterOf3(t *testing.T) { testDowngradeUpgrade(t, 3, 3, false, cancelRightAfterEnable) } -// func TestDowngradeCancellationAfterDowngrading1InClusterOf3(t *testing.T) { -// testDowngradeUpgrade(t, 1, 3, false, cancelAfterDowngradingSome) -// } +func TestDowngradeCancellationAfterDowngrading1InClusterOf3(t *testing.T) { + testDowngradeUpgrade(t, 1, 3, false, cancelAfterDowngrading) +} -// func TestDowngradeCancellationAfterDowngrading2InClusterOf3(t *testing.T) { -// testDowngradeUpgrade(t, 2, 3, false, cancelAfterDowngradingSome) -// } +func TestDowngradeCancellationAfterDowngrading2InClusterOf3(t *testing.T) { + testDowngradeUpgrade(t, 2, 3, false, cancelAfterDowngrading) +} func testDowngradeUpgrade(t *testing.T, cancellationSize int, clusterSize int, triggerSnapshot bool, triggerCancellation CancellationState) { currentEtcdBinary := e2e.BinPath.Etcd @@ 
-146,7 +146,7 @@ func testDowngradeUpgrade(t *testing.T, cancellationSize int, clusterSize int, t t.Logf("Cancelling downgrade before enabling") e2e.DowngradeCancel(t, epc) t.Log("Downgrade cancelled, validating if cluster is in the right state") - e2e.ValidateMemberVersions(t, epc, generateIdenticalVersions(clusterSize, currentVersionStr)) + e2e.ValidateMemberVersions(t, epc, generateIdenticalVersions(clusterSize, currentVersion)) return // No need to perform downgrading, end the test here } @@ -155,17 +155,19 @@ func testDowngradeUpgrade(t *testing.T, cancellationSize int, clusterSize int, t t.Logf("Cancelling downgrade right after enabling (no node is downgraded yet)") e2e.DowngradeCancel(t, epc) t.Log("Downgrade cancelled, validating if cluster is in the right state") - e2e.ValidateMemberVersions(t, epc, generateIdenticalVersions(clusterSize, currentVersionStr)) + e2e.ValidateMemberVersions(t, epc, generateIdenticalVersions(clusterSize, currentVersion)) return // No need to perform downgrading, end the test here } membersToChange := rand.Perm(len(epc.Procs))[:cancellationSize] - t.Logf(fmt.Sprintf("Elect members for operations"), zap.Any("members", membersToChange)) + t.Logf(fmt.Sprintln("Elect members for operations"), zap.Any("members", membersToChange)) t.Logf("Starting downgrade process to %q", lastVersionStr) err = e2e.DowngradeUpgradeMembersByID(t, nil, epc, membersToChange, currentVersion, lastClusterVersion) require.NoError(t, err) - e2e.AssertProcessLogs(t, leader(t, epc), "the cluster has been downgraded") + if len(membersToChange) == len(epc.Procs) { + e2e.AssertProcessLogs(t, leader(t, epc), "the cluster has been downgraded") + } t.Log("Downgrade complete") afterMembers, afterKV := getMembersAndKeys(t, cc) @@ -176,6 +178,13 @@ func testDowngradeUpgrade(t *testing.T, cancellationSize int, clusterSize int, t t.Log("Waiting health interval to required to make membership changes") time.Sleep(etcdserver.HealthInterval) } + + if triggerCancellation == 
cancelAfterDowngrading { + e2e.DowngradeCancel(t, epc) + t.Log("Downgrade cancelled, validating if cluster is in the right state") + e2e.ValidateMemberVersions(t, epc, generatePartialCancellationVersions(clusterSize, membersToChange, lastClusterVersion)) + } + t.Log("Adding learner to test membership, but avoid breaking quorum") resp, err = cc.MemberAddAsLearner(context.Background(), "fake2", []string{"http://127.0.0.1:1002"}) require.NoError(t, err) @@ -290,16 +299,34 @@ func getMembersAndKeys(t *testing.T, cc *e2e.EtcdctlV3) (*clientv3.MemberListRes return members, kvs } -func generateIdenticalVersions(clusterSize int, currentVersion string) []*version.Versions { +func generateIdenticalVersions(clusterSize int, ver *semver.Version) []*version.Versions { + ret := make([]*version.Versions, clusterSize) + + for i := range clusterSize { + ret[i] = &version.Versions{ + Cluster: ver.String(), + Server: ver.String(), + Storage: ver.String(), + } + } + + return ret +} + +func generatePartialCancellationVersions(clusterSize int, membersToChange []int, ver *semver.Version) []*version.Versions { ret := make([]*version.Versions, clusterSize) for i := range clusterSize { ret[i] = &version.Versions{ - Cluster: currentVersion, - Server: currentVersion, - Storage: currentVersion, + Cluster: ver.String(), + Server: e2e.OffsetMinor(ver, 1).String(), + Storage: "", } } + for i := range membersToChange { + ret[membersToChange[i]].Server = ver.String() + } + return ret } diff --git a/tests/framework/e2e/downgrade.go b/tests/framework/e2e/downgrade.go index 0376aec9ebc..e521012881a 100644 --- a/tests/framework/e2e/downgrade.go +++ b/tests/framework/e2e/downgrade.go @@ -44,7 +44,7 @@ func DowngradeEnable(t *testing.T, epc *EtcdProcessCluster, ver *semver.Version) for i := 0; i < len(epc.Procs); i++ { ValidateVersion(t, epc.Cfg, epc.Procs[i], version.Versions{ Cluster: ver.String(), - Server: offsetMinor(ver, 1).String(), + Server: OffsetMinor(ver, 1).String(), Storage: ver.String(), }) 
AssertProcessLogs(t, epc.Procs[i], "The server is ready to downgrade") @@ -66,7 +66,7 @@ func DowngradeCancel(t *testing.T, epc *EtcdProcessCluster) { func DowngradeUpgradeMembers(t *testing.T, lg *zap.Logger, clus *EtcdProcessCluster, numberOfMembersToChange int, currentVersion, targetVersion *semver.Version) error { membersToChange := rand.Perm(len(clus.Procs))[:numberOfMembersToChange] - t.Logf(fmt.Sprintf("Elect members for operations"), zap.Any("members", membersToChange)) + t.Logf("Elect members for operations: %v", membersToChange) return DowngradeUpgradeMembersByID(t, lg, clus, membersToChange, currentVersion, targetVersion) } @@ -148,8 +148,8 @@ func ValidateVersion(t *testing.T, cfg *EtcdProcessClusterConfig, member EtcdPro }) } -// offsetMinor returns the version with offset from the original minor, with the same major. -func offsetMinor(v *semver.Version, offset int) *semver.Version { +// OffsetMinor returns the version with offset from the original minor, with the same major. 
+func OffsetMinor(v *semver.Version, offset int) *semver.Version { var minor int64 if offset >= 0 { minor = v.Minor + int64(offset) From 01690d8c663b3c15b7a972b16af76f7b99353918 Mon Sep 17 00:00:00 2001 From: Chun-Hung Tseng Date: Wed, 29 Jan 2025 10:11:43 +0000 Subject: [PATCH 3/5] Rename cancellationSize to numberOfMembersToDowngrade, and set it 0 for cases like cancelRightAfterEnable Signed-off-by: Chun-Hung Tseng --- tests/e2e/cluster_downgrade_test.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/e2e/cluster_downgrade_test.go b/tests/e2e/cluster_downgrade_test.go index 90beff206da..8b6b967fe6b 100644 --- a/tests/e2e/cluster_downgrade_test.go +++ b/tests/e2e/cluster_downgrade_test.go @@ -65,19 +65,19 @@ func TestDowngradeUpgradeClusterOf3WithSnapshot(t *testing.T) { } func TestDowngradeCancellationWithoutEnablingClusterOf1(t *testing.T) { - testDowngradeUpgrade(t, 1, 1, false, cancelRightBeforeEnable) + testDowngradeUpgrade(t, 0, 1, false, cancelRightBeforeEnable) } func TestDowngradeCancellationRightAfterEnablingClusterOf1(t *testing.T) { - testDowngradeUpgrade(t, 1, 1, false, cancelRightAfterEnable) + testDowngradeUpgrade(t, 0, 1, false, cancelRightAfterEnable) } func TestDowngradeCancellationWithoutEnablingClusterOf3(t *testing.T) { - testDowngradeUpgrade(t, 3, 3, false, cancelRightBeforeEnable) + testDowngradeUpgrade(t, 0, 3, false, cancelRightBeforeEnable) } func TestDowngradeCancellationRightAfterEnablingClusterOf3(t *testing.T) { - testDowngradeUpgrade(t, 3, 3, false, cancelRightAfterEnable) + testDowngradeUpgrade(t, 0, 3, false, cancelRightAfterEnable) } func TestDowngradeCancellationAfterDowngrading1InClusterOf3(t *testing.T) { @@ -88,7 +88,7 @@ func TestDowngradeCancellationAfterDowngrading2InClusterOf3(t *testing.T) { testDowngradeUpgrade(t, 2, 3, false, cancelAfterDowngrading) } -func testDowngradeUpgrade(t *testing.T, cancellationSize int, clusterSize int, triggerSnapshot bool, triggerCancellation 
CancellationState) { +func testDowngradeUpgrade(t *testing.T, numberOfMembersToDowngrade int, clusterSize int, triggerSnapshot bool, triggerCancellation CancellationState) { currentEtcdBinary := e2e.BinPath.Etcd lastReleaseBinary := e2e.BinPath.EtcdLastRelease if !fileutil.Exist(lastReleaseBinary) { @@ -159,7 +159,7 @@ func testDowngradeUpgrade(t *testing.T, cancellationSize int, clusterSize int, t return // No need to perform downgrading, end the test here } - membersToChange := rand.Perm(len(epc.Procs))[:cancellationSize] + membersToChange := rand.Perm(len(epc.Procs))[:numberOfMembersToDowngrade] t.Logf(fmt.Sprintln("Elect members for operations"), zap.Any("members", membersToChange)) t.Logf("Starting downgrade process to %q", lastVersionStr) From a998bec9a9637b4292e68efd78e49171073965ac Mon Sep 17 00:00:00 2001 From: Chun-Hung Tseng Date: Wed, 29 Jan 2025 11:03:12 +0000 Subject: [PATCH 4/5] Add more test cases Signed-off-by: Chun-Hung Tseng --- tests/e2e/cluster_downgrade_test.go | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/e2e/cluster_downgrade_test.go b/tests/e2e/cluster_downgrade_test.go index 8b6b967fe6b..b8d7581ac28 100644 --- a/tests/e2e/cluster_downgrade_test.go +++ b/tests/e2e/cluster_downgrade_test.go @@ -88,6 +88,22 @@ func TestDowngradeCancellationAfterDowngrading2InClusterOf3(t *testing.T) { testDowngradeUpgrade(t, 2, 3, false, cancelAfterDowngrading) } +func TestDowngradeCancellationAfterDowngrading1InClusterOf5(t *testing.T) { + testDowngradeUpgrade(t, 1, 5, false, cancelAfterDowngrading) +} + +func TestDowngradeCancellationAfterDowngrading2InClusterOf5(t *testing.T) { + testDowngradeUpgrade(t, 2, 5, false, cancelAfterDowngrading) +} + +func TestDowngradeCancellationAfterDowngrading3InClusterOf5(t *testing.T) { + testDowngradeUpgrade(t, 3, 5, false, cancelAfterDowngrading) +} + +func TestDowngradeCancellationAfterDowngrading4InClusterOf5(t *testing.T) { + testDowngradeUpgrade(t, 4, 5, false, cancelAfterDowngrading) +} + 
func testDowngradeUpgrade(t *testing.T, numberOfMembersToDowngrade int, clusterSize int, triggerSnapshot bool, triggerCancellation CancellationState) { currentEtcdBinary := e2e.BinPath.Etcd lastReleaseBinary := e2e.BinPath.EtcdLastRelease From bffb8f962459e023bd576f466be5301f326a13c3 Mon Sep 17 00:00:00 2001 From: Chun-Hung Tseng Date: Wed, 29 Jan 2025 19:09:37 +0000 Subject: [PATCH 5/5] Attempt to fix timeout flake Signed-off-by: Chun-Hung Tseng --- tests/e2e/cluster_downgrade_test.go | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/e2e/cluster_downgrade_test.go b/tests/e2e/cluster_downgrade_test.go index b8d7581ac28..215020c9a90 100644 --- a/tests/e2e/cluster_downgrade_test.go +++ b/tests/e2e/cluster_downgrade_test.go @@ -190,17 +190,18 @@ func testDowngradeUpgrade(t *testing.T, numberOfMembersToDowngrade int, clusterS assert.Equal(t, beforeKV.Kvs, afterKV.Kvs) assert.Equal(t, beforeMembers.Members, afterMembers.Members) - if len(epc.Procs) > 1 { - t.Log("Waiting health interval to required to make membership changes") - time.Sleep(etcdserver.HealthInterval) - } - if triggerCancellation == cancelAfterDowngrading { + time.Sleep(1 * time.Second) e2e.DowngradeCancel(t, epc) t.Log("Downgrade cancelled, validating if cluster is in the right state") e2e.ValidateMemberVersions(t, epc, generatePartialCancellationVersions(clusterSize, membersToChange, lastClusterVersion)) } + if len(epc.Procs) > 1 { + t.Log("Waiting health interval to required to make membership changes") + time.Sleep(etcdserver.HealthInterval) + } + t.Log("Adding learner to test membership, but avoid breaking quorum") resp, err = cc.MemberAddAsLearner(context.Background(), "fake2", []string{"http://127.0.0.1:1002"}) require.NoError(t, err)