From 757ff45c2be3dc2b958c191088a66bb28ed31aea Mon Sep 17 00:00:00 2001 From: Hongkai Liu Date: Thu, 27 Feb 2025 20:22:40 -0500 Subject: [PATCH 1/3] OCPBUGS-23514: Failing=Unknown upon long CO updating When it takes too long (90m+ for machine-config and 30m+ for others) to upgrade a cluster operator, clusterversion shows a message with the indication that the upgrade might hit some issue. This will cover the case in the related OCPBUGS-23538: for some reason, the pod under the deployment that manages the CO hit CrashLoopBackOff. Deployment controller does not give useful conditions in this situation [1]. Otherwise, checkDeploymentHealth [2] would detect it. Instead of CVO's figuring out the underlying pod's CrashLoopBackOff which might be better to be implemented by deployment controller, it is expected that our cluster admin starts to dig into the cluster when such a message pops up. In addition to the condition's message, we propagate Failing=Unknown to make it available for other automations, such as update-status command. [1]. https://github.com/kubernetes/kubernetes/issues/106054 [2]. 
https://github.com/openshift/cluster-version-operator/blob/08c0459df5096e9f16fad3af2831b62d06d415ee/lib/resourcebuilder/apps.go#L79-L136 --- pkg/cvo/status.go | 44 ++++++- pkg/cvo/status_test.go | 266 +++++++++++++++++++++++++++++++++++++++++ pkg/payload/task.go | 8 +- 3 files changed, 316 insertions(+), 2 deletions(-) diff --git a/pkg/cvo/status.go b/pkg/cvo/status.go index f6a433a0a..b0b64f554 100644 --- a/pkg/cvo/status.go +++ b/pkg/cvo/status.go @@ -321,6 +321,15 @@ func updateClusterVersionStatus(cvStatus *configv1.ClusterVersionStatus, status failingCondition.Reason = failingReason failingCondition.Message = failingMessage } + if failure != nil && + skipFailure && + (progressReason == "ClusterOperatorUpdating" || progressReason == "ClusterOperatorsUpdating") && + strings.HasPrefix(progressMessage, slowCOUpdatePrefix) { + progressMessage = strings.TrimPrefix(progressMessage, slowCOUpdatePrefix) + failingCondition.Status = configv1.ConditionUnknown + failingCondition.Reason = "SlowClusterOperator" + failingCondition.Message = progressMessage + } resourcemerge.SetOperatorStatusCondition(&cvStatus.Conditions, failingCondition) // update progressing @@ -531,6 +540,8 @@ func setDesiredReleaseAcceptedCondition(cvStatus *configv1.ClusterVersionStatus, } } +const slowCOUpdatePrefix = "Slow::" + // convertErrorToProgressing returns true if the provided status indicates a failure condition can be interpreted as // still making internal progress. The general error we try to suppress is an operator or operators still being // progressing AND the general payload task making progress towards its goal. 
The error's UpdateEffect determines @@ -549,7 +560,38 @@ func convertErrorToProgressing(now time.Time, statusFailure error) (reason strin case payload.UpdateEffectReport: return uErr.Reason, uErr.Error(), false case payload.UpdateEffectNone: - return uErr.Reason, fmt.Sprintf("waiting on %s", uErr.Name), true + var exceeded []string + names := uErr.Names + if len(names) == 0 { + names = []string{uErr.Name} + } + var machineConfig bool + for _, name := range names { + m := 30 * time.Minute + // It takes longer to upgrade MCO + if name == "machine-config" { + m = 3 * m + } + t := payload.COUpdateStartTimesGet(name) + if (!t.IsZero()) && t.Before(now.Add(-(m))) { + if name == "machine-config" { + machineConfig = true + } else { + exceeded = append(exceeded, name) + } + } + } + // returns true in those slow cases because it is still only a suspicion + if len(exceeded) > 0 && !machineConfig { + return uErr.Reason, fmt.Sprintf("%swaiting on %s over 30 minutes which is longer than expected", slowCOUpdatePrefix, strings.Join(exceeded, ", ")), true + } + if len(exceeded) > 0 && machineConfig { + return uErr.Reason, fmt.Sprintf("%swaiting on %s over 30 minutes and machine-config over 90 minutes which is longer than expected", slowCOUpdatePrefix, strings.Join(exceeded, ", ")), true + } + if len(exceeded) == 0 && machineConfig { + return uErr.Reason, fmt.Sprintf("%swaiting on machine-config over 90 minutes which is longer than expected", slowCOUpdatePrefix), true + } + return uErr.Reason, fmt.Sprintf("waiting on %s", strings.Join(names, ", ")), true case payload.UpdateEffectFail: return "", "", false case payload.UpdateEffectFailAfterInterval: diff --git a/pkg/cvo/status_test.go b/pkg/cvo/status_test.go index 1403c71f9..3f9f13d9b 100644 --- a/pkg/cvo/status_test.go +++ b/pkg/cvo/status_test.go @@ -5,6 +5,7 @@ import ( "fmt" "reflect" "testing" + "time" "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" @@ -353,12 +354,27 @@ func 
TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t type args struct { syncWorkerStatus *SyncWorkerStatus } + payload.COUpdateStartTimesEnsure("co-not-timeout") + payload.COUpdateStartTimesEnsure("co-bar-not-timeout") + payload.COUpdateStartTimesEnsure("co-foo-not-timeout") + payload.COUpdateStartTimesAt("co-timeout", time.Now().Add(-60*time.Minute)) + payload.COUpdateStartTimesAt("co-bar-timeout", time.Now().Add(-60*time.Minute)) + defer func() { + payload.COUpdateStartTimesRemove("co-not-timeout") + payload.COUpdateStartTimesRemove("co-bar-not-timeout") + payload.COUpdateStartTimesRemove("co-foo-not-timeout") + payload.COUpdateStartTimesRemove("co-timeout") + payload.COUpdateStartTimesRemove("co-bar-timeout") + }() + tests := []struct { name string args args shouldModifyWhenNotReconcilingAndHistoryNotEmpty bool expectedConditionNotModified *configv1.ClusterOperatorStatusCondition expectedConditionModified *configv1.ClusterOperatorStatusCondition + machineConfigTimeout bool + expectedProgressingCondition *configv1.ClusterOperatorStatusCondition }{ { name: "no errors are present", @@ -389,10 +405,12 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t name: "single UpdateEffectNone error", args: args{ syncWorkerStatus: &SyncWorkerStatus{ + Actual: configv1.Release{Version: "1.2.3"}, Failure: &payload.UpdateError{ UpdateEffect: payload.UpdateEffectNone, Reason: "ClusterOperatorUpdating", Message: "Cluster operator A is updating", + Name: "co-not-timeout", }, }, }, @@ -407,6 +425,110 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t Type: ClusterStatusFailing, Status: configv1.ConditionFalse, }, + expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{ + Type: configv1.OperatorProgressing, + Status: configv1.ConditionTrue, + Reason: "ClusterOperatorUpdating", + Message: "Working towards 1.2.3: waiting on co-not-timeout", + }, + }, + { + name: "single 
UpdateEffectNone error and timeout", + args: args{ + syncWorkerStatus: &SyncWorkerStatus{ + Actual: configv1.Release{Version: "1.2.3"}, + Failure: &payload.UpdateError{ + UpdateEffect: payload.UpdateEffectNone, + Reason: "ClusterOperatorUpdating", + Message: "Cluster operator A is updating", + Name: "co-timeout", + }, + }, + }, + expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{ + Type: ClusterStatusFailing, + Status: configv1.ConditionTrue, + Reason: "ClusterOperatorUpdating", + Message: "Cluster operator A is updating", + }, + shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true, + expectedConditionModified: &configv1.ClusterOperatorStatusCondition{ + Type: ClusterStatusFailing, + Status: configv1.ConditionUnknown, + Reason: "SlowClusterOperator", + Message: "waiting on co-timeout over 30 minutes which is longer than expected", + }, + expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{ + Type: configv1.OperatorProgressing, + Status: configv1.ConditionTrue, + Reason: "ClusterOperatorUpdating", + Message: "Working towards 1.2.3: waiting on co-timeout over 30 minutes which is longer than expected", + }, + }, + { + name: "single UpdateEffectNone error and machine-config", + args: args{ + syncWorkerStatus: &SyncWorkerStatus{ + Actual: configv1.Release{Version: "1.2.3"}, + Failure: &payload.UpdateError{ + UpdateEffect: payload.UpdateEffectNone, + Reason: "ClusterOperatorUpdating", + Message: "Cluster operator A is updating", + Name: "machine-config", + }, + }, + }, + expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{ + Type: ClusterStatusFailing, + Status: configv1.ConditionTrue, + Reason: "ClusterOperatorUpdating", + Message: "Cluster operator A is updating", + }, + shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true, + expectedConditionModified: &configv1.ClusterOperatorStatusCondition{ + Type: ClusterStatusFailing, + Status: configv1.ConditionFalse, + }, + expectedProgressingCondition: 
&configv1.ClusterOperatorStatusCondition{ + Type: configv1.OperatorProgressing, + Status: configv1.ConditionTrue, + Reason: "ClusterOperatorUpdating", + Message: "Working towards 1.2.3: waiting on machine-config", + }, + }, + { + name: "single UpdateEffectNone error and machine-config timeout", + args: args{ + syncWorkerStatus: &SyncWorkerStatus{ + Actual: configv1.Release{Version: "1.2.3"}, + Failure: &payload.UpdateError{ + UpdateEffect: payload.UpdateEffectNone, + Reason: "ClusterOperatorUpdating", + Message: "Cluster operator A is updating", + Name: "machine-config", + }, + }, + }, + machineConfigTimeout: true, + expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{ + Type: ClusterStatusFailing, + Status: configv1.ConditionTrue, + Reason: "ClusterOperatorUpdating", + Message: "Cluster operator A is updating", + }, + shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true, + expectedConditionModified: &configv1.ClusterOperatorStatusCondition{ + Type: ClusterStatusFailing, + Status: configv1.ConditionUnknown, + Reason: "SlowClusterOperator", + Message: "waiting on machine-config over 90 minutes which is longer than expected", + }, + expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{ + Type: configv1.OperatorProgressing, + Status: configv1.ConditionTrue, + Reason: "ClusterOperatorUpdating", + Message: "Working towards 1.2.3: waiting on machine-config over 90 minutes which is longer than expected", + }, }, { name: "single condensed UpdateEffectFail UpdateError", @@ -582,6 +704,137 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t Message: "Multiple errors are preventing progress:\n* Cluster operator A is not available\n* Cluster operator C is degraded", }, }, + { + name: "MultipleErrors: all updating and none slow", + args: args{ + syncWorkerStatus: &SyncWorkerStatus{ + Actual: configv1.Release{Version: "1.2.3"}, + Failure: &payload.UpdateError{ + UpdateEffect: payload.UpdateEffectNone, + 
Reason: "ClusterOperatorsUpdating", + Message: "some-message", + Names: []string{"co-not-timeout", "co-bar-not-timeout", "co-foo-not-timeout"}, + }, + }, + }, + expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{ + Type: ClusterStatusFailing, + Status: configv1.ConditionTrue, + Reason: "ClusterOperatorsUpdating", + Message: "some-message", + }, + shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true, + expectedConditionModified: &configv1.ClusterOperatorStatusCondition{ + Type: ClusterStatusFailing, + Status: configv1.ConditionFalse, + }, + expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{ + Type: configv1.OperatorProgressing, + Status: configv1.ConditionTrue, + Reason: "ClusterOperatorsUpdating", + Message: "Working towards 1.2.3: waiting on co-not-timeout, co-bar-not-timeout, co-foo-not-timeout", + }, + }, + { + name: "MultipleErrors: all updating and one slow", + args: args{ + syncWorkerStatus: &SyncWorkerStatus{ + Actual: configv1.Release{Version: "1.2.3"}, + Failure: &payload.UpdateError{ + UpdateEffect: payload.UpdateEffectNone, + Reason: "ClusterOperatorsUpdating", + Message: "some-message", + Names: []string{"co-timeout", "co-bar-not-timeout", "co-foo-not-timeout"}, + }, + }, + }, + expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{ + Type: ClusterStatusFailing, + Status: configv1.ConditionTrue, + Reason: "ClusterOperatorsUpdating", + Message: "some-message", + }, + shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true, + expectedConditionModified: &configv1.ClusterOperatorStatusCondition{ + Type: ClusterStatusFailing, + Status: configv1.ConditionUnknown, + Reason: "SlowClusterOperator", + Message: "waiting on co-timeout over 30 minutes which is longer than expected", + }, + expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{ + Type: configv1.OperatorProgressing, + Status: configv1.ConditionTrue, + Reason: "ClusterOperatorsUpdating", + Message: "Working towards 1.2.3: 
waiting on co-timeout over 30 minutes which is longer than expected", + }, + }, + { + name: "MultipleErrors: all updating and some slow", + args: args{ + syncWorkerStatus: &SyncWorkerStatus{ + Actual: configv1.Release{Version: "1.2.3"}, + Failure: &payload.UpdateError{ + UpdateEffect: payload.UpdateEffectNone, + Reason: "ClusterOperatorsUpdating", + Message: "some-message", + Names: []string{"co-timeout", "co-bar-timeout", "co-foo-not-timeout"}, + }, + }, + }, + expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{ + Type: ClusterStatusFailing, + Status: configv1.ConditionTrue, + Reason: "ClusterOperatorsUpdating", + Message: "some-message", + }, + shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true, + expectedConditionModified: &configv1.ClusterOperatorStatusCondition{ + Type: ClusterStatusFailing, + Status: configv1.ConditionUnknown, + Reason: "SlowClusterOperator", + Message: "waiting on co-timeout, co-bar-timeout over 30 minutes which is longer than expected", + }, + expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{ + Type: configv1.OperatorProgressing, + Status: configv1.ConditionTrue, + Reason: "ClusterOperatorsUpdating", + Message: "Working towards 1.2.3: waiting on co-timeout, co-bar-timeout over 30 minutes which is longer than expected", + }, + }, + { + name: "MultipleErrors: all updating and all slow", + args: args{ + syncWorkerStatus: &SyncWorkerStatus{ + Actual: configv1.Release{Version: "1.2.3"}, + Failure: &payload.UpdateError{ + UpdateEffect: payload.UpdateEffectNone, + Reason: "ClusterOperatorsUpdating", + Message: "some-message", + Names: []string{"co-timeout", "co-bar-timeout", "machine-config"}, + }, + }, + }, + machineConfigTimeout: true, + expectedConditionNotModified: &configv1.ClusterOperatorStatusCondition{ + Type: ClusterStatusFailing, + Status: configv1.ConditionTrue, + Reason: "ClusterOperatorsUpdating", + Message: "some-message", + }, + shouldModifyWhenNotReconcilingAndHistoryNotEmpty: true, + 
expectedConditionModified: &configv1.ClusterOperatorStatusCondition{ + Type: ClusterStatusFailing, + Status: configv1.ConditionUnknown, + Reason: "SlowClusterOperator", + Message: "waiting on co-timeout, co-bar-timeout over 30 minutes and machine-config over 90 minutes which is longer than expected", + }, + expectedProgressingCondition: &configv1.ClusterOperatorStatusCondition{ + Type: configv1.OperatorProgressing, + Status: configv1.ConditionTrue, + Reason: "ClusterOperatorsUpdating", + Message: "Working towards 1.2.3: waiting on co-timeout, co-bar-timeout over 30 minutes and machine-config over 90 minutes which is longer than expected", + }, + }, } for _, tc := range tests { tc := tc @@ -601,6 +854,11 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t {true, false}, {true, true}, } + if tc.machineConfigTimeout { + payload.COUpdateStartTimesAt("machine-config", time.Now().Add(-120*time.Minute)) + } else { + payload.COUpdateStartTimesAt("machine-config", time.Now().Add(-60*time.Minute)) + } for _, c := range combinations { tc.args.syncWorkerStatus.Reconciling = c.isReconciling cvStatus := &configv1.ClusterVersionStatus{} @@ -616,7 +874,15 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t if diff := cmp.Diff(expectedCondition, condition, ignoreLastTransitionTime); diff != "" { t.Errorf("unexpected condition when Reconciling == %t && isHistoryEmpty == %t\n:%s", c.isReconciling, c.isHistoryEmpty, diff) } + + if tc.expectedProgressingCondition != nil && !c.isReconciling && !c.isHistoryEmpty { + progressingCondition := resourcemerge.FindOperatorStatusCondition(cvStatus.Conditions, configv1.OperatorProgressing) + if diff := cmp.Diff(tc.expectedProgressingCondition, progressingCondition, ignoreLastTransitionTime); diff != "" { + t.Errorf("unexpected progressingCondition when Reconciling == %t && isHistoryEmpty == %t\n:%s", c.isReconciling, c.isHistoryEmpty, diff) + } + } } + 
payload.COUpdateStartTimesRemove("machine-config") }) } } diff --git a/pkg/payload/task.go b/pkg/payload/task.go index 1ac592a56..a260573af 100644 --- a/pkg/payload/task.go +++ b/pkg/payload/task.go @@ -47,9 +47,15 @@ func InitCOUpdateStartTimes() { // COUpdateStartTimesEnsure adds name to clusterOperatorUpdateStartTimes map and sets to // current time if name does not already exist in map. func COUpdateStartTimesEnsure(name string) { + COUpdateStartTimesAt(name, time.Now()) +} + +// COUpdateStartTimesAt adds name to clusterOperatorUpdateStartTimes map and sets to +// t if name does not already exist in map. +func COUpdateStartTimesAt(name string, t time.Time) { clusterOperatorUpdateStartTimes.lock.Lock() if _, ok := clusterOperatorUpdateStartTimes.m[name]; !ok { - clusterOperatorUpdateStartTimes.m[name] = time.Now() + clusterOperatorUpdateStartTimes.m[name] = t } clusterOperatorUpdateStartTimes.lock.Unlock() } From 21142e004017c9b6dfe19345a05373d21b667dad Mon Sep 17 00:00:00 2001 From: Hongkai Liu Date: Wed, 26 Mar 2025 09:19:39 -0400 Subject: [PATCH 2/3] Move slowCOUpdatePrefix Reason instead of Message --- pkg/cvo/status.go | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/pkg/cvo/status.go b/pkg/cvo/status.go index b0b64f554..3f2de898f 100644 --- a/pkg/cvo/status.go +++ b/pkg/cvo/status.go @@ -322,14 +322,13 @@ func updateClusterVersionStatus(cvStatus *configv1.ClusterVersionStatus, status failingCondition.Message = failingMessage } if failure != nil && - skipFailure && - (progressReason == "ClusterOperatorUpdating" || progressReason == "ClusterOperatorsUpdating") && - strings.HasPrefix(progressMessage, slowCOUpdatePrefix) { - progressMessage = strings.TrimPrefix(progressMessage, slowCOUpdatePrefix) + strings.HasPrefix(progressReason, slowCOUpdatePrefix) { failingCondition.Status = configv1.ConditionUnknown failingCondition.Reason = "SlowClusterOperator" failingCondition.Message = progressMessage } + progressReason = 
strings.TrimPrefix(progressReason, slowCOUpdatePrefix) + resourcemerge.SetOperatorStatusCondition(&cvStatus.Conditions, failingCondition) // update progressing @@ -583,13 +582,13 @@ func convertErrorToProgressing(now time.Time, statusFailure error) (reason strin } // returns true in those slow cases because it is still only a suspicion if len(exceeded) > 0 && !machineConfig { - return uErr.Reason, fmt.Sprintf("%swaiting on %s over 30 minutes which is longer than expected", slowCOUpdatePrefix, strings.Join(exceeded, ", ")), true + return slowCOUpdatePrefix + uErr.Reason, fmt.Sprintf("waiting on %s over 30 minutes which is longer than expected", strings.Join(exceeded, ", ")), true } if len(exceeded) > 0 && machineConfig { - return uErr.Reason, fmt.Sprintf("%swaiting on %s over 30 minutes and machine-config over 90 minutes which is longer than expected", slowCOUpdatePrefix, strings.Join(exceeded, ", ")), true + return slowCOUpdatePrefix + uErr.Reason, fmt.Sprintf("waiting on %s over 30 minutes and machine-config over 90 minutes which is longer than expected", strings.Join(exceeded, ", ")), true } if len(exceeded) == 0 && machineConfig { - return uErr.Reason, fmt.Sprintf("%swaiting on machine-config over 90 minutes which is longer than expected", slowCOUpdatePrefix), true + return slowCOUpdatePrefix + uErr.Reason, "waiting on machine-config over 90 minutes which is longer than expected", true } return uErr.Reason, fmt.Sprintf("waiting on %s", strings.Join(names, ", ")), true case payload.UpdateEffectFail: From f1950b879e29d03b2af3ba1f23f41ab918419818 Mon Sep 17 00:00:00 2001 From: Hongkai Liu Date: Wed, 26 Mar 2025 09:29:07 -0400 Subject: [PATCH 3/3] Extract two cases from convertErrorToProgressing into its own function --- pkg/cvo/status.go | 98 +++++++++++++++++++++++++---------------------- 1 file changed, 53 insertions(+), 45 deletions(-) diff --git a/pkg/cvo/status.go b/pkg/cvo/status.go index 3f2de898f..db0c44845 100644 --- a/pkg/cvo/status.go +++ 
b/pkg/cvo/status.go @@ -559,59 +559,67 @@ func convertErrorToProgressing(now time.Time, statusFailure error) (reason strin case payload.UpdateEffectReport: return uErr.Reason, uErr.Error(), false case payload.UpdateEffectNone: - var exceeded []string - names := uErr.Names - if len(names) == 0 { - names = []string{uErr.Name} - } - var machineConfig bool - for _, name := range names { - m := 30 * time.Minute - // It takes longer to upgrade MCO - if name == "machine-config" { - m = 3 * m - } - t := payload.COUpdateStartTimesGet(name) - if (!t.IsZero()) && t.Before(now.Add(-(m))) { - if name == "machine-config" { - machineConfig = true - } else { - exceeded = append(exceeded, name) - } - } - } - // returns true in those slow cases because it is still only a suspicion - if len(exceeded) > 0 && !machineConfig { - return slowCOUpdatePrefix + uErr.Reason, fmt.Sprintf("waiting on %s over 30 minutes which is longer than expected", strings.Join(exceeded, ", ")), true - } - if len(exceeded) > 0 && machineConfig { - return slowCOUpdatePrefix + uErr.Reason, fmt.Sprintf("waiting on %s over 30 minutes and machine-config over 90 minutes which is longer than expected", strings.Join(exceeded, ", ")), true - } - if len(exceeded) == 0 && machineConfig { - return slowCOUpdatePrefix + uErr.Reason, "waiting on machine-config over 90 minutes which is longer than expected", true - } - return uErr.Reason, fmt.Sprintf("waiting on %s", strings.Join(names, ", ")), true + return convertErrorToProgressingForUpdateEffectNone(uErr, now) case payload.UpdateEffectFail: return "", "", false case payload.UpdateEffectFailAfterInterval: - var exceeded []string - threshold := now.Add(-(40 * time.Minute)) - names := uErr.Names - if len(names) == 0 { - names = []string{uErr.Name} - } - for _, name := range names { - if payload.COUpdateStartTimesGet(name).Before(threshold) { + return convertErrorToProgressingForUpdateEffectFailAfterInterval(uErr, now) + } + return "", "", false +} + +func 
convertErrorToProgressingForUpdateEffectNone(uErr *payload.UpdateError, now time.Time) (string, string, bool) { + var exceeded []string + names := uErr.Names + if len(names) == 0 { + names = []string{uErr.Name} + } + var machineConfig bool + for _, name := range names { + m := 30 * time.Minute + // It takes longer to upgrade MCO + if name == "machine-config" { + m = 3 * m + } + t := payload.COUpdateStartTimesGet(name) + if (!t.IsZero()) && t.Before(now.Add(-(m))) { + if name == "machine-config" { + machineConfig = true + } else { exceeded = append(exceeded, name) } } - if len(exceeded) > 0 { - return uErr.Reason, fmt.Sprintf("wait has exceeded 40 minutes for these operators: %s", strings.Join(exceeded, ", ")), false - } else { - return uErr.Reason, fmt.Sprintf("waiting up to 40 minutes on %s", uErr.Name), true + } + // returns true in those slow cases because it is still only a suspicion + if len(exceeded) > 0 && !machineConfig { + return slowCOUpdatePrefix + uErr.Reason, fmt.Sprintf("waiting on %s over 30 minutes which is longer than expected", strings.Join(exceeded, ", ")), true + } + if len(exceeded) > 0 && machineConfig { + return slowCOUpdatePrefix + uErr.Reason, fmt.Sprintf("waiting on %s over 30 minutes and machine-config over 90 minutes which is longer than expected", strings.Join(exceeded, ", ")), true + } + if len(exceeded) == 0 && machineConfig { + return slowCOUpdatePrefix + uErr.Reason, "waiting on machine-config over 90 minutes which is longer than expected", true + } + return uErr.Reason, fmt.Sprintf("waiting on %s", strings.Join(names, ", ")), true +} + +func convertErrorToProgressingForUpdateEffectFailAfterInterval(uErr *payload.UpdateError, now time.Time) (string, string, bool) { + var exceeded []string + threshold := now.Add(-(40 * time.Minute)) + names := uErr.Names + if len(names) == 0 { + names = []string{uErr.Name} + } + for _, name := range names { + if payload.COUpdateStartTimesGet(name).Before(threshold) { + exceeded = append(exceeded, 
name) } } - return "", "", false + if len(exceeded) > 0 { + return uErr.Reason, fmt.Sprintf("wait has exceeded 40 minutes for these operators: %s", strings.Join(exceeded, ", ")), false + } else { + return uErr.Reason, fmt.Sprintf("waiting up to 40 minutes on %s", uErr.Name), true + } } // syncFailingStatus handles generic errors in the cluster version. It tries to preserve