From c059e51008cac4c0307f24952b361bced7da2e3c Mon Sep 17 00:00:00 2001 From: Matheus Pimenta Date: Sat, 9 Aug 2025 18:36:43 +0100 Subject: [PATCH 1/2] Remove tautology Signed-off-by: Matheus Pimenta --- internal/reconcile/atomic_release.go | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/internal/reconcile/atomic_release.go b/internal/reconcile/atomic_release.go index 457a28a30..07eacf2c7 100644 --- a/internal/reconcile/atomic_release.go +++ b/internal/reconcile/atomic_release.go @@ -283,16 +283,11 @@ func (r *AtomicRelease) Reconcile(ctx context.Context, req *Request) error { conditions.MarkReconciling(req.Object, meta.ProgressingWithRetryReason, "%s", conditions.GetMessage(req.Object, meta.ReadyCondition)) return ErrMustRequeue } - // Check if retries have exhausted after remediation for early - // stall condition detection. - if remediation != nil && remediation.RetriesExhausted(req.Object) { - conditions.MarkStalled(req.Object, "RetriesExceeded", "Failed to %s after %d attempt(s)", - req.Object.Status.LastAttemptedReleaseAction, req.Object.GetActiveRemediation().GetFailureCount(req.Object)) - return ErrExceededMaxRetries - } - conditions.Delete(req.Object, meta.ReconcilingCondition) - return nil + // Retries have exhausted after remediation for early stall condition detection. + conditions.MarkStalled(req.Object, "RetriesExceeded", "Failed to %s after %d attempt(s)", + req.Object.Status.LastAttemptedReleaseAction, req.Object.GetActiveRemediation().GetFailureCount(req.Object)) + return ErrExceededMaxRetries } // Append the type to the set of action types we have performed. From 0325b55b05c0756c2f7805eebb9a25f1b2ce6dba Mon Sep 17 00:00:00 2001 From: Matheus Pimenta Date: Fri, 8 Aug 2025 11:49:58 +0100 Subject: [PATCH 2/2] Introduce `RetryOnFailure` lifecycle management strategy Signed-off-by: Matheus Pimenta --- api/v2/helmrelease_types.go | 123 ++++++++- api/v2/zz_generated.deepcopy.go | 50 ++++ .../helm.toolkit.fluxcd.io_helmreleases.yaml | 51 +++- docs/api/v2/helm.md | 140 ++++++++++- docs/spec/v2/helmreleases.md | 49 ++++ internal/controller/helmrelease_controller.go | 8 +- .../controller/helmrelease_controller_test.go | 86 +++++++ internal/reconcile/atomic_release.go | 34 +++ internal/reconcile/atomic_release_test.go | 233 ++++++++++++++++++ internal/reconcile/install.go | 6 + internal/reconcile/install_test.go | 26 ++ internal/reconcile/upgrade.go | 6 + internal/reconcile/upgrade_test.go | 26 ++ 13 files changed, 832 insertions(+), 6 deletions(-) diff --git a/api/v2/helmrelease_types.go b/api/v2/helmrelease_types.go index ed34e5254..4e3efb7b5 100644 --- a/api/v2/helmrelease_types.go +++ b/api/v2/helmrelease_types.go @@ -437,6 +437,20 @@ type Remediation interface { RetriesExhausted(hr *HelmRelease) bool } +// Strategy defines a consistent interface for InstallStrategy and +// UpgradeStrategy. +// +kubebuilder:object:generate=false +type Strategy interface { + GetRetry() Retry +} + +// Retry defines a consistent interface for retry strategies from +// InstallStrategy and UpgradeStrategy. +// +kubebuilder:object:generate=false +type Retry interface { + GetRetryInterval() time.Duration +} + // Install holds the configuration for Helm install actions performed for this // HelmRelease. type Install struct { @@ -448,6 +462,11 @@ type Install struct { // +optional Timeout *metav1.Duration `json:"timeout,omitempty"` + // Strategy defines the install strategy to use for this HelmRelease. + // Defaults to 'RemediateOnFailure'. 
+ // +optional + Strategy *InstallStrategy `json:"strategy,omitempty"` + // Remediation holds the remediation configuration for when the Helm install // action for the HelmRelease fails. The default is to not perform any action. // +optional @@ -541,6 +560,41 @@ func (in Install) GetRemediation() Remediation { return *in.Remediation } +// GetRetry returns the configured retry strategy for the Helm install +// action. +func (in Install) GetRetry() Retry { + if in.Strategy == nil || in.Strategy.Name != string(ActionStrategyRetryOnFailure) { + return nil + } + return in.Strategy +} + +// InstallStrategy holds the configuration for Helm install strategy. +// +kubebuilder:validation:XValidation:rule="!has(self.retryInterval) || self.name != 'RemediateOnFailure'", message=".retryInterval cannot be set when .name is 'RemediateOnFailure'" +type InstallStrategy struct { + // Name of the install strategy. + // +kubebuilder:validation:Enum=RemediateOnFailure;RetryOnFailure + // +required + Name string `json:"name"` + + // RetryInterval is the interval at which to retry a failed install. + // Can be used only when Name is set to RetryOnFailure. + // Defaults to '5m'. + // +kubebuilder:validation:Type=string + // +kubebuilder:validation:Pattern="^([0-9]+(\\.[0-9]+)?(ms|s|m|h))+$" + // +optional + RetryInterval *metav1.Duration `json:"retryInterval,omitempty"` +} + +// GetRetryInterval returns the configured retry interval for the Helm install +// action, or the default. +func (in InstallStrategy) GetRetryInterval() time.Duration { + if in.RetryInterval == nil { + return 5 * time.Minute + } + return in.RetryInterval.Duration +} + // InstallRemediation holds the configuration for Helm install remediation. type InstallRemediation struct { // Retries is the number of retries that should be attempted on failures before @@ -631,6 +685,11 @@ type Upgrade struct { // +optional Timeout *metav1.Duration `json:"timeout,omitempty"` + // Strategy defines the upgrade strategy to use for this HelmRelease. + // Defaults to 'RemediateOnFailure'. + // +optional + Strategy *UpgradeStrategy `json:"strategy,omitempty"` + // Remediation holds the remediation configuration for when the Helm upgrade // action for the HelmRelease fails. The default is to not perform any action. // +optional @@ -719,6 +778,41 @@ func (in Upgrade) GetRemediation() Remediation { return *in.Remediation } +// GetRetry returns the configured retry strategy for the Helm upgrade +// action. +func (in Upgrade) GetRetry() Retry { + if in.Strategy == nil || in.Strategy.Name != string(ActionStrategyRetryOnFailure) { + return nil + } + return in.Strategy +} + +// UpgradeStrategy holds the configuration for Helm upgrade strategy. +// +kubebuilder:validation:XValidation:rule="!has(self.retryInterval) || self.name == 'RetryOnFailure'", message=".retryInterval can only be set when .name is 'RetryOnFailure'" +type UpgradeStrategy struct { + // Name of the upgrade strategy. + // +kubebuilder:validation:Enum=RemediateOnFailure;RetryOnFailure + // +required + Name string `json:"name"` + + // RetryInterval is the interval at which to retry a failed upgrade. + // Can be used only when Name is set to RetryOnFailure. + // Defaults to '5m'. + // +kubebuilder:validation:Type=string + // +kubebuilder:validation:Pattern="^([0-9]+(\\.[0-9]+)?(ms|s|m|h))+$" + // +optional + RetryInterval *metav1.Duration `json:"retryInterval,omitempty"` +} + +// GetRetryInterval returns the configured retry interval for the Helm upgrade +// action, or the default. 
+func (in UpgradeStrategy) GetRetryInterval() time.Duration { + if in.RetryInterval == nil { + return 5 * time.Minute + } + return in.RetryInterval.Duration +} + // UpgradeRemediation holds the configuration for Helm upgrade remediation. type UpgradeRemediation struct { // Retries is the number of retries that should be attempted on failures before @@ -791,6 +885,19 @@ func (in UpgradeRemediation) RetriesExhausted(hr *HelmRelease) bool { return in.Retries >= 0 && in.GetFailureCount(hr) > int64(in.Retries) } +// ActionStrategyName is a valid name for an action strategy. +type ActionStrategyName string + +const ( + // ActionStrategyRemediateOnFailure is the action strategy name for + // remediate on failure. + ActionStrategyRemediateOnFailure ActionStrategyName = "RemediateOnFailure" + + // ActionStrategyRetryOnFailure is the action strategy name for retry on + // failure. + ActionStrategyRetryOnFailure ActionStrategyName = "RetryOnFailure" +) + // RemediationStrategy returns the strategy to use to remediate a failed install // or upgrade. type RemediationStrategy string @@ -1012,7 +1119,8 @@ type HelmReleaseStatus struct { History Snapshots `json:"history,omitempty"` // LastAttemptedReleaseAction is the last release action performed for this - // HelmRelease. It is used to determine the active remediation strategy. + // HelmRelease. It is used to determine the active retry or remediation + // strategy. // +kubebuilder:validation:Enum=install;upgrade // +optional LastAttemptedReleaseAction ReleaseAction `json:"lastAttemptedReleaseAction,omitempty"` @@ -1195,6 +1303,19 @@ func (in HelmRelease) GetActiveRemediation() Remediation { } } +// GetActiveRetry returns the active retry configuration for the +// HelmRelease. +func (in HelmRelease) GetActiveRetry() Retry { + switch in.Status.LastAttemptedReleaseAction { + case ReleaseActionInstall: + return in.GetInstall().GetRetry() + case ReleaseActionUpgrade: + return in.GetUpgrade().GetRetry() + default: + return nil + } +} + // GetRequeueAfter returns the duration after which the HelmRelease // must be reconciled again. func (in HelmRelease) GetRequeueAfter() time.Duration { diff --git a/api/v2/zz_generated.deepcopy.go b/api/v2/zz_generated.deepcopy.go index c9f2b5076..57d818d3a 100644 --- a/api/v2/zz_generated.deepcopy.go +++ b/api/v2/zz_generated.deepcopy.go @@ -475,6 +475,11 @@ func (in *Install) DeepCopyInto(out *Install) { *out = new(v1.Duration) **out = **in } + if in.Strategy != nil { + in, out := &in.Strategy, &out.Strategy + *out = new(InstallStrategy) + (*in).DeepCopyInto(*out) + } if in.Remediation != nil { in, out := &in.Remediation, &out.Remediation *out = new(InstallRemediation) @@ -517,6 +522,26 @@ func (in *InstallRemediation) DeepCopy() *InstallRemediation { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *InstallStrategy) DeepCopyInto(out *InstallStrategy) { + *out = *in + if in.RetryInterval != nil { + in, out := &in.RetryInterval, &out.RetryInterval + *out = new(v1.Duration) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InstallStrategy. +func (in *InstallStrategy) DeepCopy() *InstallStrategy { + if in == nil { + return nil + } + out := new(InstallStrategy) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *Kustomize) DeepCopyInto(out *Kustomize) { *out = *in @@ -726,6 +751,11 @@ func (in *Upgrade) DeepCopyInto(out *Upgrade) { *out = new(v1.Duration) **out = **in } + if in.Strategy != nil { + in, out := &in.Strategy, &out.Strategy + *out = new(UpgradeStrategy) + (*in).DeepCopyInto(*out) + } if in.Remediation != nil { in, out := &in.Remediation, &out.Remediation *out = new(UpgradeRemediation) @@ -772,3 +802,23 @@ func (in *UpgradeRemediation) DeepCopy() *UpgradeRemediation { in.DeepCopyInto(out) return out } + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *UpgradeStrategy) DeepCopyInto(out *UpgradeStrategy) { + *out = *in + if in.RetryInterval != nil { + in, out := &in.RetryInterval, &out.RetryInterval + *out = new(v1.Duration) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new UpgradeStrategy. +func (in *UpgradeStrategy) DeepCopy() *UpgradeStrategy { + if in == nil { + return nil + } + out := new(UpgradeStrategy) + in.DeepCopyInto(out) + return out +} diff --git a/config/crd/bases/helm.toolkit.fluxcd.io_helmreleases.yaml b/config/crd/bases/helm.toolkit.fluxcd.io_helmreleases.yaml index 5c04b97b9..91a406347 100644 --- a/config/crd/bases/helm.toolkit.fluxcd.io_helmreleases.yaml +++ b/config/crd/bases/helm.toolkit.fluxcd.io_helmreleases.yaml @@ -445,6 +445,30 @@ spec: Deprecated use CRD policy (`crds`) attribute with value `Skip` instead. type: boolean + strategy: + description: |- + Strategy defines the install strategy to use for this HelmRelease. + Defaults to 'RemediateOnFailure'. + properties: + name: + description: Name of the install strategy. + enum: + - RemediateOnFailure + - RetryOnFailure + type: string + retryInterval: + description: |- + RetryInterval is the interval at which to retry a failed install. + Can be used only when Name is set to RetryOnFailure. + Defaults to '5m'. + pattern: ^([0-9]+(\.[0-9]+)?(ms|s|m|h))+$ + type: string + required: + - name + type: object + x-kubernetes-validations: + - message: .retryInterval cannot be set when .name is 'RemediateOnFailure' + rule: '!has(self.retryInterval) || self.name != ''RemediateOnFailure''' timeout: description: |- Timeout is the time to wait for any individual Kubernetes operation (like @@ -912,6 +936,30 @@ spec: - uninstall type: string type: object + strategy: + description: |- + Strategy defines the upgrade strategy to use for this HelmRelease. + Defaults to 'RemediateOnFailure'. + properties: + name: + description: Name of the upgrade strategy. + enum: + - RemediateOnFailure + - RetryOnFailure + type: string + retryInterval: + description: |- + RetryInterval is the interval at which to retry a failed upgrade. + Can be used only when Name is set to RetryOnFailure. + Defaults to '5m'. + pattern: ^([0-9]+(\.[0-9]+)?(ms|s|m|h))+$ + type: string + required: + - name + type: object + x-kubernetes-validations: + - message: .retryInterval can only be set when .name is 'RetryOnFailure' + rule: '!has(self.retryInterval) || self.name == ''RetryOnFailure''' timeout: description: |- Timeout is the time to wait for any individual Kubernetes operation (like @@ -1178,7 +1226,8 @@ spec: lastAttemptedReleaseAction: description: |- LastAttemptedReleaseAction is the last release action performed for this - HelmRelease. It is used to determine the active remediation strategy. + HelmRelease. It is used to determine the active retry or remediation + strategy. 
enum: - install - upgrade diff --git a/docs/api/v2/helm.md b/docs/api/v2/helm.md index 7546dc581..1c2abf3c8 100644 --- a/docs/api/v2/helm.md +++ b/docs/api/v2/helm.md @@ -424,6 +424,9 @@ HelmReleaseStatus +
ActionStrategyName (string alias)
+
+ActionStrategyName is a valid name for an action strategy.
+
 CRDsPolicy (string alias)

@@ -1676,7 +1679,8 @@ ReleaseAction
 (Optional)
 LastAttemptedReleaseAction is the last release action performed for this
-HelmRelease. It is used to determine the active remediation strategy.
+HelmRelease. It is used to determine the active retry or remediation
+strategy.

@@ -1934,6 +1938,21 @@ Jobs for hooks) during the performance of a Helm install action. Defaults to
+strategy (InstallStrategy)
+(Optional)
+Strategy defines the install strategy to use for this HelmRelease.
+Defaults to ‘RemediateOnFailure’.
 remediation

@@ -2156,6 +2175,54 @@ no retries remain. Defaults to ‘false’.
+InstallStrategy
+
+(Appears on: Install)
+
+InstallStrategy holds the configuration for Helm install strategy.
+
+name (string)
+Name of the install strategy.
+
+retryInterval (Kubernetes meta/v1.Duration)
+(Optional)
+RetryInterval is the interval at which to retry a failed install.
+Can be used only when Name is set to RetryOnFailure.
+Defaults to ‘5m’.
 Kustomize

@@ -2262,6 +2329,10 @@ UpgradeRemediation
 RemediationStrategy returns the strategy to use to remediate a failed install
 or upgrade.
+
+Retry
+
+Retry defines a consistent interface for retry strategies from
+InstallStrategy and UpgradeStrategy.
 Rollback

@@ -2585,6 +2656,10 @@ HelmReleaseStatus
 Snapshots is a list of Snapshot objects.
+
+Strategy
+
+Strategy defines a consistent interface for InstallStrategy and
+UpgradeStrategy.
 Test

@@ -2848,6 +2923,21 @@ Jobs for hooks) during the performance of a Helm upgrade action. Defaults to
+strategy (UpgradeStrategy)
+(Optional)
+Strategy defines the upgrade strategy to use for this HelmRelease.
+Defaults to ‘RemediateOnFailure’.
 remediation

@@ -3081,6 +3171,54 @@ RemediationStrategy
+UpgradeStrategy
+
+(Appears on: Upgrade)
+
+UpgradeStrategy holds the configuration for Helm upgrade strategy.
+
+name (string)
+Name of the upgrade strategy.
+
+retryInterval (Kubernetes meta/v1.Duration)
+(Optional)
+RetryInterval is the interval at which to retry a failed upgrade.
+Can be used only when Name is set to RetryOnFailure.
+Defaults to ‘5m’.
This page was automatically generated with gen-crd-api-reference-docs
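For reference, the `name` and `retryInterval` fields described above admit only two shapes; the fragments below are an illustrative sketch of what would sit under `.spec.install` or `.spec.upgrade` (values are examples only):

```yaml
# Explicit default: remediation rules apply; retryInterval may not be set.
strategy:
  name: RemediateOnFailure
```

```yaml
# Retry the failed action on an interval instead of remediating.
strategy:
  name: RetryOnFailure
  retryInterval: 10m # optional, defaults to 5m
```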
diff --git a/docs/spec/v2/helmreleases.md b/docs/spec/v2/helmreleases.md index 4f3f769de..8e2093dbe 100644 --- a/docs/spec/v2/helmreleases.md +++ b/docs/spec/v2/helmreleases.md @@ -555,6 +555,31 @@ The field offers the following subfields: - `.disableWaitForJobs` (Optional): Disables waiting for any Jobs to complete after the installation of the chart. Defaults to `false`. +#### Install strategy + +`.spec.install.strategy` is an optional field to specify the strategy +to use when running a Helm install action. + +The field offers the following subfields: + +- `.name` (Required): The name of the install strategy to use. One of + `RemediateOnFailure` or `RetryOnFailure`. + If the `.spec.install.strategy` field is not specified, the HelmRelease + reconciliation behaves as if `.spec.install.strategy.name` was set to + `RemediateOnFailure`. +- `.retryInterval` (Optional): The time to wait between retries of failed + releases when the install strategy is set to `RetryOnFailure`. Defaults + to `5m`. Cannot be used with `RemediateOnFailure`. + +The default `RemediateOnFailure` strategy applies the rules defined by the +`.spec.install.remediation` field to the install action, i.e. the same +behavior of the controller prior to the introduction of the `RetryOnFailure` +strategy. + +The `RetryOnFailure` strategy will retry a failed install with an upgrade +after the interval defined by the `.spec.install.strategy.retryInterval` +field. + #### Install remediation `.spec.install.remediation` is an optional field to configure the remediation @@ -606,6 +631,30 @@ The field offers the following subfields: last release while merging in overrides from [values](#values). Setting this flag makes the HelmRelease non-declarative. Defaults to `false`. +#### Upgrade strategy + +`.spec.upgrade.strategy` is an optional field to specify the strategy +to use when running a Helm upgrade action. + +The field offers the following subfields: + +- `.name` (Required): The name of the upgrade strategy to use. One of + `RemediateOnFailure` or `RetryOnFailure`. If the `.spec.upgrade.strategy` + field is not specified, the HelmRelease reconciliation behaves as if + `.spec.upgrade.strategy.name` was set to `RemediateOnFailure`. +- `.retryInterval` (Optional): The time to wait between retries of failed + releases when the upgrade strategy is set to `RetryOnFailure`. Defaults + to `5m`. Cannot be used with `RemediateOnFailure`. + +The default `RemediateOnFailure` strategy applies the rules defined by the +`.spec.upgrade.remediation` field to the upgrade action, i.e. the same +behavior of the controller prior to the introduction of the `RetryOnFailure` +strategy. + +The `RetryOnFailure` strategy will retry failed upgrades in a regular +interval defined by the `.spec.upgrade.strategy.retryInterval` field, +without applying any remediation. 
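To tie the install and upgrade strategy sections above together, a minimal sketch of a HelmRelease using `RetryOnFailure` could look like this (the chart, names, and intervals are illustrative only):

```yaml
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: podinfo
  namespace: default
spec:
  interval: 10m
  chart:
    spec:
      chart: podinfo
      sourceRef:
        kind: HelmRepository
        name: podinfo
  install:
    strategy:
      name: RetryOnFailure
      retryInterval: 2m
  upgrade:
    strategy:
      name: RetryOnFailure
      retryInterval: 2m
```

With this configuration a failed install or upgrade is retried roughly every two minutes (the controller adds jitter to the requeue interval), instead of being rolled back or uninstalled by the remediation rules.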
+ #### Upgrade remediation `.spec.upgrade.remediation` is an optional field to configure the remediation diff --git a/internal/controller/helmrelease_controller.go b/internal/controller/helmrelease_controller.go index 519038ebb..e29739706 100644 --- a/internal/controller/helmrelease_controller.go +++ b/internal/controller/helmrelease_controller.go @@ -392,10 +392,12 @@ func (r *HelmReleaseReconciler) reconcileRelease(ctx context.Context, patchHelpe Chart: loadedChart, Values: values, }); err != nil { - if errors.Is(err, intreconcile.ErrMustRequeue) { + switch { + case errors.Is(err, intreconcile.ErrRetryAfterInterval): + return jitter.JitteredRequeueInterval(ctrl.Result{RequeueAfter: obj.GetActiveRetry().GetRetryInterval()}), nil + case errors.Is(err, intreconcile.ErrMustRequeue): return ctrl.Result{Requeue: true}, nil - } - if interrors.IsOneOf(err, intreconcile.ErrExceededMaxRetries, intreconcile.ErrMissingRollbackTarget) { + case interrors.IsOneOf(err, intreconcile.ErrExceededMaxRetries, intreconcile.ErrMissingRollbackTarget): err = reconcile.TerminalError(err) } return ctrl.Result{}, err diff --git a/internal/controller/helmrelease_controller_test.go b/internal/controller/helmrelease_controller_test.go index 8d24da9f1..b571e74ff 100644 --- a/internal/controller/helmrelease_controller_test.go +++ b/internal/controller/helmrelease_controller_test.go @@ -663,6 +663,92 @@ func TestHelmReleaseReconciler_reconcileRelease(t *testing.T) { g.Expect(obj.Status.Failures).To(Equal(int64(1))) }) + t.Run("uses retry interval when the error ErrRetryAfterInterval is returned", func(t *testing.T) { + g := NewWithT(t) + + chartMock := testutil.BuildChart() + chartArtifact, err := testutil.SaveChartAsArtifact(chartMock, digest.SHA256, testServer.URL(), testServer.Root()) + g.Expect(err).ToNot(HaveOccurred()) + + chart := &sourcev1.HelmChart{ + ObjectMeta: metav1.ObjectMeta{ + Name: "chart", + Namespace: "mock", + Generation: 1, + }, + Status: sourcev1.HelmChartStatus{ + ObservedGeneration: 1, + Artifact: chartArtifact, + Conditions: []metav1.Condition{ + { + Type: meta.ReadyCondition, + Status: metav1.ConditionTrue, + }, + }, + }, + } + + obj := &v2.HelmRelease{ + ObjectMeta: metav1.ObjectMeta{ + Name: "release", + Namespace: "mock", + Generation: 2, + }, + Spec: v2.HelmReleaseSpec{ + // Trigger a failure by setting an invalid storage namespace, + // preventing the release from actually being installed. + // This allows us to just test the failure count reset, without + // having to facilitate a full install. + StorageNamespace: "not-exist", + Install: &v2.Install{ + Strategy: &v2.InstallStrategy{ + Name: "RetryOnFailure", + RetryInterval: &metav1.Duration{Duration: time.Minute}, + }, + }, + }, + Status: v2.HelmReleaseStatus{ + HelmChart: "mock/chart", + InstallFailures: 2, + UpgradeFailures: 3, + Failures: 5, + LastAttemptedGeneration: 2, + }, + } + + c := fake.NewClientBuilder(). + WithScheme(NewTestScheme()). + WithStatusSubresource(&v2.HelmRelease{}). + WithObjects(chart, obj). 
+ Build() + + r := &HelmReleaseReconciler{ + Client: c, + APIReader: c, + GetClusterConfig: GetTestClusterConfig, + EventRecorder: record.NewFakeRecorder(32), + } + + res, err := r.reconcileRelease(context.TODO(), patch.NewSerialPatcher(obj, c), obj) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(res.RequeueAfter).To(BeNumerically("==", time.Minute)) + + g.Expect(obj.Status.Conditions).To(conditions.MatchConditions([]metav1.Condition{ + { + Type: "Ready", + Status: "False", + Reason: "RetryAfterInterval", + Message: "Will retry after 1m0s", + }, + { + Type: "Released", + Status: "False", + Reason: "InstallFailed", + Message: "Helm install failed for release mock/release with chart hello@0.1.0: create: failed to create: namespaces \"not-exist\" not found", + }, + })) + }) + t.Run("sets last attempted values", func(t *testing.T) { g := NewWithT(t) diff --git a/internal/reconcile/atomic_release.go b/internal/reconcile/atomic_release.go index 07eacf2c7..1399c4786 100644 --- a/internal/reconcile/atomic_release.go +++ b/internal/reconcile/atomic_release.go @@ -58,6 +58,11 @@ var ( // attempts for the provided release config. ErrExceededMaxRetries = errors.New("exceeded maximum retries") + // ErrRetryAfterInterval is returned when the action strategy is RetryOnFailure + // and the current AtomicRelease has already reconciled at least one action, + // in which case the action must be retried after the configured retry interval. + ErrRetryAfterInterval = errors.New("retry after interval") + // ErrMustRequeue is returned when the caller must requeue the object // to continue the reconciliation process. ErrMustRequeue = errors.New("must requeue") @@ -235,6 +240,13 @@ func (r *AtomicRelease) Reconcile(ctx context.Context, req *Request) error { fmt.Sprintf("instructed to stop before running %s action reconciler %s", next.Type(), next.Name()), ) + if retry := req.Object.GetActiveRetry(); retry != nil { + conditions.Delete(req.Object, meta.ReconcilingCondition) + conditions.MarkFalse(req.Object, meta.ReadyCondition, "RetryAfterInterval", + "Will retry after %s", retry.GetRetryInterval().String()) + return ErrRetryAfterInterval + } + if remediation := req.Object.GetActiveRemediation(); remediation == nil || !remediation.RetriesExhausted(req.Object) { conditions.MarkReconciling(req.Object, meta.ProgressingWithRetryReason, "%s", conditions.GetMessage(req.Object, meta.ReadyCondition)) return ErrMustRequeue @@ -266,6 +278,14 @@ func (r *AtomicRelease) Reconcile(ctx context.Context, req *Request) error { // Run the action sub-reconciler. 
log.Info(fmt.Sprintf("running '%s' action with timeout of %s", next.Name(), timeoutForAction(next, req.Object).String())) if err = next.Reconcile(ctx, req); err != nil { + if retry := req.Object.GetActiveRetry(); retry != nil { + log.Error(err, fmt.Sprintf("failed to run '%s' action", next.Name())) + conditions.Delete(req.Object, meta.ReconcilingCondition) + conditions.MarkFalse(req.Object, meta.ReadyCondition, "RetryAfterInterval", + "Will retry after %s", retry.GetRetryInterval().String()) + return ErrRetryAfterInterval + } + if conditions.IsReady(req.Object) { conditions.MarkFalse(req.Object, meta.ReadyCondition, "ReconcileError", "%s", err) } @@ -278,6 +298,13 @@ func (r *AtomicRelease) Reconcile(ctx context.Context, req *Request) error { "instructed to stop after running %s action reconciler %s", next.Type(), next.Name()), ) + if retry := req.Object.GetActiveRetry(); retry != nil { + conditions.Delete(req.Object, meta.ReconcilingCondition) + conditions.MarkFalse(req.Object, meta.ReadyCondition, "RetryAfterInterval", + "Will retry after %s", retry.GetRetryInterval().String()) + return ErrRetryAfterInterval + } + remediation := req.Object.GetActiveRemediation() if remediation == nil || !remediation.RetriesExhausted(req.Object) { conditions.MarkReconciling(req.Object, meta.ProgressingWithRetryReason, "%s", conditions.GetMessage(req.Object, meta.ReadyCondition)) @@ -429,6 +456,13 @@ func (r *AtomicRelease) actionForState(ctx context.Context, req *Request, state case ReleaseStatusFailed: log.Info(msgWithReason("release is in a failed state", state.Reason)) + // If the action strategy is to retry (and not remediate), we behave just like + // "flux reconcile hr --force" and .spec..remediation.retries set to 0. + if req.Object.GetActiveRetry() != nil { + log.V(logger.DebugLevel).Info("retrying upgrade for failed release") + return NewUpgrade(r.configFactory, r.eventRecorder), nil + } + remediation := req.Object.GetActiveRemediation() // If there is no active remediation strategy, we can only attempt to diff --git a/internal/reconcile/atomic_release_test.go b/internal/reconcile/atomic_release_test.go index fafee08ff..544cf8dac 100644 --- a/internal/reconcile/atomic_release_test.go +++ b/internal/reconcile/atomic_release_test.go @@ -633,6 +633,27 @@ func TestAtomicRelease_Reconcile_Scenarios(t *testing.T) { }, wantErr: ErrExceededMaxRetries, }, + { + name: "install failure with retry", + spec: func(spec *v2.HelmReleaseSpec) { + spec.Install = &v2.Install{ + Strategy: &v2.InstallStrategy{ + Name: "RetryOnFailure", + RetryInterval: &metav1.Duration{Duration: time.Minute}, + }, + } + spec.Uninstall = &v2.Uninstall{ + KeepHistory: true, + } + }, + chart: testutil.BuildChart(testutil.ChartWithFailingHook()), + expectHistory: func(releases []*helmrelease.Release) v2.Snapshots { + return v2.Snapshots{ + release.ObservedToSnapshot(release.ObserveRelease(releases[0])), + } + }, + wantErr: ErrRetryAfterInterval, + }, { name: "install test failure with remediation", spec: func(spec *v2.HelmReleaseSpec) { @@ -659,6 +680,33 @@ func TestAtomicRelease_Reconcile_Scenarios(t *testing.T) { }, wantErr: ErrExceededMaxRetries, }, + { + name: "install test failure with retry", + spec: func(spec *v2.HelmReleaseSpec) { + spec.Install = &v2.Install{ + Strategy: &v2.InstallStrategy{ + Name: "RetryOnFailure", + RetryInterval: &metav1.Duration{Duration: time.Minute}, + }, + } + spec.Test = &v2.Test{ + Enable: true, + } + spec.Uninstall = &v2.Uninstall{ + KeepHistory: true, + } + }, + chart: 
testutil.BuildChart(testutil.ChartWithFailingTestHook()), + expectHistory: func(releases []*helmrelease.Release) v2.Snapshots { + snap := release.ObservedToSnapshot(release.ObserveRelease(releases[0])) + snap.SetTestHooks(release.TestHooksFromRelease(releases[0])) + + return v2.Snapshots{ + snap, + } + }, + wantErr: ErrRetryAfterInterval, + }, { name: "install test failure with test ignore", spec: func(spec *v2.HelmReleaseSpec) { @@ -806,6 +854,43 @@ func TestAtomicRelease_Reconcile_Scenarios(t *testing.T) { }, wantErr: ErrExceededMaxRetries, }, + { + name: "upgrade failure with retry", + releases: func(namespace string) []*helmrelease.Release { + return []*helmrelease.Release{ + testutil.BuildRelease(&helmrelease.MockReleaseOptions{ + Name: mockReleaseName, + Namespace: namespace, + Version: 1, + Chart: testutil.BuildChart(), + Status: helmrelease.StatusDeployed, + }), + } + }, + spec: func(spec *v2.HelmReleaseSpec) { + spec.Upgrade = &v2.Upgrade{ + Strategy: &v2.UpgradeStrategy{ + Name: "RetryOnFailure", + RetryInterval: &metav1.Duration{Duration: time.Minute}, + }, + } + }, + status: func(namespace string, releases []*helmrelease.Release) v2.HelmReleaseStatus { + return v2.HelmReleaseStatus{ + History: v2.Snapshots{ + release.ObservedToSnapshot(release.ObserveRelease(releases[0])), + }, + } + }, + chart: testutil.BuildChart(testutil.ChartWithFailingHook()), + expectHistory: func(releases []*helmrelease.Release) v2.Snapshots { + return v2.Snapshots{ + release.ObservedToSnapshot(release.ObserveRelease(releases[1])), + release.ObservedToSnapshot(release.ObserveRelease(releases[0])), + } + }, + wantErr: ErrRetryAfterInterval, + }, { name: "upgrade test failure with remediation", releases: func(namespace string) []*helmrelease.Release { @@ -849,6 +934,49 @@ func TestAtomicRelease_Reconcile_Scenarios(t *testing.T) { }, wantErr: ErrExceededMaxRetries, }, + { + name: "upgrade test failure with retry", + releases: func(namespace string) []*helmrelease.Release { + return []*helmrelease.Release{ + testutil.BuildRelease(&helmrelease.MockReleaseOptions{ + Name: mockReleaseName, + Namespace: namespace, + Version: 1, + Chart: testutil.BuildChart(), + Status: helmrelease.StatusDeployed, + }), + } + }, + spec: func(spec *v2.HelmReleaseSpec) { + spec.Upgrade = &v2.Upgrade{ + Strategy: &v2.UpgradeStrategy{ + Name: "RetryOnFailure", + RetryInterval: &metav1.Duration{Duration: time.Minute}, + }, + } + spec.Test = &v2.Test{ + Enable: true, + } + }, + status: func(namespace string, releases []*helmrelease.Release) v2.HelmReleaseStatus { + return v2.HelmReleaseStatus{ + History: v2.Snapshots{ + release.ObservedToSnapshot(release.ObserveRelease(releases[0])), + }, + } + }, + chart: testutil.BuildChart(testutil.ChartWithFailingTestHook()), + expectHistory: func(releases []*helmrelease.Release) v2.Snapshots { + testedSnap := release.ObservedToSnapshot(release.ObserveRelease(releases[1])) + testedSnap.SetTestHooks(release.TestHooksFromRelease(releases[1])) + + return v2.Snapshots{ + testedSnap, + release.ObservedToSnapshot(release.ObserveRelease(releases[0])), + } + }, + wantErr: ErrRetryAfterInterval, + }, { name: "upgrade test failure with test ignore", releases: func(namespace string) []*helmrelease.Release { @@ -984,6 +1112,55 @@ func TestAtomicRelease_Reconcile_Scenarios(t *testing.T) { chart: testutil.BuildChart(), wantErr: ErrExceededMaxRetries, }, + { + name: "upgrade retry does not result in exhausted retries", + releases: func(namespace string) []*helmrelease.Release { + return []*helmrelease.Release{ + 
testutil.BuildRelease(&helmrelease.MockReleaseOptions{ + Name: mockReleaseName, + Namespace: namespace, + Version: 1, + Chart: testutil.BuildChart(), + Status: helmrelease.StatusSuperseded, + }), + testutil.BuildRelease(&helmrelease.MockReleaseOptions{ + Name: mockReleaseName, + Namespace: namespace, + Version: 2, + Chart: testutil.BuildChart(), + Status: helmrelease.StatusSuperseded, + }), + testutil.BuildRelease(&helmrelease.MockReleaseOptions{ + Name: mockReleaseName, + Namespace: namespace, + Version: 3, + Chart: testutil.BuildChart(), + Status: helmrelease.StatusFailed, + }), + } + }, + spec: func(spec *v2.HelmReleaseSpec) { + spec.Upgrade = &v2.Upgrade{ + Strategy: &v2.UpgradeStrategy{ + Name: "RetryOnFailure", + RetryInterval: &metav1.Duration{Duration: time.Minute}, + }, + } + }, + status: func(namespace string, releases []*helmrelease.Release) v2.HelmReleaseStatus { + return v2.HelmReleaseStatus{ + History: v2.Snapshots{ + release.ObservedToSnapshot(release.ObserveRelease(releases[2])), + release.ObservedToSnapshot(release.ObserveRelease(releases[1])), + release.ObservedToSnapshot(release.ObserveRelease(releases[0])), + }, + LastAttemptedReleaseAction: v2.ReleaseActionUpgrade, + Failures: 2, + UpgradeFailures: 2, + } + }, + chart: testutil.BuildChart(), + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { @@ -1632,6 +1809,24 @@ func TestAtomicRelease_actionForState(t *testing.T) { }, want: &UninstallRemediation{}, }, + { + name: "failed release with active install retry triggers upgrade", + state: ReleaseState{Status: ReleaseStatusFailed}, + spec: func(spec *v2.HelmReleaseSpec) { + spec.Install = &v2.Install{ + Strategy: &v2.InstallStrategy{ + Name: "RetryOnFailure", + }, + } + }, + status: func(releases []*helmrelease.Release) v2.HelmReleaseStatus { + return v2.HelmReleaseStatus{ + LastAttemptedReleaseAction: v2.ReleaseActionInstall, + InstallFailures: 2, + } + }, + want: &Upgrade{}, + }, { name: "failed release with active upgrade remediation triggers rollback", state: ReleaseState{Status: ReleaseStatusFailed}, @@ -1670,6 +1865,44 @@ func TestAtomicRelease_actionForState(t *testing.T) { }, want: &RollbackRemediation{}, }, + { + name: "failed release with active upgrade retry triggers upgrade", + state: ReleaseState{Status: ReleaseStatusFailed}, + releases: []*helmrelease.Release{ + testutil.BuildRelease(&helmrelease.MockReleaseOptions{ + Name: mockReleaseName, + Namespace: mockReleaseNamespace, + Version: 1, + Status: helmrelease.StatusSuperseded, + Chart: testutil.BuildChart(), + }), + testutil.BuildRelease(&helmrelease.MockReleaseOptions{ + Name: mockReleaseName, + Namespace: mockReleaseNamespace, + Version: 2, + Status: helmrelease.StatusFailed, + Chart: testutil.BuildChart(), + }), + }, + spec: func(spec *v2.HelmReleaseSpec) { + spec.Upgrade = &v2.Upgrade{ + Strategy: &v2.UpgradeStrategy{ + Name: "RetryOnFailure", + }, + } + }, + status: func(releases []*helmrelease.Release) v2.HelmReleaseStatus { + return v2.HelmReleaseStatus{ + History: v2.Snapshots{ + release.ObservedToSnapshot(release.ObserveRelease(releases[1])), + release.ObservedToSnapshot(release.ObserveRelease(releases[0])), + }, + LastAttemptedReleaseAction: v2.ReleaseActionUpgrade, + UpgradeFailures: 1, + } + }, + want: &Upgrade{}, + }, { name: "failed release with active upgrade remediation and no previous release triggers error", state: ReleaseState{Status: ReleaseStatusFailed}, diff --git a/internal/reconcile/install.go b/internal/reconcile/install.go index 1dbfc827a..cbd3845ed 100644 --- 
a/internal/reconcile/install.go +++ b/internal/reconcile/install.go @@ -185,6 +185,12 @@ func (r *Install) success(req *Request) { cur.FullReleaseName(), cur.VersionedChartName()) } + // Failures are only relevant while the release is failed + // when a retry strategy is configured. + if req.Object.GetInstall().GetRetry() != nil { + req.Object.Status.ClearFailures() + } + // Record event. r.eventRecorder.AnnotatedEventf( req.Object, diff --git a/internal/reconcile/install_test.go b/internal/reconcile/install_test.go index d10962473..5f349b1fb 100644 --- a/internal/reconcile/install_test.go +++ b/internal/reconcile/install_test.go @@ -564,6 +564,32 @@ func TestInstall_success(t *testing.T) { })) }) + t.Run("clears failures if retry strategy is configured", func(t *testing.T) { + g := NewWithT(t) + + recorder := testutil.NewFakeRecorder(10, false) + r := &Install{ + eventRecorder: recorder, + } + + req := &Request{ + Object: obj.DeepCopy(), + } + req.Object.Spec.Install = &v2.Install{ + Strategy: &v2.InstallStrategy{ + Name: "RetryOnFailure", + }, + } + req.Object.Status.Failures = 3 + req.Object.Status.InstallFailures = 3 + req.Object.Status.UpgradeFailures = 3 + r.success(req) + + g.Expect(req.Object.Status.Failures).To(BeZero()) + g.Expect(req.Object.Status.InstallFailures).To(BeZero()) + g.Expect(req.Object.Status.UpgradeFailures).To(BeZero()) + }) + t.Run("records success with TestSuccess=False", func(t *testing.T) { g := NewWithT(t) diff --git a/internal/reconcile/upgrade.go b/internal/reconcile/upgrade.go index 7b29da4a4..ea4f580ef 100644 --- a/internal/reconcile/upgrade.go +++ b/internal/reconcile/upgrade.go @@ -175,6 +175,12 @@ func (r *Upgrade) success(req *Request) { cur.FullReleaseName(), cur.VersionedChartName()) } + // Failures are only relevant while the release is failed + // when a retry strategy is configured. + if req.Object.GetUpgrade().GetRetry() != nil { + req.Object.Status.ClearFailures() + } + // Record event. r.eventRecorder.AnnotatedEventf( req.Object, diff --git a/internal/reconcile/upgrade_test.go b/internal/reconcile/upgrade_test.go index 0133c572f..47ab11f08 100644 --- a/internal/reconcile/upgrade_test.go +++ b/internal/reconcile/upgrade_test.go @@ -724,6 +724,32 @@ func TestUpgrade_success(t *testing.T) { })) }) + t.Run("clears failures if retry strategy is configured", func(t *testing.T) { + g := NewWithT(t) + + recorder := testutil.NewFakeRecorder(10, false) + r := &Upgrade{ + eventRecorder: recorder, + } + + req := &Request{ + Object: obj.DeepCopy(), + } + req.Object.Spec.Upgrade = &v2.Upgrade{ + Strategy: &v2.UpgradeStrategy{ + Name: "RetryOnFailure", + }, + } + req.Object.Status.Failures = 3 + req.Object.Status.InstallFailures = 3 + req.Object.Status.UpgradeFailures = 3 + r.success(req) + + g.Expect(req.Object.Status.Failures).To(BeZero()) + g.Expect(req.Object.Status.InstallFailures).To(BeZero()) + g.Expect(req.Object.Status.UpgradeFailures).To(BeZero()) + }) + t.Run("records success with TestSuccess=False", func(t *testing.T) { g := NewWithT(t)