diff --git a/.changes/unreleased/ENHANCEMENTS-594-20250424-164645.yaml b/.changes/unreleased/ENHANCEMENTS-594-20250424-164645.yaml
new file mode 100644
index 00000000..d217e400
--- /dev/null
+++ b/.changes/unreleased/ENHANCEMENTS-594-20250424-164645.yaml
@@ -0,0 +1,5 @@
+kind: ENHANCEMENTS
+body: Relink to more recent destroy runs when destroying a workspace
+time: 2025-04-24T16:46:45.710081284+02:00
+custom:
+ PR: "594"
diff --git a/.changes/unreleased/ENHANCEMENTS-602-20250429-154503.yaml b/.changes/unreleased/ENHANCEMENTS-602-20250429-154503.yaml
new file mode 100644
index 00000000..d5202d68
--- /dev/null
+++ b/.changes/unreleased/ENHANCEMENTS-602-20250429-154503.yaml
@@ -0,0 +1,5 @@
+kind: ENHANCEMENTS
+body: Workspace CRD option for the operator to retry failed runs
+time: 2025-04-29T15:45:03.031933484+02:00
+custom:
+ PR: "602"
diff --git a/api/v1alpha2/workspace_types.go b/api/v1alpha2/workspace_types.go
index d98d568c..a8ab10d6 100644
--- a/api/v1alpha2/workspace_types.go
+++ b/api/v1alpha2/workspace_types.go
@@ -65,6 +65,17 @@ type RemoteStateSharing struct {
Workspaces []*ConsumerWorkspace `json:"workspaces,omitempty"`
}
+// RetryPolicy allows you to configure retry behavior for failed runs on the workspace.
+// It applies to the latest run on the workspace that the operator tracks.
+type RetryPolicy struct {
+ // BackoffLimit is the maximum number of retries for failed runs. If set to a negative number, retries are unlimited.
+ // If set to `0`, retries are disabled.
+ // Default: `0`.
+ //
+ //+kubebuilder:default:=0
+ //+optional
+ BackoffLimit int64 `json:"backoffLimit,omitempty"`
+}
+
// Run tasks allow HCP Terraform to interact with external systems at specific points in the HCP Terraform run lifecycle.
// Only one of the fields `ID` or `Name` is allowed.
// At least one of the fields `ID` or `Name` is mandatory.
@@ -592,12 +603,16 @@ type WorkspaceSpec struct {
//
//+optional
RemoteStateSharing *RemoteStateSharing `json:"remoteStateSharing,omitempty"`
+ // Retry Policy allows you to specify how the operator should retry failed runs automatically.
+ //
+ //+optional
+ RetryPolicy *RetryPolicy `json:"retryPolicy,omitempty"`
// Run triggers allow you to connect this workspace to one or more source workspaces.
// These connections allow runs to queue automatically in this workspace on successful apply of runs in any of the source workspaces.
// More information:
// - https://developer.hashicorp.com/terraform/cloud-docs/workspaces/settings/run-triggers
//
//+kubebuilder:validation:MinItems:=1
//+optional
RunTriggers []RunTrigger `json:"runTriggers,omitempty"`
// Settings for the workspace's VCS repository, enabling the UI/VCS-driven run workflow.
@@ -742,6 +757,11 @@ type WorkspaceStatus struct {
//
//+optional
VariableSets []VariableSetStatus `json:"variableSet,omitempty"`
+
+ // Retry status of the latest run on the workspace.
+ //
+ //+optional
+ Retry *RetryStatus `json:"retry,omitempty"`
}
type VariableSetStatus struct {
@@ -749,6 +769,13 @@ type VariableSetStatus struct {
Name string `json:"name,omitempty"`
}
+// RetryStatus reports the retry state of the latest run on the workspace,
+// currently the number of failed attempts.
+type RetryStatus struct {
+ // Failed is the number of failed attempts, counting the initial one.
+ Failed int64 `json:"failed,omitempty"`
+}
+
//+kubebuilder:object:root=true
//+kubebuilder:subresource:status
//+kubebuilder:printcolumn:name="Workspace ID",type=string,JSONPath=`.status.workspaceID`
diff --git a/api/v1alpha2/zz_generated.deepcopy.go b/api/v1alpha2/zz_generated.deepcopy.go
index d8fccce7..706eec2f 100644
--- a/api/v1alpha2/zz_generated.deepcopy.go
+++ b/api/v1alpha2/zz_generated.deepcopy.go
@@ -749,6 +749,36 @@ func (in *RemoteStateSharing) DeepCopy() *RemoteStateSharing {
return out
}
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *RetryPolicy) DeepCopyInto(out *RetryPolicy) {
+ *out = *in
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RetryPolicy.
+func (in *RetryPolicy) DeepCopy() *RetryPolicy {
+ if in == nil {
+ return nil
+ }
+ out := new(RetryPolicy)
+ in.DeepCopyInto(out)
+ return out
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *RetryStatus) DeepCopyInto(out *RetryStatus) {
+ *out = *in
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RetryStatus.
+func (in *RetryStatus) DeepCopy() *RetryStatus {
+ if in == nil {
+ return nil
+ }
+ out := new(RetryStatus)
+ in.DeepCopyInto(out)
+ return out
+}
+
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *RunStatus) DeepCopyInto(out *RunStatus) {
*out = *in
@@ -1114,6 +1144,11 @@ func (in *WorkspaceSpec) DeepCopyInto(out *WorkspaceSpec) {
*out = new(RemoteStateSharing)
(*in).DeepCopyInto(*out)
}
+ if in.RetryPolicy != nil {
+ in, out := &in.RetryPolicy, &out.RetryPolicy
+ *out = new(RetryPolicy)
+ **out = **in
+ }
if in.RunTriggers != nil {
in, out := &in.RunTriggers, &out.RunTriggers
*out = make([]RunTrigger, len(*in))
@@ -1181,6 +1216,11 @@ func (in *WorkspaceStatus) DeepCopyInto(out *WorkspaceStatus) {
*out = make([]VariableSetStatus, len(*in))
copy(*out, *in)
}
+ if in.Retry != nil {
+ in, out := &in.Retry, &out.Retry
+ *out = new(RetryStatus)
+ **out = **in
+ }
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WorkspaceStatus.
diff --git a/charts/hcp-terraform-operator/crds/app.terraform.io_workspaces.yaml b/charts/hcp-terraform-operator/crds/app.terraform.io_workspaces.yaml
index a553d237..8738aeac 100644
--- a/charts/hcp-terraform-operator/crds/app.terraform.io_workspaces.yaml
+++ b/charts/hcp-terraform-operator/crds/app.terraform.io_workspaces.yaml
@@ -374,6 +374,18 @@ spec:
minItems: 1
type: array
type: object
+ retryPolicy:
+ description: Retry Policy allows you to specify how the operator should
+ retry failed runs automatically.
+ properties:
+ backoffLimit:
+ default: 0
+ description: |-
+ BackoffLimit is the maximum number of retries for failed runs. If set to a negative number, retries are unlimited.
+ If set to `0`, retries are disabled.
+ Default: `0`.
+ format: int64
+ type: integer
+ type: object
runTasks:
description: |-
Run tasks allow HCP Terraform to interact with external systems at specific points in the HCP Terraform run lifecycle.
@@ -442,7 +454,7 @@ spec:
minLength: 1
type: string
type: object
minItems: 1
type: array
sshKey:
description: |-
@@ -836,6 +848,15 @@ spec:
pattern: ^\d{1}\.\d{1,2}\.\d{1,2}$
type: string
type: object
+ retry:
+ description: Retry status of the latest run on the workspace.
+ properties:
+ failed:
+ description: Failed is the number of failed attempts, counting
+ the initial one.
+ format: int64
+ type: integer
+ type: object
runStatus:
description: Workspace Runs status.
properties:
diff --git a/config/crd/bases/app.terraform.io_workspaces.yaml b/config/crd/bases/app.terraform.io_workspaces.yaml
index 1a8a6774..c39d7cf5 100644
--- a/config/crd/bases/app.terraform.io_workspaces.yaml
+++ b/config/crd/bases/app.terraform.io_workspaces.yaml
@@ -371,6 +371,18 @@ spec:
minItems: 1
type: array
type: object
+ retryPolicy:
+ description: Retry Policy allows you to specify how the operator should
+ retry failed runs automatically.
+ properties:
+ backoffLimit:
+ default: 0
+ description: |-
+ BackoffLimit is the maximum number of retries for failed runs. If set to a negative number, retries are unlimited.
+ If set to `0`, retries are disabled.
+ Default: `0`.
+ format: int64
+ type: integer
+ type: object
runTasks:
description: |-
Run tasks allow HCP Terraform to interact with external systems at specific points in the HCP Terraform run lifecycle.
@@ -439,7 +451,7 @@ spec:
minLength: 1
type: string
type: object
minItems: 1
type: array
sshKey:
description: |-
@@ -833,6 +845,15 @@ spec:
pattern: ^\d{1}\.\d{1,2}\.\d{1,2}$
type: string
type: object
+ retry:
+ description: Retry status of the latest run on the workspace.
+ properties:
+ failed:
+ description: Failed is the number of failed attempts, counting
+ the initial one.
+ format: int64
+ type: integer
+ type: object
runStatus:
description: Workspace Runs status.
properties:
diff --git a/docs/api-reference.md b/docs/api-reference.md
index b9d6bec4..090a36e0 100644
--- a/docs/api-reference.md
+++ b/docs/api-reference.md
@@ -549,6 +549,36 @@ _Appears in:_
| `workspaces` _[ConsumerWorkspace](#consumerworkspace) array_ | Allow access to the state for specific workspaces within the same organization. |
+#### RetryPolicy
+
+
+
+RetryPolicy allows you to configure retry behavior for failed runs on the workspace.
+It applies to the latest run on the workspace that the operator tracks.
+
+_Appears in:_
+- [WorkspaceSpec](#workspacespec)
+
+| Field | Description |
+| --- | --- |
+| `backoffLimit` _integer_ | BackoffLimit is the maximum number of retries for failed runs. If set to a negative number, retries are unlimited.
If set to `0`, retries are disabled.
Default: `0`. |
+
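+Below is a minimal sketch of how `retryPolicy` could be set on a Workspace resource; the organization, secret, and workspace names are placeholders:
+
+```yaml
+apiVersion: app.terraform.io/v1alpha2
+kind: Workspace
+metadata:
+  name: example-workspace
+spec:
+  organization: my-org                # placeholder organization name
+  token:
+    secretKeyRef:
+      name: tfc-operator              # placeholder secret holding the HCP Terraform API token
+      key: token
+  name: example-workspace
+  retryPolicy:
+    backoffLimit: 3                   # retry a failed run up to 3 times; a negative value retries without limit
+```
+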
+
+#### RetryStatus
+
+
+
+RetryStatus reports the retry state of the latest run on the workspace, currently the number of failed attempts.
+
+_Appears in:_
+- [WorkspaceStatus](#workspacestatus)
+
+| Field | Description |
+| --- | --- |
+| `failed` _integer_ | Failed is the number of failed attempts, counting the initial one. |
+
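+Once a retry has been attempted, the counter is exposed on the status subresource, roughly as follows (the value shown is illustrative):
+
+```yaml
+status:
+  retry:
+    failed: 2   # number of failed attempts so far; reset after a successful run
+```
+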
+
#### RunStatus
@@ -894,6 +924,7 @@ _Appears in:_
| `environmentVariables` _[Variable](#variable) array_ | Terraform Environment variables for all plans and applies in this workspace.
Variables defined within a workspace always overwrite variables from variable sets that have the same type and the same key.
More information:
- https://developer.hashicorp.com/terraform/cloud-docs/workspaces/variables
- https://developer.hashicorp.com/terraform/cloud-docs/workspaces/variables#environment-variables |
| `terraformVariables` _[Variable](#variable) array_ | Terraform variables for all plans and applies in this workspace.
Variables defined within a workspace always overwrite variables from variable sets that have the same type and the same key.
More information:
- https://developer.hashicorp.com/terraform/cloud-docs/workspaces/variables
- https://developer.hashicorp.com/terraform/cloud-docs/workspaces/variables#terraform-variables |
| `remoteStateSharing` _[RemoteStateSharing](#remotestatesharing)_ | Remote state access between workspaces.
By default, new workspaces in HCP Terraform do not allow other workspaces to access their state.
More information:
- https://developer.hashicorp.com/terraform/cloud-docs/workspaces/state#accessing-state-from-other-workspaces |
+| `retryPolicy` _[RetryPolicy](#retrypolicy)_ | Retry Policy allows you to specify how the operator should retry failed runs automatically. |
| `runTriggers` _[RunTrigger](#runtrigger) array_ | Run triggers allow you to connect this workspace to one or more source workspaces.
These connections allow runs to queue automatically in this workspace on successful apply of runs in any of the source workspaces.
More information:
- https://developer.hashicorp.com/terraform/cloud-docs/workspaces/settings/run-triggers |
| `versionControl` _[VersionControl](#versioncontrol)_ | Settings for the workspace's VCS repository, enabling the UI/VCS-driven run workflow.
Omit this argument to utilize the CLI-driven and API-driven workflows, where runs are not driven by webhooks on your VCS provider.
More information:
- https://www.terraform.io/cloud-docs/run/ui
- https://www.terraform.io/cloud-docs/vcs |
| `sshKey` _[SSHKey](#sshkey)_ | SSH key used to clone Terraform modules.
More information:
- https://developer.hashicorp.com/terraform/cloud-docs/workspaces/settings/ssh-keys |
diff --git a/internal/controller/agentpool_controller_autoscaling_test.go b/internal/controller/agentpool_controller_autoscaling_test.go
index 87d8666d..314d8798 100644
--- a/internal/controller/agentpool_controller_autoscaling_test.go
+++ b/internal/controller/agentpool_controller_autoscaling_test.go
@@ -92,7 +92,7 @@ var _ = Describe("Agent Pool controller", Ordered, func() {
Expect(err).Should(Succeed())
Expect(ws).ShouldNot(BeNil())
// Create a new Run and execute it
- _ = createAndUploadConfigurationVersion(ws.ID, "hoi")
+ _ = createAndUploadConfigurationVersion(ws.ID, "hoi", true)
Eventually(func() bool {
ws, err = tfClient.Workspaces.ReadByID(ctx, ws.ID)
Expect(err).Should(Succeed())
@@ -147,7 +147,7 @@ var _ = Describe("Agent Pool controller", Ordered, func() {
Expect(err).Should(Succeed())
Expect(ws).ShouldNot(BeNil())
// New Run
- _ = createAndUploadConfigurationVersion(ws.ID, "hoi")
+ _ = createAndUploadConfigurationVersion(ws.ID, "hoi", true)
Eventually(func() bool {
ws, err = tfClient.Workspaces.ReadByID(ctx, ws.ID)
Expect(err).Should(Succeed())
diff --git a/internal/controller/workspace_controller_deletion_policy.go b/internal/controller/workspace_controller_deletion_policy.go
index 47e3764a..27ffc4ee 100644
--- a/internal/controller/workspace_controller_deletion_policy.go
+++ b/internal/controller/workspace_controller_deletion_policy.go
@@ -101,6 +101,39 @@ func (r *WorkspaceReconciler) deleteWorkspace(ctx context.Context, w *workspaceI
if _, ok := runStatusUnsuccessful[run.Status]; ok {
w.log.Info("Destroy Run", "msg", fmt.Sprintf("destroy run %s is unsuccessful: %s", run.ID, run.Status))
+
+ workspace, err := w.tfClient.Client.Workspaces.ReadByID(ctx, w.instance.Status.WorkspaceID)
+ if err != nil {
+ return r.handleWorkspaceErrorNotFound(ctx, w, err)
+ }
+
+ w.log.Info("Destroy Run", "msg", fmt.Sprintf("CurrentRun: %s %s %v", workspace.CurrentRun.ID, workspace.CurrentRun.Status, workspace.CurrentRun.IsDestroy))
+
+ if workspace.CurrentRun != nil && workspace.CurrentRun.ID != w.instance.Status.DestroyRunID {
+
+ // re-read the more recent run to confirm it is a destroy run before relinking to it
+ run, err := w.tfClient.Client.Runs.Read(ctx, workspace.CurrentRun.ID)
+ if err != nil {
+ // ignore this run id, and let the next reconcile loop handle the error
+ return nil
+ }
+ if run.IsDestroy {
+ w.log.Info("Destroy Run", "msg", fmt.Sprintf("found more recent destroy run %s, updating DestroyRunID", workspace.CurrentRun.ID))
+
+ w.instance.Status.DestroyRunID = workspace.CurrentRun.ID
+ w.updateWorkspaceStatusRun(run)
+ return r.Status().Update(ctx, &w.instance)
+ }
+ }
+ if isRetryEnabled(w) {
+ w.log.Info("Destroy Run", "msg", fmt.Sprintf("ongoing destroy run %s is unsuccessful, retrying it", run.ID))
+ err := r.retryFailedDestroyRun(ctx, w, workspace, run)
+ if err != nil {
+ w.log.Info("Destroy Run", "msg", fmt.Sprintf("ongoing destroy run %s is unsuccessful, retrying it", run.ID))
+ return err
+ }
+ return r.Status().Update(ctx, &w.instance)
+ }
+
return nil
}
w.log.Info("Destroy Run", "msg", fmt.Sprintf("destroy run %s is not finished", run.ID))
diff --git a/internal/controller/workspace_controller_deletion_policy_test.go b/internal/controller/workspace_controller_deletion_policy_test.go
index 68e54dfe..c7d5c8b2 100644
--- a/internal/controller/workspace_controller_deletion_policy_test.go
+++ b/internal/controller/workspace_controller_deletion_policy_test.go
@@ -97,7 +97,7 @@ var _ = Describe("Workspace controller", Ordered, func() {
createWorkspace(instance)
workspaceID := instance.Status.WorkspaceID
- cv := createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi")
+ cv := createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi", true)
Eventually(func() bool {
listOpts := tfc.ListOptions{
PageNumber: 1,
@@ -144,7 +144,7 @@ var _ = Describe("Workspace controller", Ordered, func() {
createWorkspace(instance)
workspaceID := instance.Status.WorkspaceID
- cv := createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi")
+ cv := createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi", true)
Eventually(func() bool {
listOpts := tfc.ListOptions{
PageNumber: 1,
@@ -195,6 +195,97 @@ var _ = Describe("Workspace controller", Ordered, func() {
return err == tfc.ErrResourceNotFound
}).Should(BeTrue())
})
+ It("can destroy delete a workspace when the destroy was retried manually after failing", func() {
+ if cloudEndpoint != tfcDefaultAddress {
+ Skip("Does not run against TFC, skip this test")
+ }
+ instance.Spec.AllowDestroyPlan = true
+ instance.Spec.DeletionPolicy = appv1alpha2.DeletionPolicyDestroy
+ createWorkspace(instance)
+ workspaceID := instance.Status.WorkspaceID
+
+ cv := createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi", true)
+ Eventually(func() bool {
+ listOpts := tfc.ListOptions{
+ PageNumber: 1,
+ PageSize: maxPageSize,
+ }
+ for listOpts.PageNumber != 0 {
+ runs, err := tfClient.Runs.List(ctx, workspaceID, &tfc.RunListOptions{
+ ListOptions: listOpts,
+ })
+ Expect(err).To(Succeed())
+ for _, r := range runs.Items {
+ if r.ConfigurationVersion.ID == cv.ID {
+ return r.Status == tfc.RunApplied
+ }
+ }
+ listOpts.PageNumber = runs.NextPage
+ }
+ return false
+ }).Should(BeTrue())
+
+ // create an errored ConfigurationVersion so the destroy run fails
+ cv = createAndUploadErroredConfigurationVersion(instance.Status.WorkspaceID, false)
+
+ Expect(k8sClient.Delete(ctx, instance)).To(Succeed())
+
+ var destroyRunID string
+ Eventually(func() bool {
+ ws, err := tfClient.Workspaces.ReadByID(ctx, workspaceID)
+ Expect(err).To(Succeed())
+ Expect(ws).ToNot(BeNil())
+ Expect(ws.CurrentRun).ToNot(BeNil())
+ run, err := tfClient.Runs.Read(ctx, ws.CurrentRun.ID)
+ Expect(err).To(Succeed())
+ Expect(run).ToNot(BeNil())
+ destroyRunID = run.ID
+
+ return run.IsDestroy
+ }).Should(BeTrue())
+
+ Eventually(func() bool {
+ run, _ := tfClient.Runs.Read(ctx, destroyRunID)
+ if run.Status == tfc.RunErrored {
+ return true
+ }
+
+ return false
+ }).Should(BeTrue())
+
+ // put back a working configuration
+ cv = createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi", true)
+
+ // start a new destroy run manually
+ run, err := tfClient.Runs.Create(ctx, tfc.RunCreateOptions{
+ IsDestroy: tfc.Bool(true),
+ Message: tfc.String(runMessage),
+ Workspace: &tfc.Workspace{
+ ID: workspaceID,
+ },
+ })
+ Expect(err).To(Succeed())
+ Expect(run).ToNot(BeNil())
+
+ var newDestroyRunID string
+ Eventually(func() bool {
+ ws, err := tfClient.Workspaces.ReadByID(ctx, workspaceID)
+ Expect(err).To(Succeed())
+ Expect(ws).ToNot(BeNil())
+ Expect(ws.CurrentRun).ToNot(BeNil())
+ run, err := tfClient.Runs.Read(ctx, ws.CurrentRun.ID)
+ Expect(err).To(Succeed())
+ Expect(run).ToNot(BeNil())
+ newDestroyRunID = run.ID
+
+ return run.IsDestroy && newDestroyRunID != destroyRunID
+ }).Should(BeTrue())
+
+ Eventually(func() bool {
+ _, err := tfClient.Workspaces.ReadByID(ctx, workspaceID)
+ return err == tfc.ErrResourceNotFound
+ }).Should(BeTrue())
+ })
It("can force delete a workspace", func() {
instance.Spec.DeletionPolicy = appv1alpha2.DeletionPolicyForce
createWorkspace(instance)
diff --git a/internal/controller/workspace_controller_outputs_test.go b/internal/controller/workspace_controller_outputs_test.go
index e9a3be34..e2b1a897 100644
--- a/internal/controller/workspace_controller_outputs_test.go
+++ b/internal/controller/workspace_controller_outputs_test.go
@@ -76,7 +76,7 @@ var _ = Describe("Workspace controller", Ordered, func() {
createWorkspace(instance)
outputValue := "hoi"
- cv := createAndUploadConfigurationVersion(instance.Status.WorkspaceID, outputValue)
+ cv := createAndUploadConfigurationVersion(instance.Status.WorkspaceID, outputValue, true)
By("Validating configuration version and workspace run")
Eventually(func() bool {
@@ -132,7 +132,7 @@ var _ = Describe("Workspace controller", Ordered, func() {
})
})
-func createAndUploadConfigurationVersion(workspaceID string, outputValue string) *tfc.ConfigurationVersion {
+func createAndUploadConfigurationVersion(workspaceID string, outputValue string, autoQueueRuns bool) *tfc.ConfigurationVersion {
GinkgoHelper()
// Create a temporary dir in the current one
cd, err := os.Getwd()
@@ -140,7 +140,7 @@ func createAndUploadConfigurationVersion(workspaceID string, outputValue string)
td, err := os.MkdirTemp(cd, "tf-*")
Expect(err).Should(Succeed())
defer os.RemoveAll(td)
// Create a temporary file in the temporary dir
f, err := os.CreateTemp(td, "*.tf")
Expect(err).Should(Succeed())
defer os.Remove(f.Name())
@@ -159,7 +159,50 @@ func createAndUploadConfigurationVersion(workspaceID string, outputValue string)
Expect(err).Should(Succeed())
cv, err := tfClient.ConfigurationVersions.Create(ctx, workspaceID, tfc.ConfigurationVersionCreateOptions{
- AutoQueueRuns: tfc.Bool(true),
+ AutoQueueRuns: tfc.Bool(autoQueueRuns),
+ Speculative: tfc.Bool(false),
+ })
+ Expect(err).Should(Succeed())
+ Expect(cv).ShouldNot(BeNil())
+
+ Expect(tfClient.ConfigurationVersions.Upload(ctx, cv.UploadURL, td)).Should(Succeed())
+
+ Eventually(func() bool {
+ c, err := tfClient.ConfigurationVersions.Read(ctx, cv.ID)
+ if err != nil {
+ return false
+ }
+ if c.Status == tfc.ConfigurationUploaded {
+ return true
+ }
+ return false
+ }).Should(BeTrue())
+
+ return cv
+}
+
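+// createAndUploadErroredConfigurationVersion uploads a Terraform configuration that references a non-existent
+// resource so the resulting run fails, letting tests exercise the retry logic.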
+func createAndUploadErroredConfigurationVersion(workspaceID string, autoQueueRuns bool) *tfc.ConfigurationVersion {
+ GinkgoHelper()
+ // Create a temporary dir in the current one
+ cd, err := os.Getwd()
+ Expect(err).Should(Succeed())
+ td, err := os.MkdirTemp(cd, "tf-*")
+ Expect(err).Should(Succeed())
+ defer os.RemoveAll(td)
+ // Create a temporary file in the temporary dir
+ f, err := os.CreateTemp(td, "*.tf")
+ Expect(err).Should(Succeed())
+ defer os.Remove(f.Name())
+ // Terraform code to upload
+ tf := `
+ resource "test_non_existent_resource" "this" {}
+ `
+ // Save the Terraform code to the temporary file
+ _, err = f.WriteString(tf)
+ Expect(err).Should(Succeed())
+
+ cv, err := tfClient.ConfigurationVersions.Create(ctx, workspaceID, tfc.ConfigurationVersionCreateOptions{
+ AutoQueueRuns: tfc.Bool(autoQueueRuns),
Speculative: tfc.Bool(false),
})
Expect(err).Should(Succeed())
diff --git a/internal/controller/workspace_controller_retry.go b/internal/controller/workspace_controller_retry.go
new file mode 100644
index 00000000..cfc924f3
--- /dev/null
+++ b/internal/controller/workspace_controller_retry.go
@@ -0,0 +1,93 @@
+// Copyright (c) HashiCorp, Inc.
+// SPDX-License-Identifier: MPL-2.0
+
+package controller
+
+import (
+ "context"
+ "fmt"
+
+ tfc "github.com/hashicorp/go-tfe"
+ appv1alpha2 "github.com/hashicorp/hcp-terraform-operator/api/v1alpha2"
+)
+
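+// resetRetryStatus resets the failed-attempts counter to zero when a retry policy is enabled.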
+func (r *WorkspaceReconciler) resetRetryStatus(ctx context.Context, w *workspaceInstance) error {
+ if !isRetryEnabled(w) {
+ return nil
+ }
+ w.instance.Status.Retry = &appv1alpha2.RetryStatus{
+ Failed: 0,
+ }
+ return nil
+}
+
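+// isRetryEnabled reports whether the workspace has a retry policy with a non-zero backoff limit.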
+func isRetryEnabled(w *workspaceInstance) bool {
+ return w.instance.Spec.RetryPolicy != nil && w.instance.Spec.RetryPolicy.BackoffLimit != 0
+}
+
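+// retryFailedApplyRun retries a failed non-destroy run and records the new run in the workspace status.
+// It is a no-op once the backoff limit has been reached.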
+func (r *WorkspaceReconciler) retryFailedApplyRun(ctx context.Context, w *workspaceInstance, workspace *tfc.Workspace, failedRun *tfc.Run) error {
+ retriedRun, err := r.retryFailedRun(ctx, w, workspace, failedRun)
+ if err != nil {
+ return err
+ }
+
+ // when no run is returned, it means the backoff limit was reached
+ if retriedRun == nil {
+ return nil
+ }
+
+ w.updateWorkspaceStatusRun(retriedRun)
+ // WARNING: there is a race condition here if the run fails very quickly and the initial status returned
+ // by the Runs.Create function is already Errored. In that case the run is never retried.
+ // TODO: consider handling this in the reconcile runs function to make sure a retry is not missed.
+
+ return nil
+}
+
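+// retryFailedDestroyRun retries a failed destroy run and relinks the workspace status, including DestroyRunID, to the new run.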
+func (r *WorkspaceReconciler) retryFailedDestroyRun(ctx context.Context, w *workspaceInstance, workspace *tfc.Workspace, failedRun *tfc.Run) error {
+ retriedRun, err := r.retryFailedRun(ctx, w, workspace, failedRun)
+ if err != nil {
+ return err
+ }
+
+ // when no run is returned, it means the backoff limit was reached
+ if retriedRun == nil {
+ return nil
+ }
+
+ w.instance.Status.DestroyRunID = retriedRun.ID
+ w.updateWorkspaceStatusRun(retriedRun)
+
+ return nil
+}
+
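+// retryFailedRun creates a new run to retry failedRun, mirroring its IsDestroy and RefreshOnly flags.
+// It returns a nil run when the backoff limit has been reached.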
+func (r *WorkspaceReconciler) retryFailedRun(ctx context.Context, w *workspaceInstance, workspace *tfc.Workspace, failedRun *tfc.Run) (*tfc.Run, error) {
+ if w.instance.Status.Retry == nil {
+ w.instance.Status.Retry = &appv1alpha2.RetryStatus{
+ Failed: 0,
+ }
+ }
+ w.instance.Status.Retry.Failed++
+
+ if w.instance.Spec.RetryPolicy.BackoffLimit < 0 || w.instance.Status.Retry.Failed <= w.instance.Spec.RetryPolicy.BackoffLimit {
+
+ options := tfc.RunCreateOptions{
+ Message: tfc.String(runMessage),
+ Workspace: workspace,
+ IsDestroy: tfc.Bool(failedRun.IsDestroy),
+ RefreshOnly: tfc.Bool(failedRun.RefreshOnly),
+ }
+ retriedRun, err := w.tfClient.Client.Runs.Create(ctx, options)
+ if err != nil {
+ w.log.Error(err, "Retry Runs", "msg", "failed to create a new apply run for retry")
+ return nil, err
+ }
+ w.log.Info("Retry Runs", "msg", fmt.Sprintf("successfully created a new apply run %s for to retry failed %s", retriedRun.ID, failedRun.ID))
+
+ return retriedRun, nil
+ } else {
+ w.log.Info("Retry Runs", "msg", "backoff limit was reached, skip retry")
+ return nil, nil
+ }
+}
diff --git a/internal/controller/workspace_controller_retry_test.go b/internal/controller/workspace_controller_retry_test.go
new file mode 100644
index 00000000..902ae6a1
--- /dev/null
+++ b/internal/controller/workspace_controller_retry_test.go
@@ -0,0 +1,245 @@
+// Copyright (c) HashiCorp, Inc.
+// SPDX-License-Identifier: MPL-2.0
+
+package controller
+
+import (
+ "fmt"
+ "time"
+
+ tfc "github.com/hashicorp/go-tfe"
+ appv1alpha2 "github.com/hashicorp/hcp-terraform-operator/api/v1alpha2"
+ . "github.com/onsi/ginkgo/v2"
+ . "github.com/onsi/gomega"
+ corev1 "k8s.io/api/core/v1"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "k8s.io/apimachinery/pkg/types"
+)
+
+var _ = Describe("Workspace controller", Ordered, func() {
+ var (
+ instance *appv1alpha2.Workspace
+ namespacedName types.NamespacedName
+ workspace string
+ )
+
+ BeforeAll(func() {
+ // Set default Eventually timers
+ SetDefaultEventuallyTimeout(syncPeriod * 4)
+ SetDefaultEventuallyPollingInterval(2 * time.Second)
+ })
+
+ BeforeEach(func() {
+ if cloudEndpoint != tfcDefaultAddress {
+ Skip("Does not run against TFC, skip this test")
+ }
+ namespacedName = newNamespacedName()
+ workspace = fmt.Sprintf("kubernetes-operator-%v", randomNumber())
+ // Create a new workspace object for each test
+ instance = &appv1alpha2.Workspace{
+ TypeMeta: metav1.TypeMeta{
+ APIVersion: "app.terraform.io/v1alpha2",
+ Kind: "Workspace",
+ },
+ ObjectMeta: metav1.ObjectMeta{
+ Name: namespacedName.Name,
+ Namespace: namespacedName.Namespace,
+ DeletionTimestamp: nil,
+ Finalizers: []string{},
+ },
+ Spec: appv1alpha2.WorkspaceSpec{
+ Organization: organization,
+ Token: appv1alpha2.Token{
+ SecretKeyRef: &corev1.SecretKeySelector{
+ LocalObjectReference: corev1.LocalObjectReference{
+ Name: secretNamespacedName.Name,
+ },
+ Key: secretKey,
+ },
+ },
+ Name: workspace,
+ ApplyMethod: "auto",
+ ApplyRunTrigger: "auto",
+ },
+ Status: appv1alpha2.WorkspaceStatus{},
+ }
+ })
+
+ AfterEach(func() {
+ deleteWorkspace(instance)
+ })
+
+ Context("Retry", func() {
+ It("can retry failed runs", func() {
+ namespacedName := getNamespacedName(instance)
+ instance.Spec.RetryPolicy = &appv1alpha2.RetryPolicy{
+ BackoffLimit: -1,
+ }
+ // Create a new Kubernetes workspace object and wait until the controller finishes the reconciliation
+ createWorkspace(instance)
+ workspaceID := instance.Status.WorkspaceID
+
+ // start a run that will fail
+ cv := createAndUploadErroredConfigurationVersion(instance.Status.WorkspaceID, true)
+
+ var runID string
+
+ Eventually(func() bool {
+ listOpts := tfc.ListOptions{
+ PageNumber: 1,
+ PageSize: maxPageSize,
+ }
+ for listOpts.PageNumber != 0 {
+ runs, err := tfClient.Runs.List(ctx, workspaceID, &tfc.RunListOptions{
+ ListOptions: listOpts,
+ })
+ Expect(err).To(Succeed())
+ for _, r := range runs.Items {
+ if r.ConfigurationVersion.ID == cv.ID {
+ runID = r.ID
+ return r.Status == tfc.RunErrored
+ }
+ }
+ listOpts.PageNumber = runs.NextPage
+ }
+ return false
+ }).Should(BeTrue())
+
+ // Fix the code but do not start a run manually
+ createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi", false)
+
+ // a new run should be started automatically
+ Eventually(func() bool {
+ Expect(k8sClient.Get(ctx, namespacedName, instance)).Should(Succeed())
+ if instance.Status.Run == nil {
+ return false
+ }
+ return runID != instance.Status.Run.ID
+ }).Should(BeTrue())
+
+ // the number of failed attempts should be reset to 0
+ Eventually(func() bool {
+ Expect(k8sClient.Get(ctx, namespacedName, instance)).Should(Succeed())
+ if instance.Status.Retry == nil {
+ return false
+ }
+ return instance.Status.Retry.Failed == 0
+ }).Should(BeTrue())
+
+ // Since the code is fixed, a run should eventually succeed
+ Eventually(func() bool {
+ Expect(k8sClient.Get(ctx, namespacedName, instance)).Should(Succeed())
+ if instance.Status.Run == nil {
+ return false
+ }
+
+ return runID != instance.Status.Run.ID && instance.Status.Run.RunCompleted()
+ }).Should(BeTrue())
+ })
+ It("can retry until the limit of retries is reached", func() {
+ namespacedName := getNamespacedName(instance)
+
+ instance.Spec.RetryPolicy = &appv1alpha2.RetryPolicy{
+ BackoffLimit: 2,
+ }
+ // Create a new Kubernetes workspace object and wait until the controller finishes the reconciliation
+
+ createWorkspace(instance)
+ workspaceID := instance.Status.WorkspaceID
+
+ // start a run that will fail
+ createAndUploadErroredConfigurationVersion(instance.Status.WorkspaceID, true)
+
+ Eventually(func() bool {
+ Expect(k8sClient.Get(ctx, namespacedName, instance)).Should(Succeed())
+ if instance.Status.Retry == nil {
+ return false
+ }
+
+ return instance.Status.Retry.Failed == 3
+ }).Should(BeTrue())
+
+ Eventually(func() bool {
+ listOpts := tfc.ListOptions{
+ PageNumber: 1,
+ PageSize: maxPageSize,
+ }
+ runCount := 0
+ for listOpts.PageNumber != 0 {
+ runs, err := tfClient.Runs.List(ctx, workspaceID, &tfc.RunListOptions{
+ ListOptions: listOpts,
+ })
+ Expect(err).To(Succeed())
+ runCount += len(runs.Items)
+ listOpts.PageNumber = runs.NextPage
+ }
+ return runCount == 3
+ }).Should(BeTrue())
+ })
+ It("can retry failed destroy runs when deleting the workspace", func() {
+ instance.Spec.RetryPolicy = &appv1alpha2.RetryPolicy{
+ BackoffLimit: -1,
+ }
+ instance.Spec.AllowDestroyPlan = true
+ instance.Spec.DeletionPolicy = appv1alpha2.DeletionPolicyDestroy
+ // Create a new Kubernetes workspace object and wait until the controller finishes the reconciliation
+ createWorkspace(instance)
+ workspaceID := instance.Status.WorkspaceID
+
+ cv := createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi", true)
+ Eventually(func() bool {
+ listOpts := tfc.ListOptions{
+ PageNumber: 1,
+ PageSize: maxPageSize,
+ }
+ for listOpts.PageNumber != 0 {
+ runs, err := tfClient.Runs.List(ctx, workspaceID, &tfc.RunListOptions{
+ ListOptions: listOpts,
+ })
+ Expect(err).To(Succeed())
+ for _, r := range runs.Items {
+ if r.ConfigurationVersion.ID == cv.ID {
+ return r.Status == tfc.RunApplied
+ }
+ }
+ listOpts.PageNumber = runs.NextPage
+ }
+ return false
+ }).Should(BeTrue())
+
+ // create an errored ConfigurationVersion so the destroy run fails
+ cv = createAndUploadErroredConfigurationVersion(instance.Status.WorkspaceID, false)
+
+ Expect(k8sClient.Delete(ctx, instance)).To(Succeed())
+
+ Eventually(func() bool {
+ listOpts := tfc.ListOptions{
+ PageNumber: 1,
+ PageSize: maxPageSize,
+ }
+ for listOpts.PageNumber != 0 {
+ runs, err := tfClient.Runs.List(ctx, workspaceID, &tfc.RunListOptions{
+ ListOptions: listOpts,
+ })
+ Expect(err).To(Succeed())
+ for _, r := range runs.Items {
+ if r.ConfigurationVersion.ID == cv.ID {
+ return r.Status == tfc.RunErrored
+ }
+ }
+ listOpts.PageNumber = runs.NextPage
+ }
+ return false
+ }).Should(BeTrue())
+
+ // Fix the code but do not start a run manually
+ createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi", false)
+
+ // The retry should eventually delete the workspace
+ Eventually(func() bool {
+ _, err := tfClient.Workspaces.ReadByID(ctx, workspaceID)
+ return err == tfc.ErrResourceNotFound
+ }).Should(BeTrue())
+ })
+ })
+})
diff --git a/internal/controller/workspace_controller_runs.go b/internal/controller/workspace_controller_runs.go
index 30cdc728..9c498241 100644
--- a/internal/controller/workspace_controller_runs.go
+++ b/internal/controller/workspace_controller_runs.go
@@ -84,6 +84,21 @@ func (r *WorkspaceReconciler) reconcileCurrentRun(ctx context.Context, w *worksp
w.instance.Status.Run.Status = string(run.Status)
w.instance.Status.Run.ConfigurationVersion = run.ConfigurationVersion.ID
+ if isRetryEnabled(w) {
+ if _, ok := runStatusUnsuccessful[run.Status]; ok {
+ w.log.Info("Reconcile Runs", "msg", "ongoing non-speculative run is unsuccessful, retrying it")
+
+ if err = r.retryFailedApplyRun(ctx, w, workspace, run); err != nil {
+ return err
+ }
+ }
+ }
+
+ // when the current run succeeds, reset the failed counter so the next failure starts a fresh retry cycle
+ if _, ok := runStatusComplete[run.Status]; ok {
+ r.resetRetryStatus(ctx, w)
+ }
+
return nil
}
@@ -134,6 +149,11 @@ func (r *WorkspaceReconciler) triggerApplyRun(ctx context.Context, w *workspaceI
w.instance.Status.Run.Status = string(run.Status)
w.instance.Status.Run.ConfigurationVersion = run.ConfigurationVersion.ID
+ // WARNING: there is a race condition here if the run fails very quickly and the initial status returned
+ // by the Runs.Create function is already Errored. In that case the run is never retried.
+ // TODO: consider handling this in the reconcile runs function to make sure a retry is not missed.
+
return nil
}
diff --git a/internal/controller/workspace_controller_runs_test.go b/internal/controller/workspace_controller_runs_test.go
index f2bb5421..6d9b0f41 100644
--- a/internal/controller/workspace_controller_runs_test.go
+++ b/internal/controller/workspace_controller_runs_test.go
@@ -73,7 +73,7 @@ var _ = Describe("Workspace controller", Ordered, func() {
namespacedName := getNamespacedName(instance)
// Create a new Kubernetes workspace object and wait until the controller finishes the reconciliation
createWorkspace(instance)
- createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi")
+ createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi", true)
Eventually(func() bool {
Expect(k8sClient.Get(ctx, namespacedName, instance)).Should(Succeed())
if instance.Status.Run == nil {
@@ -87,7 +87,7 @@ var _ = Describe("Workspace controller", Ordered, func() {
// Create a new Kubernetes workspace object and wait until the controller finishes the reconciliation
createWorkspace(instance)
- createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi")
+ createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi", true)
Eventually(func() bool {
Expect(k8sClient.Get(ctx, namespacedName, instance)).Should(Succeed())
if instance.Status.Run == nil {