diff --git a/pkg/controller/pxcbackup/controller.go b/pkg/controller/pxcbackup/controller.go
index 8b6f6785f..67d3604d1 100644
--- a/pkg/controller/pxcbackup/controller.go
+++ b/pkg/controller/pxcbackup/controller.go
@@ -197,17 +197,28 @@ func (r *ReconcilePerconaXtraDBClusterBackup) Reconcile(ctx context.Context, req
 	}()
 
 	if err := r.checkDeadlines(ctx, cluster, cr); err != nil {
+		// There was an error checking the deadline itself; we must retry.
+		if !errors.Is(err, errDeadlineExceeded) {
+			return reconcile.Result{}, errors.Wrap(err, "check deadlines")
+		}
+
+		// If a deadline was exceeded, we must set the status to failed so the backup can stop.
 		if err := r.setFailedStatus(ctx, cr, err); err != nil {
-			return rr, errors.Wrap(err, "update status")
+			return reconcile.Result{}, errors.Wrap(err, "update status")
 		}
 
 		if errors.Is(err, errSuspendedDeadlineExceeded) {
 			log.Info("cleaning up suspended backup job")
 
-			if err := r.cleanUpSuspendedJob(ctx, cluster, cr); err != nil {
+			if err := r.cleanUpJob(ctx, cluster, cr); err != nil {
 				return reconcile.Result{}, errors.Wrap(err, "clean up suspended job")
 			}
 		}
 
+		if errors.Is(err, errRunningDeadlineExceeded) {
+			log.Info("running deadline exceeded, deleting the job and its pods")
+			if err := r.cleanUpJob(ctx, cluster, cr); err != nil {
+				return reconcile.Result{}, errors.Wrap(err, "clean up running deadline exceeded job")
+			}
+		}
+
 		return reconcile.Result{}, nil
 	}
 
@@ -863,7 +874,7 @@ func (r *ReconcilePerconaXtraDBClusterBackup) getBackupJob(
 	return job, nil
 }
 
-func (r *ReconcilePerconaXtraDBClusterBackup) cleanUpSuspendedJob(
+func (r *ReconcilePerconaXtraDBClusterBackup) cleanUpJob(
 	ctx context.Context,
 	cluster *api.PerconaXtraDBCluster,
 	cr *api.PerconaXtraDBClusterBackup,
@@ -873,7 +884,8 @@ func (r *ReconcilePerconaXtraDBClusterBackup) cleanUpSuspendedJob(
 		return errors.Wrap(err, "get job")
 	}
 
-	if err := r.client.Delete(ctx, job); err != nil {
+	// Set propagationPolicy=Background because the default deletion preserves the job's pods.
+	if err := r.client.Delete(ctx, job, client.PropagationPolicy(metav1.DeletePropagationBackground)); err != nil {
 		return errors.Wrap(err, "delete job")
 	}
 
diff --git a/pkg/controller/pxcbackup/controller_test.go b/pkg/controller/pxcbackup/controller_test.go
new file mode 100644
index 000000000..e5d531762
--- /dev/null
+++ b/pkg/controller/pxcbackup/controller_test.go
@@ -0,0 +1,276 @@
+package pxcbackup
+
+import (
+	"context"
+	"sync"
+	"time"
+
+	. "github.com/onsi/ginkgo/v2"
"github.com/onsi/gomega" + "github.com/percona/percona-xtradb-cluster-operator/clientcmd" + pxcv1 "github.com/percona/percona-xtradb-cluster-operator/pkg/apis/pxc/v1" + "github.com/percona/percona-xtradb-cluster-operator/pkg/pxc/backup" + "github.com/percona/percona-xtradb-cluster-operator/pkg/version" + corev1 "k8s.io/api/core/v1" + k8serrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/utils/ptr" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/reconcile" +) + +var _ = Describe("PerconaXtraDBClusterBackup", Ordered, func() { + ctx := context.Background() + const ns = "pxc" + namespace := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: ns, + Namespace: ns, + }, + } + clusterName := "cluster1" + cluster := &pxcv1.PerconaXtraDBCluster{} + + reconciler := &ReconcilePerconaXtraDBClusterBackup{} + BeforeAll(func() { + By("Creating the Namespace to perform the tests") + err := k8sClient.Create(ctx, namespace) + Expect(err).To(Not(HaveOccurred())) + + By("Creating a PXC Cluster to perform the tests") + cluster, err = readDefaultCR(clusterName, ns) + Expect(err).To(Not(HaveOccurred())) + + err = k8sClient.Create(ctx, cluster) + Expect(err).To(Not(HaveOccurred())) + + mockPXCReadyStatus(ctx, cluster) + reconciler = newTestReconciler() + }) + + AfterAll(func() { + // TODO(user): Attention if you improve this code by adding other context test you MUST + // be aware of the current delete namespace limitations. More info: https://book.kubebuilder.io/reference/envtest.html#testing-considerations + By("Deleting the Namespace to perform the tests") + _ = k8sClient.Delete(ctx, namespace) + + By("Deleting the PXC Cluster to perform the tests") + cluster, err := readDefaultCR(clusterName, ns) + Expect(err).To(Not(HaveOccurred())) + err = k8sClient.Delete(ctx, cluster) + Expect(err).To(Not(HaveOccurred())) + }) + + It("Should reconcile backup", func() { + pxcBackup, err := newBackup("backup1", ns) + Expect(err).To(Not(HaveOccurred())) + + err = k8sClient.Create(ctx, pxcBackup) + Expect(err).To(Not(HaveOccurred())) + + _, err = reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: types.NamespacedName{ + Name: pxcBackup.Name, + Namespace: pxcBackup.Namespace, + }}) + Expect(err).To(Succeed()) + + // Check that a job was created + bcp := backup.New(cluster) + job := bcp.Job(pxcBackup, cluster) + err = k8sClient.Get(ctx, types.NamespacedName{ + Name: job.Name, + Namespace: job.Namespace, + }, job) + Expect(err).To(Not(HaveOccurred())) + }) +}) + +var _ = Describe("Error checking deadlines", Ordered, func() { + ctx := context.Background() + const ns = "pxc-deadline-error" + namespace := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: ns, + Namespace: ns, + }, + } + clusterName := "cluster1" + cluster := &pxcv1.PerconaXtraDBCluster{} + reconciler := &ReconcilePerconaXtraDBClusterBackup{} + BeforeAll(func() { + By("Creating the Namespace to perform the tests") + err := k8sClient.Create(ctx, namespace) + Expect(err).To(Not(HaveOccurred())) + + By("Creating a PXC Cluster to perform the tests") + cluster, err = readDefaultCR(clusterName, ns) + Expect(err).To(Not(HaveOccurred())) + + err = k8sClient.Create(ctx, cluster) + Expect(err).To(Not(HaveOccurred())) + + mockPXCReadyStatus(ctx, cluster) + reconciler = newTestReconciler() + }) + + AfterAll(func() { + // TODO(user): Attention if you improve this code by adding other context test you MUST + // be aware of the current 
+		// the current delete namespace limitations. More info: https://book.kubebuilder.io/reference/envtest.html#testing-considerations
+		By("Deleting the Namespace to perform the tests")
+		_ = k8sClient.Delete(ctx, namespace)
+
+		By("Deleting the PXC Cluster to perform the tests")
+		cluster, err := readDefaultCR(clusterName, ns)
+		Expect(err).To(Not(HaveOccurred()))
+		err = k8sClient.Delete(ctx, cluster)
+		Expect(err).To(Not(HaveOccurred()))
+	})
+
+	It("Should not mark the backup as failed when the deadline check errors", func() {
+		pxcBackup, err := newBackup("backup1", ns)
+		Expect(err).To(Not(HaveOccurred()))
+		pxcBackup.Spec.RunningDeadlineSeconds = ptr.To(int64(300))
+
+		pxcBackupReq := reconcile.Request{NamespacedName: types.NamespacedName{
+			Name:      pxcBackup.Name,
+			Namespace: pxcBackup.Namespace,
+		}}
+
+		err = k8sClient.Create(ctx, pxcBackup)
+		Expect(err).To(Not(HaveOccurred()))
+
+		_, err = reconciler.Reconcile(ctx, pxcBackupReq)
+		Expect(err).To(Succeed())
+
+		err = k8sClient.Get(ctx, pxcBackupReq.NamespacedName, pxcBackup)
+		Expect(err).To(Not(HaveOccurred()))
+		Expect(pxcBackup.Status.State).To(Equal(pxcv1.BackupStarting))
+
+		// Delete the job; this will cause the deadline check to fail.
+		job := backup.New(cluster).Job(pxcBackup, cluster)
+		err = k8sClient.Delete(ctx, job, client.PropagationPolicy(metav1.DeletePropagationBackground))
+		Expect(err).To(Not(HaveOccurred()))
+
+		_, err = reconciler.Reconcile(ctx, pxcBackupReq)
+		Expect(err).To(HaveOccurred())
+		Expect(err.Error()).To(ContainSubstring("check deadlines"))
+
+		// Make sure that the backup is not marked as failed
+		err = k8sClient.Get(ctx, pxcBackupReq.NamespacedName, pxcBackup)
+		Expect(pxcBackup.Status.State).To(Equal(pxcv1.BackupStarting))
+	})
+})
+
+var _ = Describe("Backup Job deleted when running deadline is exceeded", Ordered, func() {
+	ctx := context.Background()
+	const ns = "pxc-job-running-deadline-exceeded"
+	namespace := &corev1.Namespace{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      ns,
+			Namespace: ns,
+		},
+	}
+	clusterName := "cluster1"
+	cluster := &pxcv1.PerconaXtraDBCluster{}
+	reconciler := &ReconcilePerconaXtraDBClusterBackup{}
+	BeforeAll(func() {
+		By("Creating the Namespace to perform the tests")
+		err := k8sClient.Create(ctx, namespace)
+		Expect(err).To(Not(HaveOccurred()))
+
+		By("Creating a PXC Cluster to perform the tests")
+		cluster, err = readDefaultCR(clusterName, ns)
+		Expect(err).To(Not(HaveOccurred()))
+
+		err = k8sClient.Create(ctx, cluster)
+		Expect(err).To(Not(HaveOccurred()))
+
+		mockPXCReadyStatus(ctx, cluster)
+		reconciler = newTestReconciler()
+	})
+
+	AfterAll(func() {
+		// TODO(user): If you extend these tests with additional contexts, you MUST be aware of
+		// the current delete namespace limitations. More info: https://book.kubebuilder.io/reference/envtest.html#testing-considerations
+		By("Deleting the Namespace to perform the tests")
+		_ = k8sClient.Delete(ctx, namespace)
+
+		By("Deleting the PXC Cluster to perform the tests")
+		cluster, err := readDefaultCR(clusterName, ns)
+		Expect(err).To(Not(HaveOccurred()))
+		err = k8sClient.Delete(ctx, cluster)
+		Expect(err).To(Not(HaveOccurred()))
+	})
+
+	It("Should delete the backup job when running deadline is exceeded", func() {
+		pxcBackup, err := newBackup("backup1", ns)
+		Expect(err).To(Not(HaveOccurred()))
+		pxcBackup.Spec.RunningDeadlineSeconds = ptr.To(int64(5))
+
+		pxcBackupReq := reconcile.Request{NamespacedName: types.NamespacedName{
+			Name:      pxcBackup.Name,
+			Namespace: pxcBackup.Namespace,
+		}}
+
+		err = k8sClient.Create(ctx, pxcBackup)
+		Expect(err).To(Not(HaveOccurred()))
+
+		_, err = reconciler.Reconcile(ctx, pxcBackupReq)
+		Expect(err).To(Succeed())
+
+		err = k8sClient.Get(ctx, pxcBackupReq.NamespacedName, pxcBackup)
+		Expect(err).To(Not(HaveOccurred()))
+		Expect(pxcBackup.Status.State).To(Equal(pxcv1.BackupStarting))
+
+		time.Sleep(6 * time.Second)
+
+		_, err = reconciler.Reconcile(ctx, pxcBackupReq)
+		Expect(err).To(Not(HaveOccurred()))
+
+		// Make sure that the backup is marked as failed
+		err = k8sClient.Get(ctx, pxcBackupReq.NamespacedName, pxcBackup)
+		Expect(err).To(Not(HaveOccurred()))
+		Expect(pxcBackup.Status.State).To(Equal(pxcv1.BackupFailed))
+		Expect(pxcBackup.Status.Error).To(ContainSubstring("running deadline seconds exceeded"))
+
+		// Make sure that the job is deleted
+		job := backup.New(cluster).Job(pxcBackup, cluster)
+		err = k8sClient.Get(ctx, types.NamespacedName{
+			Name:      job.Name,
+			Namespace: job.Namespace,
+		}, job)
+		Expect(err).To(HaveOccurred())
+		Expect(k8serrors.IsNotFound(err)).To(BeTrue())
+	})
+})
+
+func newBackup(name, ns string) (*pxcv1.PerconaXtraDBClusterBackup, error) {
+	bkp, err := readDefaultBackup(name, ns)
+	if err != nil {
+		return nil, err
+	}
+	bkp.Spec.PXCCluster = "cluster1"
+	return bkp, nil
+}
+
+func newTestReconciler() *ReconcilePerconaXtraDBClusterBackup {
+	cli, _ := clientcmd.NewClient()
+	return &ReconcilePerconaXtraDBClusterBackup{
+		client: k8sClient,
+		scheme: k8sClient.Scheme(),
+		serverVersion: &version.ServerVersion{
+			Platform: version.PlatformKubernetes,
+		},
+		clientcmd:           cli,
+		chLimit:             make(chan struct{}, 10),
+		bcpDeleteInProgress: new(sync.Map),
+	}
+}
+
+func mockPXCReadyStatus(ctx context.Context, cluster *pxcv1.PerconaXtraDBCluster) {
+	cluster.Status.Status = pxcv1.AppStateReady
+	cluster.Status.PXC.Ready = cluster.Spec.PXC.Size
+	err := k8sClient.Status().Update(ctx, cluster)
+	Expect(err).To(Not(HaveOccurred()))
+}
diff --git a/pkg/controller/pxcbackup/deadline.go b/pkg/controller/pxcbackup/deadline.go
index b2ddf58a3..3397bfd96 100644
--- a/pkg/controller/pxcbackup/deadline.go
+++ b/pkg/controller/pxcbackup/deadline.go
@@ -14,10 +14,12 @@ import (
 	api "github.com/percona/percona-xtradb-cluster-operator/pkg/apis/pxc/v1"
 )
 
+var errDeadlineExceeded = errors.New("deadline exceeded")
+
 var (
-	errSuspendedDeadlineExceeded = errors.New("suspended deadline seconds exceeded")
-	errStartingDeadlineExceeded  = errors.New("starting deadline seconds exceeded")
-	errRunningDeadlineExceeded   = errors.New("running deadline seconds exceeded")
+	errSuspendedDeadlineExceeded = errors.Wrap(errDeadlineExceeded, "suspended deadline seconds exceeded")
+	errStartingDeadlineExceeded  = errors.Wrap(errDeadlineExceeded, "starting deadline seconds exceeded")
+	errRunningDeadlineExceeded   = errors.Wrap(errDeadlineExceeded, "running deadline seconds exceeded")
 )
 
 func (r *ReconcilePerconaXtraDBClusterBackup) checkDeadlines(ctx context.Context, cluster *api.PerconaXtraDBCluster, cr *api.PerconaXtraDBClusterBackup) error {
diff --git a/pkg/controller/pxcbackup/deadline_test.go b/pkg/controller/pxcbackup/deadline_test.go
index 6a922fb07..d40398803 100644
--- a/pkg/controller/pxcbackup/deadline_test.go
+++ b/pkg/controller/pxcbackup/deadline_test.go
@@ -302,7 +302,7 @@ var _ = Describe("Suspended deadline", func() {
 		err = r.checkSuspendedDeadline(context.Background(), cluster, cr)
 		Expect(err).To(HaveOccurred())
 
-		err = r.cleanUpSuspendedJob(context.Background(), cluster, cr)
+		err = r.cleanUpJob(context.Background(), cluster, cr)
 		Expect(err).NotTo(HaveOccurred())
 
 		j := new(batchv1.Job)
diff --git a/pkg/controller/pxcbackup/suite_test.go b/pkg/controller/pxcbackup/suite_test.go
index b1eb709fb..52d967068 100644
--- a/pkg/controller/pxcbackup/suite_test.go
+++ b/pkg/controller/pxcbackup/suite_test.go
@@ -12,19 +12,63 @@ import (
 	"k8s.io/apimachinery/pkg/util/yaml"
 	k8sversion "k8s.io/apimachinery/pkg/version"
 	"k8s.io/client-go/kubernetes/scheme"
+	"k8s.io/client-go/rest"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/client/fake" // nolint
+	"sigs.k8s.io/controller-runtime/pkg/envtest"
 	logf "sigs.k8s.io/controller-runtime/pkg/log"
+	"sigs.k8s.io/controller-runtime/pkg/log/zap"
 
+	"github.com/percona/percona-xtradb-cluster-operator/pkg/apis"
 	pxcv1 "github.com/percona/percona-xtradb-cluster-operator/pkg/apis/pxc/v1"
 	"github.com/percona/percona-xtradb-cluster-operator/pkg/version"
 )
 
+var (
+	cfg       *rest.Config
+	k8sClient client.Client
+	testEnv   *envtest.Environment
+)
+
 func TestPxcbackup(t *testing.T) {
 	RegisterFailHandler(Fail)
 	RunSpecs(t, "PerconaXtraDBClusterBackup Suite")
 }
 
+var _ = BeforeSuite(func() {
+	logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true)))
+
+	errEnv := os.Setenv("WATCH_NAMESPACE", "default")
+	Expect(errEnv).NotTo(HaveOccurred())
+
+	By("bootstrapping test environment")
+	testEnv = &envtest.Environment{
+		CRDDirectoryPaths:     []string{filepath.Join("..", "..", "..", "config", "crd", "bases")},
+		ErrorIfCRDPathMissing: true,
+	}
+
+	var err error
+	cfg, err = testEnv.Start()
+	Expect(err).NotTo(HaveOccurred())
+	Expect(cfg).NotTo(BeNil())
+
+	err = apis.AddToScheme(scheme.Scheme)
+	Expect(err).NotTo(HaveOccurred())
+
+	k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme})
+	Expect(err).NotTo(HaveOccurred())
+	Expect(k8sClient).NotTo(BeNil())
+})
+
+var _ = AfterSuite(func() {
+	By("tearing down the test environment")
+	errEnv := os.Unsetenv("WATCH_NAMESPACE")
+	Expect(errEnv).NotTo(HaveOccurred())
+
+	err := testEnv.Stop()
+	Expect(err).NotTo(HaveOccurred())
+})
+
 func readDefaultCR(name, namespace string) (*pxcv1.PerconaXtraDBCluster, error) {
 	data, err := os.ReadFile(filepath.Join("..", "..", "..", "deploy", "cr.yaml"))
 	if err != nil {