From cb6172cfb7fd018deae76f9f7e83660ac15b7db9 Mon Sep 17 00:00:00 2001 From: 400Ping Date: Thu, 27 Nov 2025 18:43:04 +0800 Subject: [PATCH 01/34] [Bug][RayJob] Sidecar mode shouldn't restart head pod when head pod is deleted Signed-off-by: 400Ping --- .../controllers/ray/raycluster_controller.go | 16 ++++++++-------- .../controllers/ray/rayjob_controller.go | 3 +++ ray-operator/controllers/ray/utils/constant.go | 1 + 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/ray-operator/controllers/ray/raycluster_controller.go b/ray-operator/controllers/ray/raycluster_controller.go index 76b21e47527..36752c4c4be 100644 --- a/ray-operator/controllers/ray/raycluster_controller.go +++ b/ray-operator/controllers/ray/raycluster_controller.go @@ -680,7 +680,8 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv } } else if len(headPods.Items) == 0 { originatedFrom := utils.GetCRDType(instance.Labels[utils.RayOriginatedFromCRDLabelKey]) - if originatedFrom == utils.RayJobCRD { + if originatedFrom == utils.RayJobCRD && + instance.Labels[utils.RayJobSubmissionModeLabelKey] == string(rayv1.SidecarMode) && // Recreating the head Pod if the RayCluster created by RayJob is provisioned doesn't help RayJob. // // Case 1: GCS fault tolerance is disabled @@ -692,13 +693,12 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv // // In this case, the worker Pods will not be killed by the new head Pod when it is created, but the submission ID has already been // used by the old Ray job, so the new Ray job will fail. - if meta.IsStatusConditionTrue(instance.Status.Conditions, string(rayv1.RayClusterProvisioned)) { - logger.Info( - "reconcilePods: Found 0 head Pods for a RayJob-managed RayCluster; skipping head creation to let RayJob controller handle the failure", - "rayCluster", instance.Name, - ) - return nil - } + meta.IsStatusConditionTrue(instance.Status.Conditions, string(rayv1.RayClusterProvisioned)) { + logger.Info( + "reconcilePods: Found 0 head Pods for a sidecar-mode RayJob-managed RayCluster; skipping head creation to let RayJob controller handle the failure", + "rayCluster", instance.Name, + ) + return nil } // Create head Pod if it does not exist. 
logger.Info("reconcilePods: Found 0 head Pods; creating a head Pod for the RayCluster.") diff --git a/ray-operator/controllers/ray/rayjob_controller.go b/ray-operator/controllers/ray/rayjob_controller.go index e09810022f0..c8663534714 100644 --- a/ray-operator/controllers/ray/rayjob_controller.go +++ b/ray-operator/controllers/ray/rayjob_controller.go @@ -940,6 +940,9 @@ func (r *RayJobReconciler) constructRayClusterForRayJob(rayJobInstance *rayv1.Ra } labels[utils.RayOriginatedFromCRNameLabelKey] = rayJobInstance.Name labels[utils.RayOriginatedFromCRDLabelKey] = utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD) + if rayJobInstance.Spec.SubmissionMode == rayv1.SidecarMode { + labels[utils.RayJobSubmissionModeLabelKey] = string(rayv1.SidecarMode) + } rayCluster := &rayv1.RayCluster{ ObjectMeta: metav1.ObjectMeta{ Labels: labels, diff --git a/ray-operator/controllers/ray/utils/constant.go b/ray-operator/controllers/ray/utils/constant.go index 3e0e86d1096..10c30d5ad06 100644 --- a/ray-operator/controllers/ray/utils/constant.go +++ b/ray-operator/controllers/ray/utils/constant.go @@ -26,6 +26,7 @@ const ( HashWithoutReplicasAndWorkersToDeleteKey = "ray.io/hash-without-replicas-and-workers-to-delete" NumWorkerGroupsKey = "ray.io/num-worker-groups" KubeRayVersion = "ray.io/kuberay-version" + RayJobSubmissionModeLabelKey = "ray.io/job-submission-mode" // Labels for feature RayMultihostIndexing // From 415ee29c38d51dc688e1e0c328dbda02860cf578 Mon Sep 17 00:00:00 2001 From: 400Ping Date: Fri, 28 Nov 2025 21:26:37 +0800 Subject: [PATCH 02/34] [fix] fix CI error Signed-off-by: 400Ping --- ray-operator/controllers/ray/rayjob_controller.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ray-operator/controllers/ray/rayjob_controller.go b/ray-operator/controllers/ray/rayjob_controller.go index c8663534714..04e91a70da0 100644 --- a/ray-operator/controllers/ray/rayjob_controller.go +++ b/ray-operator/controllers/ray/rayjob_controller.go @@ -934,12 +934,13 @@ func (r *RayJobReconciler) getOrCreateRayClusterInstance(ctx context.Context, ra } func (r *RayJobReconciler) constructRayClusterForRayJob(rayJobInstance *rayv1.RayJob, rayClusterName string) (*rayv1.RayCluster, error) { - labels := make(map[string]string, len(rayJobInstance.Labels)) + labels := map[string]string{} for key, value := range rayJobInstance.Labels { labels[key] = value } labels[utils.RayOriginatedFromCRNameLabelKey] = rayJobInstance.Name labels[utils.RayOriginatedFromCRDLabelKey] = utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD) + labels[utils.RayJobSubmissionModeLabelKey] = string(rayJobInstance.Spec.SubmissionMode) if rayJobInstance.Spec.SubmissionMode == rayv1.SidecarMode { labels[utils.RayJobSubmissionModeLabelKey] = string(rayv1.SidecarMode) } @@ -1023,7 +1024,7 @@ func (r *RayJobReconciler) checkSubmitterAndUpdateStatusIfNeeded(ctx context.Con // If head pod is deleted, mark the RayJob as failed shouldUpdate = true rayJob.Status.JobDeploymentStatus = rayv1.JobDeploymentStatusFailed - rayJob.Status.Reason = rayv1.AppFailed + rayJob.Status.Reason = rayv1.SubmissionFailed rayJob.Status.Message = "Ray head pod not found." 
return } From 2bbf8cb506630f1c8ce4e2a5cab9944c5ee764b1 Mon Sep 17 00:00:00 2001 From: 400Ping Date: Mon, 15 Dec 2025 23:38:16 +0800 Subject: [PATCH 03/34] update Signed-off-by: 400Ping --- .../controllers/ray/raycluster_controller.go | 14 ++++-- .../controllers/ray/rayjob_controller.go | 44 +++++++++++++++++++ .../controllers/ray/utils/constant.go | 2 + .../e2erayjob/rayjob_sidecar_mode_test.go | 25 +++++++++++ 4 files changed, 82 insertions(+), 3 deletions(-) diff --git a/ray-operator/controllers/ray/raycluster_controller.go b/ray-operator/controllers/ray/raycluster_controller.go index 36752c4c4be..882472bc6e4 100644 --- a/ray-operator/controllers/ray/raycluster_controller.go +++ b/ray-operator/controllers/ray/raycluster_controller.go @@ -679,9 +679,7 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv return errstd.New(reason) } } else if len(headPods.Items) == 0 { - originatedFrom := utils.GetCRDType(instance.Labels[utils.RayOriginatedFromCRDLabelKey]) - if originatedFrom == utils.RayJobCRD && - instance.Labels[utils.RayJobSubmissionModeLabelKey] == string(rayv1.SidecarMode) && + if shouldSkipHeadPodRestart(instance) && // Recreating the head Pod if the RayCluster created by RayJob is provisioned doesn't help RayJob. // // Case 1: GCS fault tolerance is disabled @@ -1094,6 +1092,16 @@ func (r *RayClusterReconciler) reconcileMultiHostWorkerGroup(ctx context.Context return nil } +func shouldSkipHeadPodRestart(cluster *rayv1.RayCluster) bool { + if cluster == nil { + return false + } + if utils.GetCRDType(cluster.Labels[utils.RayOriginatedFromCRDLabelKey]) != utils.RayJobCRD { + return false + } + return cluster.Labels[utils.RayJobDisableHeadNodeRestartLabelKey] == "true" +} + // shouldDeletePod returns whether the Pod should be deleted and the reason // // @param pod: The Pod to be checked. diff --git a/ray-operator/controllers/ray/rayjob_controller.go b/ray-operator/controllers/ray/rayjob_controller.go index 04e91a70da0..a24395ac783 100644 --- a/ray-operator/controllers/ray/rayjob_controller.go +++ b/ray-operator/controllers/ray/rayjob_controller.go @@ -109,6 +109,18 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request) return ctrl.Result{}, nil } + if rayJobInstance.Spec.SubmissionMode == rayv1.SidecarMode && + rayJobInstance.ObjectMeta.DeletionTimestamp.IsZero() && + addSidecarDisableHeadRestartLabel(rayJobInstance) { + logger.Info("Add label to disable head Pod recreation for sidecar-mode RayJob", + "label", utils.RayJobDisableHeadNodeRestartLabelKey) + if err := r.Update(ctx, rayJobInstance); err != nil { + logger.Error(err, "Failed to update RayJob labels") + return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err + } + return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, nil + } + if !rayJobInstance.ObjectMeta.DeletionTimestamp.IsZero() { logger.Info("RayJob is being deleted", "DeletionTimestamp", rayJobInstance.ObjectMeta.DeletionTimestamp) // If the JobStatus is not terminal, it is possible that the Ray job is still running. 
This includes @@ -924,6 +936,15 @@ func (r *RayJobReconciler) getOrCreateRayClusterInstance(ctx context.Context, ra } logger.Info("Found the associated RayCluster for RayJob", "RayCluster", rayClusterNamespacedName) + if rayJobInstance.Spec.SubmissionMode == rayv1.SidecarMode && + len(rayJobInstance.Spec.ClusterSelector) == 0 && + ensureRayClusterHasSidecarDisableHeadRestartLabel(rayClusterInstance) { + if err := r.Update(ctx, rayClusterInstance); err != nil { + logger.Error(err, "Failed to update RayCluster with sidecar restart label", "RayCluster", rayClusterNamespacedName) + return nil, err + } + } + // Verify that RayJob is not in cluster selector mode first to avoid nil pointer dereference error during spec comparison. // This is checked by ensuring len(rayJobInstance.Spec.ClusterSelector) equals 0. if len(rayJobInstance.Spec.ClusterSelector) == 0 && !utils.CompareJsonStruct(rayClusterInstance.Spec, *rayJobInstance.Spec.RayClusterSpec) { @@ -943,6 +964,7 @@ func (r *RayJobReconciler) constructRayClusterForRayJob(rayJobInstance *rayv1.Ra labels[utils.RayJobSubmissionModeLabelKey] = string(rayJobInstance.Spec.SubmissionMode) if rayJobInstance.Spec.SubmissionMode == rayv1.SidecarMode { labels[utils.RayJobSubmissionModeLabelKey] = string(rayv1.SidecarMode) + labels[utils.RayJobDisableHeadNodeRestartLabelKey] = "true" } rayCluster := &rayv1.RayCluster{ ObjectMeta: metav1.ObjectMeta{ @@ -977,6 +999,28 @@ func (r *RayJobReconciler) constructRayClusterForRayJob(rayJobInstance *rayv1.Ra return rayCluster, nil } +func addSidecarDisableHeadRestartLabel(rayJob *rayv1.RayJob) bool { + if rayJob.Labels == nil { + rayJob.Labels = map[string]string{} + } + if _, exists := rayJob.Labels[utils.RayJobDisableHeadNodeRestartLabelKey]; exists { + return false + } + rayJob.Labels[utils.RayJobDisableHeadNodeRestartLabelKey] = "true" + return true +} + +func ensureRayClusterHasSidecarDisableHeadRestartLabel(rayCluster *rayv1.RayCluster) bool { + if rayCluster.Labels == nil { + rayCluster.Labels = map[string]string{} + } + if _, exists := rayCluster.Labels[utils.RayJobDisableHeadNodeRestartLabelKey]; exists { + return false + } + rayCluster.Labels[utils.RayJobDisableHeadNodeRestartLabelKey] = "true" + return true +} + func updateStatusToSuspendingIfNeeded(ctx context.Context, rayJob *rayv1.RayJob) bool { logger := ctrl.LoggerFrom(ctx) if !rayJob.Spec.Suspend { diff --git a/ray-operator/controllers/ray/utils/constant.go b/ray-operator/controllers/ray/utils/constant.go index 10c30d5ad06..8c47995f467 100644 --- a/ray-operator/controllers/ray/utils/constant.go +++ b/ray-operator/controllers/ray/utils/constant.go @@ -27,6 +27,8 @@ const ( NumWorkerGroupsKey = "ray.io/num-worker-groups" KubeRayVersion = "ray.io/kuberay-version" RayJobSubmissionModeLabelKey = "ray.io/job-submission-mode" + // RayJobDisableHeadNodeRestartLabelKey marks RayClusters created for sidecar-mode RayJobs to skip head Pod recreation after provisioning. + RayJobDisableHeadNodeRestartLabelKey = "ray.io/disable-provisioned-head-restart" // Labels for feature RayMultihostIndexing // diff --git a/ray-operator/test/e2erayjob/rayjob_sidecar_mode_test.go b/ray-operator/test/e2erayjob/rayjob_sidecar_mode_test.go index d3f3ed6058a..78d888d10f4 100644 --- a/ray-operator/test/e2erayjob/rayjob_sidecar_mode_test.go +++ b/ray-operator/test/e2erayjob/rayjob_sidecar_mode_test.go @@ -2,6 +2,7 @@ package e2erayjob import ( "testing" + "time" . 
"github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" @@ -10,6 +11,7 @@ import ( corev1ac "k8s.io/client-go/applyconfigurations/core/v1" rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" + "github.com/ray-project/kuberay/ray-operator/controllers/ray/common" "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils" rayv1ac "github.com/ray-project/kuberay/ray-operator/pkg/client/applyconfiguration/ray/v1" . "github.com/ray-project/kuberay/ray-operator/test/support" @@ -174,17 +176,40 @@ env_vars: g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium). Should(WithTransform(RayJobStatus, Equal(rayv1.JobStatusRunning))) + g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium). + Should(WithTransform(func(job *rayv1.RayJob) string { + return job.Labels[utils.RayJobDisableHeadNodeRestartLabelKey] + }, Equal("true"))) + // Fetch RayCluster and delete the head Pod rayJob, err = GetRayJob(test, rayJob.Namespace, rayJob.Name) g.Expect(err).NotTo(HaveOccurred()) rayCluster, err := GetRayCluster(test, rayJob.Namespace, rayJob.Status.RayClusterName) g.Expect(err).NotTo(HaveOccurred()) + g.Expect(rayCluster.Labels[utils.RayJobDisableHeadNodeRestartLabelKey]).To(Equal("true")) headPod, err := GetHeadPod(test, rayCluster) g.Expect(err).NotTo(HaveOccurred()) LogWithTimestamp(test.T(), "Deleting head Pod %s/%s for RayCluster %s", headPod.Namespace, headPod.Name, rayCluster.Name) err = test.Client().Core().CoreV1().Pods(headPod.Namespace).Delete(test.Ctx(), headPod.Name, metav1.DeleteOptions{}) g.Expect(err).NotTo(HaveOccurred()) + g.Eventually(func() int { + pods, listErr := test.Client().Core().CoreV1().Pods(rayCluster.Namespace).List( + test.Ctx(), common.RayClusterHeadPodsAssociationOptions(rayCluster).ToMetaV1ListOptions()) + if listErr != nil { + return -1 + } + return len(pods.Items) + }, TestTimeoutMedium, 2*time.Second).Should(Equal(0)) + g.Consistently(func() int { + pods, listErr := test.Client().Core().CoreV1().Pods(rayCluster.Namespace).List( + test.Ctx(), common.RayClusterHeadPodsAssociationOptions(rayCluster).ToMetaV1ListOptions()) + if listErr != nil { + return -1 + } + return len(pods.Items) + }, TestTimeoutShort, 2*time.Second).Should(Equal(0)) + // After head pod deletion, controller should mark RayJob as Failed with a specific message g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium). Should(WithTransform(RayJobDeploymentStatus, Equal(rayv1.JobDeploymentStatusFailed))) From 2bf33f396846bf669bd5d6d4942e58708e922bd4 Mon Sep 17 00:00:00 2001 From: 400Ping Date: Wed, 17 Dec 2025 09:29:08 +0800 Subject: [PATCH 04/34] reunite if statement Signed-off-by: 400Ping --- ray-operator/controllers/ray/raycluster_controller.go | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/ray-operator/controllers/ray/raycluster_controller.go b/ray-operator/controllers/ray/raycluster_controller.go index 882472bc6e4..e94ecab1250 100644 --- a/ray-operator/controllers/ray/raycluster_controller.go +++ b/ray-operator/controllers/ray/raycluster_controller.go @@ -680,17 +680,6 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv } } else if len(headPods.Items) == 0 { if shouldSkipHeadPodRestart(instance) && - // Recreating the head Pod if the RayCluster created by RayJob is provisioned doesn't help RayJob. 
- // - // Case 1: GCS fault tolerance is disabled - // - // In this case, the worker Pods will be killed by the new head Pod when it is created, so the new Ray job will not be running in - // a "provisioned" cluster. - // - // Case 2: GCS fault tolerance is enabled - // - // In this case, the worker Pods will not be killed by the new head Pod when it is created, but the submission ID has already been - // used by the old Ray job, so the new Ray job will fail. meta.IsStatusConditionTrue(instance.Status.Conditions, string(rayv1.RayClusterProvisioned)) { logger.Info( "reconcilePods: Found 0 head Pods for a sidecar-mode RayJob-managed RayCluster; skipping head creation to let RayJob controller handle the failure", From a97a3b517e39b8e5510fdf2b3066ba66c7e8d30f Mon Sep 17 00:00:00 2001 From: 400Ping Date: Wed, 17 Dec 2025 15:24:22 +0800 Subject: [PATCH 05/34] update Signed-off-by: 400Ping --- ray-operator/controllers/ray/raycluster_controller.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/ray-operator/controllers/ray/raycluster_controller.go b/ray-operator/controllers/ray/raycluster_controller.go index e94ecab1250..4f953fa8b84 100644 --- a/ray-operator/controllers/ray/raycluster_controller.go +++ b/ray-operator/controllers/ray/raycluster_controller.go @@ -681,6 +681,15 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv } else if len(headPods.Items) == 0 { if shouldSkipHeadPodRestart(instance) && meta.IsStatusConditionTrue(instance.Status.Conditions, string(rayv1.RayClusterProvisioned)) { + // Recreating the head Pod if the RayCluster created by RayJob is provisioned doesn't help RayJob. + // + // Case 1: GCS fault tolerance is disabled + // In this case, the worker Pods will be killed by the new head Pod when it is created, so the new Ray job will not be running in + // a "provisioned" cluster. + // + // Case 2: GCS fault tolerance is enabled + // In this case, the worker Pods will not be killed by the new head Pod when it is created, but the submission ID has already been + // used by the old Ray job, so the new Ray job will fail. 
logger.Info( "reconcilePods: Found 0 head Pods for a sidecar-mode RayJob-managed RayCluster; skipping head creation to let RayJob controller handle the failure", "rayCluster", instance.Name, From c4bfd244e076c831456ae16a98aa408b6d5b0617 Mon Sep 17 00:00:00 2001 From: 400Ping Date: Wed, 17 Dec 2025 16:54:26 +0800 Subject: [PATCH 06/34] fix ci error Signed-off-by: 400Ping --- ray-operator/controllers/ray/raycluster_controller.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ray-operator/controllers/ray/raycluster_controller.go b/ray-operator/controllers/ray/raycluster_controller.go index 4f953fa8b84..dd4ac4f14a9 100644 --- a/ray-operator/controllers/ray/raycluster_controller.go +++ b/ray-operator/controllers/ray/raycluster_controller.go @@ -826,7 +826,7 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv } } } - logger.Info("reconcilePods", "found existing replica indices", "group", worker.GroupName, "indices", validReplicaIndices) + logger.Info("reconcilePods: found existing replica indices", "group", worker.GroupName, "indices", validReplicaIndices) } if diff > 0 { // pods need to be added From e7499adfcf41db9415586c35d8b9e9913912ffa6 Mon Sep 17 00:00:00 2001 From: 400Ping Date: Thu, 18 Dec 2025 23:37:41 +0800 Subject: [PATCH 07/34] fix Signed-off-by: 400Ping --- ray-operator/test/e2erayjob/rayjob_test.go | 26 +++++++++++++++------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/ray-operator/test/e2erayjob/rayjob_test.go b/ray-operator/test/e2erayjob/rayjob_test.go index 617510718ec..0b2034a9cfb 100644 --- a/ray-operator/test/e2erayjob/rayjob_test.go +++ b/ray-operator/test/e2erayjob/rayjob_test.go @@ -15,6 +15,7 @@ import ( rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" "github.com/ray-project/kuberay/ray-operator/controllers/ray" + "github.com/ray-project/kuberay/ray-operator/controllers/ray/common" "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils" rayv1ac "github.com/ray-project/kuberay/ray-operator/pkg/client/applyconfiguration/ray/v1" . "github.com/ray-project/kuberay/ray-operator/test/support" @@ -279,7 +280,7 @@ env_vars: To(WithTransform(RayJobReason, Equal(rayv1.DeadlineExceeded))) }) - test.T().Run("RayJob fails when head Pod is deleted when job is running", func(_ *testing.T) { + test.T().Run("RayJob recreates head Pod when deleted while running", func(_ *testing.T) { rayJobAC := rayv1ac.RayJob("delete-head-after-submit", namespace.Name). WithSpec(rayv1ac.RayJobSpec(). WithRayClusterSpec(NewRayClusterSpec()). 
@@ -289,6 +290,7 @@ env_vars: rayJob, err := test.Client().Ray().RayV1().RayJobs(namespace.Name).Apply(test.Ctx(), rayJobAC, TestApplyOptions) g.Expect(err).NotTo(HaveOccurred()) LogWithTimestamp(test.T(), "Created RayJob %s/%s successfully", rayJob.Namespace, rayJob.Name) + g.Expect(rayJob.Labels).To(Not(HaveKey(utils.RayJobDisableHeadNodeRestartLabelKey))) // Wait until the RayJob's job status transitions to Running LogWithTimestamp(test.T(), "Waiting for RayJob %s/%s to be 'Running'", rayJob.Namespace, rayJob.Name) @@ -300,20 +302,28 @@ env_vars: g.Expect(err).NotTo(HaveOccurred()) rayCluster, err := GetRayCluster(test, rayJob.Namespace, rayJob.Status.RayClusterName) g.Expect(err).NotTo(HaveOccurred()) + g.Expect(rayCluster.Labels).To(Not(HaveKeyWithValue(utils.RayJobDisableHeadNodeRestartLabelKey, "true"))) headPod, err := GetHeadPod(test, rayCluster) g.Expect(err).NotTo(HaveOccurred()) LogWithTimestamp(test.T(), "Deleting head Pod %s/%s for RayCluster %s", headPod.Namespace, headPod.Name, rayCluster.Name) err = test.Client().Core().CoreV1().Pods(headPod.Namespace).Delete(test.Ctx(), headPod.Name, metav1.DeleteOptions{}) g.Expect(err).NotTo(HaveOccurred()) - // After head pod deletion, controller should mark RayJob as Failed with a specific message - g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium). - Should(WithTransform(RayJobDeploymentStatus, Equal(rayv1.JobDeploymentStatusFailed))) + // Head pod should be recreated for non-sidecar modes and the RayJob should keep running/finish. + g.Eventually(func() int { + pods, listErr := test.Client().Core().CoreV1().Pods(rayCluster.Namespace).List( + test.Ctx(), common.RayClusterHeadPodsAssociationOptions(rayCluster).ToMetaV1ListOptions()) + if listErr != nil { + return -1 + } + return len(pods.Items) + }, TestTimeoutMedium, 2*time.Second).Should(Equal(1)) + g.Consistently(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutShort). + ShouldNot(WithTransform(RayJobDeploymentStatus, Equal(rayv1.JobDeploymentStatusFailed))) g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium). - Should(WithTransform(RayJobReason, Or( - Equal(rayv1.JobDeploymentStatusTransitionGracePeriodExceeded), - Equal(rayv1.SubmissionFailed), - ))) + Should(WithTransform(func(job *rayv1.RayJob) rayv1.JobDeploymentStatus { + return job.Status.JobDeploymentStatus + }, Or(Equal(rayv1.JobDeploymentStatusRunning), Equal(rayv1.JobDeploymentStatusComplete)))) // Cleanup err = test.Client().Ray().RayV1().RayJobs(namespace.Name).Delete(test.Ctx(), rayJob.Name, metav1.DeleteOptions{}) g.Expect(err).NotTo(HaveOccurred()) From 714d76029ecf8c5ad0f631e440e5b1d816110c37 Mon Sep 17 00:00:00 2001 From: 400Ping Date: Thu, 18 Dec 2025 23:50:58 +0800 Subject: [PATCH 08/34] put back unnecessary comment deletion Signed-off-by: 400Ping --- ray-operator/controllers/ray/raycluster_controller.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ray-operator/controllers/ray/raycluster_controller.go b/ray-operator/controllers/ray/raycluster_controller.go index dd4ac4f14a9..0438f721433 100644 --- a/ray-operator/controllers/ray/raycluster_controller.go +++ b/ray-operator/controllers/ray/raycluster_controller.go @@ -684,10 +684,12 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv // Recreating the head Pod if the RayCluster created by RayJob is provisioned doesn't help RayJob. 
// // Case 1: GCS fault tolerance is disabled + // // In this case, the worker Pods will be killed by the new head Pod when it is created, so the new Ray job will not be running in // a "provisioned" cluster. // // Case 2: GCS fault tolerance is enabled + // // In this case, the worker Pods will not be killed by the new head Pod when it is created, but the submission ID has already been // used by the old Ray job, so the new Ray job will fail. logger.Info( From 60aba9c2012ad0dc785aecc3d585f6254b2e3308 Mon Sep 17 00:00:00 2001 From: Future-Outlier Date: Mon, 22 Dec 2025 13:08:39 +0800 Subject: [PATCH 09/34] Better rayjob logic Signed-off-by: Future-Outlier --- .../controllers/ray/rayjob_controller.go | 47 +------------------ 1 file changed, 2 insertions(+), 45 deletions(-) diff --git a/ray-operator/controllers/ray/rayjob_controller.go b/ray-operator/controllers/ray/rayjob_controller.go index a24395ac783..66acf2e1bcc 100644 --- a/ray-operator/controllers/ray/rayjob_controller.go +++ b/ray-operator/controllers/ray/rayjob_controller.go @@ -109,18 +109,6 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request) return ctrl.Result{}, nil } - if rayJobInstance.Spec.SubmissionMode == rayv1.SidecarMode && - rayJobInstance.ObjectMeta.DeletionTimestamp.IsZero() && - addSidecarDisableHeadRestartLabel(rayJobInstance) { - logger.Info("Add label to disable head Pod recreation for sidecar-mode RayJob", - "label", utils.RayJobDisableHeadNodeRestartLabelKey) - if err := r.Update(ctx, rayJobInstance); err != nil { - logger.Error(err, "Failed to update RayJob labels") - return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err - } - return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, nil - } - if !rayJobInstance.ObjectMeta.DeletionTimestamp.IsZero() { logger.Info("RayJob is being deleted", "DeletionTimestamp", rayJobInstance.ObjectMeta.DeletionTimestamp) // If the JobStatus is not terminal, it is possible that the Ray job is still running. This includes @@ -916,6 +904,7 @@ func (r *RayJobReconciler) getOrCreateRayClusterInstance(ctx context.Context, ra if err != nil { return nil, err } + if r.options.BatchSchedulerManager != nil && rayJobInstance.Spec.SubmissionMode == rayv1.K8sJobMode { if scheduler, err := r.options.BatchSchedulerManager.GetScheduler(); err == nil { // Group name is only used for individual pods to specify their task group ("headgroup", "worker-group-1", etc.). @@ -936,15 +925,6 @@ func (r *RayJobReconciler) getOrCreateRayClusterInstance(ctx context.Context, ra } logger.Info("Found the associated RayCluster for RayJob", "RayCluster", rayClusterNamespacedName) - if rayJobInstance.Spec.SubmissionMode == rayv1.SidecarMode && - len(rayJobInstance.Spec.ClusterSelector) == 0 && - ensureRayClusterHasSidecarDisableHeadRestartLabel(rayClusterInstance) { - if err := r.Update(ctx, rayClusterInstance); err != nil { - logger.Error(err, "Failed to update RayCluster with sidecar restart label", "RayCluster", rayClusterNamespacedName) - return nil, err - } - } - // Verify that RayJob is not in cluster selector mode first to avoid nil pointer dereference error during spec comparison. // This is checked by ensuring len(rayJobInstance.Spec.ClusterSelector) equals 0. 
if len(rayJobInstance.Spec.ClusterSelector) == 0 && !utils.CompareJsonStruct(rayClusterInstance.Spec, *rayJobInstance.Spec.RayClusterSpec) { @@ -963,7 +943,6 @@ func (r *RayJobReconciler) constructRayClusterForRayJob(rayJobInstance *rayv1.Ra labels[utils.RayOriginatedFromCRDLabelKey] = utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD) labels[utils.RayJobSubmissionModeLabelKey] = string(rayJobInstance.Spec.SubmissionMode) if rayJobInstance.Spec.SubmissionMode == rayv1.SidecarMode { - labels[utils.RayJobSubmissionModeLabelKey] = string(rayv1.SidecarMode) labels[utils.RayJobDisableHeadNodeRestartLabelKey] = "true" } rayCluster := &rayv1.RayCluster{ @@ -999,28 +978,6 @@ func (r *RayJobReconciler) constructRayClusterForRayJob(rayJobInstance *rayv1.Ra return rayCluster, nil } -func addSidecarDisableHeadRestartLabel(rayJob *rayv1.RayJob) bool { - if rayJob.Labels == nil { - rayJob.Labels = map[string]string{} - } - if _, exists := rayJob.Labels[utils.RayJobDisableHeadNodeRestartLabelKey]; exists { - return false - } - rayJob.Labels[utils.RayJobDisableHeadNodeRestartLabelKey] = "true" - return true -} - -func ensureRayClusterHasSidecarDisableHeadRestartLabel(rayCluster *rayv1.RayCluster) bool { - if rayCluster.Labels == nil { - rayCluster.Labels = map[string]string{} - } - if _, exists := rayCluster.Labels[utils.RayJobDisableHeadNodeRestartLabelKey]; exists { - return false - } - rayCluster.Labels[utils.RayJobDisableHeadNodeRestartLabelKey] = "true" - return true -} - func updateStatusToSuspendingIfNeeded(ctx context.Context, rayJob *rayv1.RayJob) bool { logger := ctrl.LoggerFrom(ctx) if !rayJob.Spec.Suspend { @@ -1068,7 +1025,7 @@ func (r *RayJobReconciler) checkSubmitterAndUpdateStatusIfNeeded(ctx context.Con // If head pod is deleted, mark the RayJob as failed shouldUpdate = true rayJob.Status.JobDeploymentStatus = rayv1.JobDeploymentStatusFailed - rayJob.Status.Reason = rayv1.SubmissionFailed + rayJob.Status.Reason = rayv1.AppFailed rayJob.Status.Message = "Ray head pod not found." return } From 45bb98a4b0b5dd3299ba462323e194625f80b05d Mon Sep 17 00:00:00 2001 From: Future-Outlier Date: Mon, 22 Dec 2025 13:27:40 +0800 Subject: [PATCH 10/34] update Signed-off-by: Future-Outlier --- .../controllers/ray/raycluster_controller.go | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/ray-operator/controllers/ray/raycluster_controller.go b/ray-operator/controllers/ray/raycluster_controller.go index 42ffb19ad38..26b62778646 100644 --- a/ray-operator/controllers/ray/raycluster_controller.go +++ b/ray-operator/controllers/ray/raycluster_controller.go @@ -679,8 +679,8 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv return errstd.New(reason) } } else if len(headPods.Items) == 0 { - if shouldSkipHeadPodRestart(instance) && - meta.IsStatusConditionTrue(instance.Status.Conditions, string(rayv1.RayClusterProvisioned)) { + if meta.IsStatusConditionTrue(instance.Status.Conditions, string(rayv1.RayClusterProvisioned)) && + shouldSkipHeadPodRestart(instance) { // Recreating the head Pod if the RayCluster created by RayJob is provisioned doesn't help RayJob. 
// // Case 1: GCS fault tolerance is disabled @@ -1092,14 +1092,8 @@ func (r *RayClusterReconciler) reconcileMultiHostWorkerGroup(ctx context.Context return nil } -func shouldSkipHeadPodRestart(cluster *rayv1.RayCluster) bool { - if cluster == nil { - return false - } - if utils.GetCRDType(cluster.Labels[utils.RayOriginatedFromCRDLabelKey]) != utils.RayJobCRD { - return false - } - return cluster.Labels[utils.RayJobDisableProvisionedHeadNodeRestartLabelKey] == "true" +func shouldSkipHeadPodRestart(instance *rayv1.RayCluster) bool { + return getCreatorCRDType(*instance) == utils.RayJobCRD && instance.Labels[utils.RayJobDisableProvisionedHeadNodeRestartLabelKey] == "true" } // shouldDeletePod returns whether the Pod should be deleted and the reason From 59ef8b351d584b9034c0468a3088d4e0a64c512c Mon Sep 17 00:00:00 2001 From: Future-Outlier Date: Mon, 22 Dec 2025 13:38:00 +0800 Subject: [PATCH 11/34] update Signed-off-by: Future-Outlier --- ray-operator/test/e2erayjob/rayjob_sidecar_mode_test.go | 5 ----- 1 file changed, 5 deletions(-) diff --git a/ray-operator/test/e2erayjob/rayjob_sidecar_mode_test.go b/ray-operator/test/e2erayjob/rayjob_sidecar_mode_test.go index d50247b5ca6..7db58387910 100644 --- a/ray-operator/test/e2erayjob/rayjob_sidecar_mode_test.go +++ b/ray-operator/test/e2erayjob/rayjob_sidecar_mode_test.go @@ -176,11 +176,6 @@ env_vars: g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium). Should(WithTransform(RayJobStatus, Equal(rayv1.JobStatusRunning))) - g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium). - Should(WithTransform(func(job *rayv1.RayJob) string { - return job.Labels[utils.RayJobDisableProvisionedHeadNodeRestartLabelKey] - }, Equal("true"))) - // Fetch RayCluster and delete the head Pod rayJob, err = GetRayJob(test, rayJob.Namespace, rayJob.Name) g.Expect(err).NotTo(HaveOccurred()) From 2464704798ae955f37378556afbe77aa66c43710 Mon Sep 17 00:00:00 2001 From: Future-Outlier Date: Mon, 22 Dec 2025 13:38:35 +0800 Subject: [PATCH 12/34] update Signed-off-by: Future-Outlier --- ray-operator/controllers/ray/rayjob_controller.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ray-operator/controllers/ray/rayjob_controller.go b/ray-operator/controllers/ray/rayjob_controller.go index 1481828f377..6aad2c2a17c 100644 --- a/ray-operator/controllers/ray/rayjob_controller.go +++ b/ray-operator/controllers/ray/rayjob_controller.go @@ -942,9 +942,13 @@ func (r *RayJobReconciler) constructRayClusterForRayJob(rayJobInstance *rayv1.Ra labels[utils.RayOriginatedFromCRNameLabelKey] = rayJobInstance.Name labels[utils.RayOriginatedFromCRDLabelKey] = utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD) labels[utils.RayJobSubmissionModeLabelKey] = string(rayJobInstance.Spec.SubmissionMode) + if rayJobInstance.Spec.SubmissionMode == rayv1.SidecarMode { labels[utils.RayJobDisableProvisionedHeadNodeRestartLabelKey] = "true" + } else { + labels[utils.RayJobDisableProvisionedHeadNodeRestartLabelKey] = "false" } + rayCluster := &rayv1.RayCluster{ ObjectMeta: metav1.ObjectMeta{ Labels: labels, From 03ff4fe55d050604f91f87ca99b7c2ec482bd7ba Mon Sep 17 00:00:00 2001 From: Ping Date: Wed, 31 Dec 2025 21:48:53 +0800 Subject: [PATCH 13/34] Update ray-operator/test/e2erayjob/rayjob_test.go Co-authored-by: Jun-Hao Wan Signed-off-by: Ping --- ray-operator/test/e2erayjob/rayjob_test.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ray-operator/test/e2erayjob/rayjob_test.go b/ray-operator/test/e2erayjob/rayjob_test.go index 
cb5c27a4e8b..b617255218a 100644 --- a/ray-operator/test/e2erayjob/rayjob_test.go +++ b/ray-operator/test/e2erayjob/rayjob_test.go @@ -321,9 +321,10 @@ env_vars: g.Consistently(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutShort). ShouldNot(WithTransform(RayJobDeploymentStatus, Equal(rayv1.JobDeploymentStatusFailed))) g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium). - Should(WithTransform(func(job *rayv1.RayJob) rayv1.JobDeploymentStatus { - return job.Status.JobDeploymentStatus - }, Or(Equal(rayv1.JobDeploymentStatusRunning), Equal(rayv1.JobDeploymentStatusComplete)))) + Should(WithTransform(RayJobReason, Or( + Equal(rayv1.JobDeploymentStatusTransitionGracePeriodExceeded), + Equal(rayv1.SubmissionFailed), + ))) // Cleanup err = test.Client().Ray().RayV1().RayJobs(namespace.Name).Delete(test.Ctx(), rayJob.Name, metav1.DeleteOptions{}) g.Expect(err).NotTo(HaveOccurred()) From 63957d1210c1d8e1e97b9ec1f4457a1b792743e6 Mon Sep 17 00:00:00 2001 From: Ping Date: Wed, 31 Dec 2025 21:49:36 +0800 Subject: [PATCH 14/34] Update ray-operator/test/e2erayjob/rayjob_test.go Co-authored-by: Nary Yeh <60069744+machichima@users.noreply.github.com> Signed-off-by: Ping --- ray-operator/test/e2erayjob/rayjob_test.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ray-operator/test/e2erayjob/rayjob_test.go b/ray-operator/test/e2erayjob/rayjob_test.go index b617255218a..f42f33e584d 100644 --- a/ray-operator/test/e2erayjob/rayjob_test.go +++ b/ray-operator/test/e2erayjob/rayjob_test.go @@ -302,7 +302,8 @@ env_vars: g.Expect(err).NotTo(HaveOccurred()) rayCluster, err := GetRayCluster(test, rayJob.Namespace, rayJob.Status.RayClusterName) g.Expect(err).NotTo(HaveOccurred()) - g.Expect(rayCluster.Labels).To(Not(HaveKeyWithValue(utils.RayJobDisableProvisionedHeadNodeRestartLabelKey, "true"))) + +g.Expect(rayCluster.Labels[utils.RayJobDisableProvisionedHeadNodeRestartLabelKey]).To(Equal("false")) headPod, err := GetHeadPod(test, rayCluster) g.Expect(err).NotTo(HaveOccurred()) LogWithTimestamp(test.T(), "Deleting head Pod %s/%s for RayCluster %s", headPod.Namespace, headPod.Name, rayCluster.Name) From 3115ae4712bf791a6ed7f042ddbd8f3ae80dcb97 Mon Sep 17 00:00:00 2001 From: 400Ping Date: Thu, 1 Jan 2026 17:48:13 +0800 Subject: [PATCH 15/34] update rayjob test Signed-off-by: 400Ping --- ray-operator/test/e2erayjob/rayjob_test.go | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/ray-operator/test/e2erayjob/rayjob_test.go b/ray-operator/test/e2erayjob/rayjob_test.go index f42f33e584d..b3e4b470b4f 100644 --- a/ray-operator/test/e2erayjob/rayjob_test.go +++ b/ray-operator/test/e2erayjob/rayjob_test.go @@ -280,7 +280,7 @@ env_vars: To(WithTransform(RayJobReason, Equal(rayv1.DeadlineExceeded))) }) - test.T().Run("RayJob recreates head Pod when deleted while running", func(_ *testing.T) { + test.T().Run("RayJob controller recreates the head Pod if it is deleted while the job is running", func(_ *testing.T) { rayJobAC := rayv1ac.RayJob("delete-head-after-submit", namespace.Name). WithSpec(rayv1ac.RayJobSpec(). WithRayClusterSpec(NewRayClusterSpec()). 
@@ -290,7 +290,6 @@ env_vars: rayJob, err := test.Client().Ray().RayV1().RayJobs(namespace.Name).Apply(test.Ctx(), rayJobAC, TestApplyOptions) g.Expect(err).NotTo(HaveOccurred()) LogWithTimestamp(test.T(), "Created RayJob %s/%s successfully", rayJob.Namespace, rayJob.Name) - g.Expect(rayJob.Labels).To(Not(HaveKey(utils.RayJobDisableProvisionedHeadNodeRestartLabelKey))) // Wait until the RayJob's job status transitions to Running LogWithTimestamp(test.T(), "Waiting for RayJob %s/%s to be 'Running'", rayJob.Namespace, rayJob.Name) @@ -302,15 +301,14 @@ env_vars: g.Expect(err).NotTo(HaveOccurred()) rayCluster, err := GetRayCluster(test, rayJob.Namespace, rayJob.Status.RayClusterName) g.Expect(err).NotTo(HaveOccurred()) - -g.Expect(rayCluster.Labels[utils.RayJobDisableProvisionedHeadNodeRestartLabelKey]).To(Equal("false")) + g.Expect(rayCluster.Labels).To(HaveKeyWithValue(utils.RayJobDisableProvisionedHeadNodeRestartLabelKey, "false")) headPod, err := GetHeadPod(test, rayCluster) g.Expect(err).NotTo(HaveOccurred()) LogWithTimestamp(test.T(), "Deleting head Pod %s/%s for RayCluster %s", headPod.Namespace, headPod.Name, rayCluster.Name) err = test.Client().Core().CoreV1().Pods(headPod.Namespace).Delete(test.Ctx(), headPod.Name, metav1.DeleteOptions{}) g.Expect(err).NotTo(HaveOccurred()) - // Head pod should be recreated for non-sidecar modes and the RayJob should keep running/finish. + // Head pod should be recreated for non-sidecar modes. g.Eventually(func() int { pods, listErr := test.Client().Core().CoreV1().Pods(rayCluster.Namespace).List( test.Ctx(), common.RayClusterHeadPodsAssociationOptions(rayCluster).ToMetaV1ListOptions()) @@ -319,13 +317,8 @@ g.Expect(rayCluster.Labels[utils.RayJobDisableProvisionedHeadNodeRestartLabelKey } return len(pods.Items) }, TestTimeoutMedium, 2*time.Second).Should(Equal(1)) - g.Consistently(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutShort). - ShouldNot(WithTransform(RayJobDeploymentStatus, Equal(rayv1.JobDeploymentStatusFailed))) g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium). 
- Should(WithTransform(RayJobReason, Or( - Equal(rayv1.JobDeploymentStatusTransitionGracePeriodExceeded), - Equal(rayv1.SubmissionFailed), - ))) + Should(WithTransform(RayJobDeploymentStatus, Equal(rayv1.JobDeploymentStatusFailed))) // Cleanup err = test.Client().Ray().RayV1().RayJobs(namespace.Name).Delete(test.Ctx(), rayJob.Name, metav1.DeleteOptions{}) g.Expect(err).NotTo(HaveOccurred()) From a5b30a48d642f8445391c828eef06e7a2a06624d Mon Sep 17 00:00:00 2001 From: 400Ping Date: Thu, 1 Jan 2026 21:28:02 +0800 Subject: [PATCH 16/34] fix merge conflict error Signed-off-by: 400Ping --- ray-operator/controllers/ray/raycluster_controller.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ray-operator/controllers/ray/raycluster_controller.go b/ray-operator/controllers/ray/raycluster_controller.go index b55f67188df..962a13c7420 100644 --- a/ray-operator/controllers/ray/raycluster_controller.go +++ b/ray-operator/controllers/ray/raycluster_controller.go @@ -1122,6 +1122,8 @@ func (r *RayClusterReconciler) reconcileMultiHostWorkerGroup(ctx context.Context func shouldSkipHeadPodRestart(instance *rayv1.RayCluster) bool { return getCreatorCRDType(*instance) == utils.RayJobCRD && instance.Labels[utils.RayJobDisableProvisionedHeadNodeRestartLabelKey] == "true" +} + // shouldRecreatePodsForUpgrade checks if any pods need to be recreated based on RayClusterSpec changes func (r *RayClusterReconciler) shouldRecreatePodsForUpgrade(ctx context.Context, instance *rayv1.RayCluster) bool { logger := ctrl.LoggerFrom(ctx) From e77db80b47b5a9079cca67484c838a18fc0606fd Mon Sep 17 00:00:00 2001 From: Ping Date: Thu, 1 Jan 2026 22:18:31 +0800 Subject: [PATCH 17/34] Update ray-operator/test/e2erayjob/rayjob_sidecar_mode_test.go Co-authored-by: fscnick <6858627+fscnick@users.noreply.github.com> Signed-off-by: Ping --- .../e2erayjob/rayjob_sidecar_mode_test.go | 21 ++++++------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/ray-operator/test/e2erayjob/rayjob_sidecar_mode_test.go b/ray-operator/test/e2erayjob/rayjob_sidecar_mode_test.go index 7db58387910..3aae82c91d0 100644 --- a/ray-operator/test/e2erayjob/rayjob_sidecar_mode_test.go +++ b/ray-operator/test/e2erayjob/rayjob_sidecar_mode_test.go @@ -188,22 +188,13 @@ env_vars: err = test.Client().Core().CoreV1().Pods(headPod.Namespace).Delete(test.Ctx(), headPod.Name, metav1.DeleteOptions{}) g.Expect(err).NotTo(HaveOccurred()) - g.Eventually(func() int { - pods, listErr := test.Client().Core().CoreV1().Pods(rayCluster.Namespace).List( + getNumOfHeadPods := func() (int, error) { + pods, err := test.Client().Core().CoreV1().Pods(rayCluster.Namespace).List( test.Ctx(), common.RayClusterHeadPodsAssociationOptions(rayCluster).ToMetaV1ListOptions()) - if listErr != nil { - return -1 - } - return len(pods.Items) - }, TestTimeoutMedium, 2*time.Second).Should(Equal(0)) - g.Consistently(func() int { - pods, listErr := test.Client().Core().CoreV1().Pods(rayCluster.Namespace).List( - test.Ctx(), common.RayClusterHeadPodsAssociationOptions(rayCluster).ToMetaV1ListOptions()) - if listErr != nil { - return -1 - } - return len(pods.Items) - }, TestTimeoutShort, 2*time.Second).Should(Equal(0)) + return len(pods.Items), err + } + g.Eventually(getNumOfHeadPods, TestTimeoutMedium, 2*time.Second).Should(BeZero()) + g.Consistently(getNumOfHeadPods, TestTimeoutShort, 2*time.Second).Should(BeZero()) // After head pod deletion, controller should mark RayJob as Failed with a specific message g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium). 
From 33afa207096076aca2420980a5650d618477b485 Mon Sep 17 00:00:00 2001 From: 400Ping Date: Fri, 2 Jan 2026 18:03:37 +0800 Subject: [PATCH 18/34] update Signed-off-by: 400Ping --- .../e2erayjob/rayjob_sidecar_mode_test.go | 10 ++------ ray-operator/test/e2erayjob/rayjob_test.go | 10 +------- ray-operator/test/support/ray.go | 23 +++++++++++++++++++ 3 files changed, 26 insertions(+), 17 deletions(-) diff --git a/ray-operator/test/e2erayjob/rayjob_sidecar_mode_test.go b/ray-operator/test/e2erayjob/rayjob_sidecar_mode_test.go index 3aae82c91d0..eef5f90e775 100644 --- a/ray-operator/test/e2erayjob/rayjob_sidecar_mode_test.go +++ b/ray-operator/test/e2erayjob/rayjob_sidecar_mode_test.go @@ -11,7 +11,6 @@ import ( corev1ac "k8s.io/client-go/applyconfigurations/core/v1" rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" - "github.com/ray-project/kuberay/ray-operator/controllers/ray/common" "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils" rayv1ac "github.com/ray-project/kuberay/ray-operator/pkg/client/applyconfiguration/ray/v1" . "github.com/ray-project/kuberay/ray-operator/test/support" @@ -188,13 +187,8 @@ env_vars: err = test.Client().Core().CoreV1().Pods(headPod.Namespace).Delete(test.Ctx(), headPod.Name, metav1.DeleteOptions{}) g.Expect(err).NotTo(HaveOccurred()) - getNumOfHeadPods := func() (int, error) { - pods, err := test.Client().Core().CoreV1().Pods(rayCluster.Namespace).List( - test.Ctx(), common.RayClusterHeadPodsAssociationOptions(rayCluster).ToMetaV1ListOptions()) - return len(pods.Items), err - } - g.Eventually(getNumOfHeadPods, TestTimeoutMedium, 2*time.Second).Should(BeZero()) - g.Consistently(getNumOfHeadPods, TestTimeoutShort, 2*time.Second).Should(BeZero()) + g.Eventually(HeadPodOrNil(test, rayCluster), TestTimeoutMedium, 2*time.Second).Should(BeNil()) + g.Consistently(HeadPodOrNil(test, rayCluster), TestTimeoutShort, 2*time.Second).Should(BeNil()) // After head pod deletion, controller should mark RayJob as Failed with a specific message g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium). diff --git a/ray-operator/test/e2erayjob/rayjob_test.go b/ray-operator/test/e2erayjob/rayjob_test.go index b3e4b470b4f..0ff168543ee 100644 --- a/ray-operator/test/e2erayjob/rayjob_test.go +++ b/ray-operator/test/e2erayjob/rayjob_test.go @@ -15,7 +15,6 @@ import ( rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" "github.com/ray-project/kuberay/ray-operator/controllers/ray" - "github.com/ray-project/kuberay/ray-operator/controllers/ray/common" "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils" rayv1ac "github.com/ray-project/kuberay/ray-operator/pkg/client/applyconfiguration/ray/v1" . "github.com/ray-project/kuberay/ray-operator/test/support" @@ -309,14 +308,7 @@ env_vars: g.Expect(err).NotTo(HaveOccurred()) // Head pod should be recreated for non-sidecar modes. - g.Eventually(func() int { - pods, listErr := test.Client().Core().CoreV1().Pods(rayCluster.Namespace).List( - test.Ctx(), common.RayClusterHeadPodsAssociationOptions(rayCluster).ToMetaV1ListOptions()) - if listErr != nil { - return -1 - } - return len(pods.Items) - }, TestTimeoutMedium, 2*time.Second).Should(Equal(1)) + g.Eventually(HeadPod(test, rayCluster), TestTimeoutMedium, 2*time.Second).ShouldNot(BeNil()) g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium). 
Should(WithTransform(RayJobDeploymentStatus, Equal(rayv1.JobDeploymentStatusFailed))) // Cleanup diff --git a/ray-operator/test/support/ray.go b/ray-operator/test/support/ray.go index 717ffe2fc70..692730c4b6d 100644 --- a/ray-operator/test/support/ray.go +++ b/ray-operator/test/support/ray.go @@ -135,6 +135,12 @@ func HeadPod(t Test, rayCluster *rayv1.RayCluster) func() (*corev1.Pod, error) { } } +func HeadPodOrNil(t Test, rayCluster *rayv1.RayCluster) func() (*corev1.Pod, error) { + return func() (*corev1.Pod, error) { + return GetHeadPodOrNil(t, rayCluster) + } +} + func GetHeadPod(t Test, rayCluster *rayv1.RayCluster) (*corev1.Pod, error) { pods, err := t.Client().Core().CoreV1().Pods(rayCluster.Namespace).List( t.Ctx(), @@ -149,6 +155,23 @@ func GetHeadPod(t Test, rayCluster *rayv1.RayCluster) (*corev1.Pod, error) { return &pods.Items[0], nil } +func GetHeadPodOrNil(t Test, rayCluster *rayv1.RayCluster) (*corev1.Pod, error) { + pods, err := t.Client().Core().CoreV1().Pods(rayCluster.Namespace).List( + t.Ctx(), + common.RayClusterHeadPodsAssociationOptions(rayCluster).ToMetaV1ListOptions(), + ) + if err != nil { + return nil, err + } + if len(pods.Items) == 0 { + return nil, nil + } + if len(pods.Items) != 1 { + return nil, errors.New("number of head pods is not 1") + } + return &pods.Items[0], nil +} + func WorkerPods(t Test, rayCluster *rayv1.RayCluster) func() ([]corev1.Pod, error) { return func() ([]corev1.Pod, error) { return GetWorkerPods(t, rayCluster) From ec01312130b272efe456443f55a2bac8bc1ac96e Mon Sep 17 00:00:00 2001 From: 400Ping Date: Fri, 2 Jan 2026 23:50:57 +0800 Subject: [PATCH 19/34] revert reason assertion Signed-off-by: 400Ping --- ray-operator/test/e2erayjob/rayjob_test.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ray-operator/test/e2erayjob/rayjob_test.go b/ray-operator/test/e2erayjob/rayjob_test.go index 0ff168543ee..471c42c520e 100644 --- a/ray-operator/test/e2erayjob/rayjob_test.go +++ b/ray-operator/test/e2erayjob/rayjob_test.go @@ -311,6 +311,11 @@ env_vars: g.Eventually(HeadPod(test, rayCluster), TestTimeoutMedium, 2*time.Second).ShouldNot(BeNil()) g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium). Should(WithTransform(RayJobDeploymentStatus, Equal(rayv1.JobDeploymentStatusFailed))) + g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium). + Should(WithTransform(RayJobReason, Or( + Equal(rayv1.JobDeploymentStatusTransitionGracePeriodExceeded), + Equal(rayv1.SubmissionFailed), + ))) // Cleanup err = test.Client().Ray().RayV1().RayJobs(namespace.Name).Delete(test.Ctx(), rayJob.Name, metav1.DeleteOptions{}) g.Expect(err).NotTo(HaveOccurred()) From 6e7c738dd009335febd567b7343bd87e44cae987 Mon Sep 17 00:00:00 2001 From: 400Ping Date: Sat, 3 Jan 2026 08:17:03 +0800 Subject: [PATCH 20/34] [chore] retrigger ci From e155b5e40fcdbb5689d4f26292852319eb75bd86 Mon Sep 17 00:00:00 2001 From: 400Ping Date: Sat, 3 Jan 2026 09:29:18 +0800 Subject: [PATCH 21/34] update Signed-off-by: 400Ping --- ray-operator/test/e2erayjob/rayjob_test.go | 1 + 1 file changed, 1 insertion(+) diff --git a/ray-operator/test/e2erayjob/rayjob_test.go b/ray-operator/test/e2erayjob/rayjob_test.go index 471c42c520e..1e0a975bc03 100644 --- a/ray-operator/test/e2erayjob/rayjob_test.go +++ b/ray-operator/test/e2erayjob/rayjob_test.go @@ -313,6 +313,7 @@ env_vars: Should(WithTransform(RayJobDeploymentStatus, Equal(rayv1.JobDeploymentStatusFailed))) g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium). 
Should(WithTransform(RayJobReason, Or( + Equal(rayv1.AppFailed), Equal(rayv1.JobDeploymentStatusTransitionGracePeriodExceeded), Equal(rayv1.SubmissionFailed), ))) From 1f9dbe88f91f324ccbf0e90d84c5d0f43195c1cf Mon Sep 17 00:00:00 2001 From: 400Ping Date: Sat, 3 Jan 2026 16:57:36 +0800 Subject: [PATCH 22/34] [chore] change from HeadPod to GetHeadPod Signed-off-by: 400Ping --- ray-operator/test/e2erayjob/rayjob_test.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ray-operator/test/e2erayjob/rayjob_test.go b/ray-operator/test/e2erayjob/rayjob_test.go index 1e0a975bc03..4240dbf52dc 100644 --- a/ray-operator/test/e2erayjob/rayjob_test.go +++ b/ray-operator/test/e2erayjob/rayjob_test.go @@ -308,7 +308,9 @@ env_vars: g.Expect(err).NotTo(HaveOccurred()) // Head pod should be recreated for non-sidecar modes. - g.Eventually(HeadPod(test, rayCluster), TestTimeoutMedium, 2*time.Second).ShouldNot(BeNil()) + g.Eventually(func() (*corev1.Pod, error) { + return GetHeadPod(test, rayCluster) + }, TestTimeoutMedium, 2*time.Second).ShouldNot(BeNil()) g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium). Should(WithTransform(RayJobDeploymentStatus, Equal(rayv1.JobDeploymentStatusFailed))) g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium). From 883eb7cda26babc4c8e559d28a766a9933b5ce17 Mon Sep 17 00:00:00 2001 From: Future-Outlier Date: Sun, 4 Jan 2026 11:33:29 +0800 Subject: [PATCH 23/34] add submission mode label key checks Signed-off-by: Future-Outlier --- .../test/e2erayjob/rayjob_sidecar_mode_test.go | 12 ++++++++++-- ray-operator/test/e2erayjob/rayjob_test.go | 1 + 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/ray-operator/test/e2erayjob/rayjob_sidecar_mode_test.go b/ray-operator/test/e2erayjob/rayjob_sidecar_mode_test.go index eef5f90e775..fbc574c9668 100644 --- a/ray-operator/test/e2erayjob/rayjob_sidecar_mode_test.go +++ b/ray-operator/test/e2erayjob/rayjob_sidecar_mode_test.go @@ -180,6 +180,7 @@ env_vars: g.Expect(err).NotTo(HaveOccurred()) rayCluster, err := GetRayCluster(test, rayJob.Namespace, rayJob.Status.RayClusterName) g.Expect(err).NotTo(HaveOccurred()) + g.Expect(rayCluster.Labels[utils.RayJobSubmissionModeLabelKey]).To(Equal(string(rayv1.SidecarMode))) g.Expect(rayCluster.Labels[utils.RayJobDisableProvisionedHeadNodeRestartLabelKey]).To(Equal("true")) headPod, err := GetHeadPod(test, rayCluster) g.Expect(err).NotTo(HaveOccurred()) @@ -187,8 +188,15 @@ env_vars: err = test.Client().Core().CoreV1().Pods(headPod.Namespace).Delete(test.Ctx(), headPod.Name, metav1.DeleteOptions{}) g.Expect(err).NotTo(HaveOccurred()) - g.Eventually(HeadPodOrNil(test, rayCluster), TestTimeoutMedium, 2*time.Second).Should(BeNil()) - g.Consistently(HeadPodOrNil(test, rayCluster), TestTimeoutShort, 2*time.Second).Should(BeNil()) + // Head pod should NOT be recreated for sidecar modes. + // 1. use GetHeadPod function, I want to delete GetHeadPodOrNil function, and check the + + g.Eventually(func() (*corev1.Pod, error) { + return GetHeadPodOrNil(test, rayCluster) + }, TestTimeoutMedium, 2*time.Second).Should(BeNil()) + g.Consistently(func() (*corev1.Pod, error) { + return GetHeadPodOrNil(test, rayCluster) + }, TestTimeoutShort, 2*time.Second).Should(BeNil()) // After head pod deletion, controller should mark RayJob as Failed with a specific message g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium).
diff --git a/ray-operator/test/e2erayjob/rayjob_test.go b/ray-operator/test/e2erayjob/rayjob_test.go index 4240dbf52dc..46baf38d71f 100644 --- a/ray-operator/test/e2erayjob/rayjob_test.go +++ b/ray-operator/test/e2erayjob/rayjob_test.go @@ -300,6 +300,7 @@ env_vars: g.Expect(err).NotTo(HaveOccurred()) rayCluster, err := GetRayCluster(test, rayJob.Namespace, rayJob.Status.RayClusterName) g.Expect(err).NotTo(HaveOccurred()) + g.Expect(rayCluster.Labels).To(HaveKeyWithValue(utils.RayJobSubmissionModeLabelKey, string(rayv1.K8sJobMode))) g.Expect(rayCluster.Labels).To(HaveKeyWithValue(utils.RayJobDisableProvisionedHeadNodeRestartLabelKey, "false")) headPod, err := GetHeadPod(test, rayCluster) g.Expect(err).NotTo(HaveOccurred()) From 7246d3369b6827f99887758554132da89c40b000 Mon Sep 17 00:00:00 2001 From: Ping Date: Tue, 6 Jan 2026 13:18:42 +0800 Subject: [PATCH 24/34] Update ray-operator/controllers/ray/utils/constant.go Co-authored-by: Rueian Signed-off-by: Ping --- ray-operator/controllers/ray/utils/constant.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ray-operator/controllers/ray/utils/constant.go b/ray-operator/controllers/ray/utils/constant.go index 7ea876e5087..b0dfa6f474f 100644 --- a/ray-operator/controllers/ray/utils/constant.go +++ b/ray-operator/controllers/ray/utils/constant.go @@ -31,7 +31,7 @@ const ( RayCronJobTimestampAnnotationKey = "ray.io/cronjob-scheduled-timestamp" RayJobSubmissionModeLabelKey = "ray.io/job-submission-mode" // RayJobDisableProvisionedHeadNodeRestartLabelKey marks RayClusters created for sidecar-mode RayJobs to skip head Pod recreation after provisioning. - RayJobDisableProvisionedHeadNodeRestartLabelKey = "ray.io/disable-provisioned-head-restart" + DisableProvisionedHeadRestartAnnotationKey = "ray.io/disable-provisioned-head-restart" // Labels for feature RayMultihostIndexing // From 9a7eaa49a42e8e4f2c2d965236f6aedf44a1eebc Mon Sep 17 00:00:00 2001 From: Ping Date: Tue, 6 Jan 2026 13:19:01 +0800 Subject: [PATCH 25/34] Update ray-operator/controllers/ray/raycluster_controller.go Co-authored-by: Rueian Signed-off-by: Ping --- ray-operator/controllers/ray/raycluster_controller.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ray-operator/controllers/ray/raycluster_controller.go b/ray-operator/controllers/ray/raycluster_controller.go index f43a2aba3f9..f6c8e925232 100644 --- a/ray-operator/controllers/ray/raycluster_controller.go +++ b/ray-operator/controllers/ray/raycluster_controller.go @@ -721,7 +721,7 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv // In this case, the worker Pods will not be killed by the new head Pod when it is created, but the submission ID has already been // used by the old Ray job, so the new Ray job will fail. 
logger.Info( - "reconcilePods: Found 0 head Pods for a sidecar-mode RayJob-managed RayCluster; skipping head creation to let RayJob controller handle the failure", + "reconcilePods: Found 0 head Pods for the RayCluster; Skipped head recreation due to ray.io/disable-provisioned-head-restart", "rayCluster", instance.Name, ) return nil From d02c6a750bca1e10c5ed3c35718034306aae76c4 Mon Sep 17 00:00:00 2001 From: Ping Date: Tue, 6 Jan 2026 13:19:11 +0800 Subject: [PATCH 26/34] Update ray-operator/controllers/ray/raycluster_controller.go Co-authored-by: Rueian Signed-off-by: Ping --- ray-operator/controllers/ray/raycluster_controller.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ray-operator/controllers/ray/raycluster_controller.go b/ray-operator/controllers/ray/raycluster_controller.go index f6c8e925232..111f60d7dc5 100644 --- a/ray-operator/controllers/ray/raycluster_controller.go +++ b/ray-operator/controllers/ray/raycluster_controller.go @@ -1121,7 +1121,7 @@ func (r *RayClusterReconciler) reconcileMultiHostWorkerGroup(ctx context.Context } func shouldSkipHeadPodRestart(instance *rayv1.RayCluster) bool { - return getCreatorCRDType(*instance) == utils.RayJobCRD && instance.Labels[utils.RayJobDisableProvisionedHeadNodeRestartLabelKey] == "true" + return instance.Annotations[utils.DisableProvisionedHeadRestartAnnotationKey] == "true" } // shouldRecreatePodsForUpgrade checks if any pods need to be recreated based on RayClusterSpec changes From f3d94317191b36956618b3fd70adb81899413e24 Mon Sep 17 00:00:00 2001 From: Ping Date: Tue, 6 Jan 2026 13:19:21 +0800 Subject: [PATCH 27/34] Update ray-operator/controllers/ray/rayjob_controller.go Co-authored-by: Rueian Signed-off-by: Ping --- ray-operator/controllers/ray/rayjob_controller.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/ray-operator/controllers/ray/rayjob_controller.go b/ray-operator/controllers/ray/rayjob_controller.go index 9f362159d39..0dcf19ed03a 100644 --- a/ray-operator/controllers/ray/rayjob_controller.go +++ b/ray-operator/controllers/ray/rayjob_controller.go @@ -943,9 +943,7 @@ func (r *RayJobReconciler) constructRayClusterForRayJob(rayJobInstance *rayv1.Ra labels[utils.RayJobSubmissionModeLabelKey] = string(rayJobInstance.Spec.SubmissionMode) if rayJobInstance.Spec.SubmissionMode == rayv1.SidecarMode { - labels[utils.RayJobDisableProvisionedHeadNodeRestartLabelKey] = "true" - } else { - labels[utils.RayJobDisableProvisionedHeadNodeRestartLabelKey] = "false" + annotations[utils.DisableProvisionedHeadRestartAnnotationKey] = "true" } rayCluster := &rayv1.RayCluster{ From a59e48652b9172814b579f6daaac4af86bd986c7 Mon Sep 17 00:00:00 2001 From: Ping Date: Wed, 7 Jan 2026 20:51:27 +0800 Subject: [PATCH 28/34] Update ray-operator/controllers/ray/rayjob_controller.go Co-authored-by: Rueian Signed-off-by: Ping --- ray-operator/controllers/ray/rayjob_controller.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/ray-operator/controllers/ray/rayjob_controller.go b/ray-operator/controllers/ray/rayjob_controller.go index 0dcf19ed03a..ecd1c90c903 100644 --- a/ray-operator/controllers/ray/rayjob_controller.go +++ b/ray-operator/controllers/ray/rayjob_controller.go @@ -940,8 +940,6 @@ func (r *RayJobReconciler) constructRayClusterForRayJob(rayJobInstance *rayv1.Ra maps.Copy(labels, rayJobInstance.Labels) labels[utils.RayOriginatedFromCRNameLabelKey] = rayJobInstance.Name labels[utils.RayOriginatedFromCRDLabelKey] = utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD) - 
-	labels[utils.RayJobSubmissionModeLabelKey] = string(rayJobInstance.Spec.SubmissionMode)
-
 	if rayJobInstance.Spec.SubmissionMode == rayv1.SidecarMode {
 		annotations[utils.DisableProvisionedHeadRestartAnnotationKey] = "true"
 	}

From 729cf0c1309461cf8cc76a4154e3174057aa9c32 Mon Sep 17 00:00:00 2001
From: Ping
Date: Wed, 7 Jan 2026 20:54:18 +0800
Subject: [PATCH 29/34] Update ray-operator/controllers/ray/utils/constant.go

Co-authored-by: Nary Yeh <60069744+machichima@users.noreply.github.com>
Signed-off-by: Ping
---
 ray-operator/controllers/ray/utils/constant.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ray-operator/controllers/ray/utils/constant.go b/ray-operator/controllers/ray/utils/constant.go
index b0dfa6f474f..e41ce1b10ee 100644
--- a/ray-operator/controllers/ray/utils/constant.go
+++ b/ray-operator/controllers/ray/utils/constant.go
@@ -30,7 +30,7 @@ const (
 	RayCronJobNameLabelKey           = "ray.io/cronjob-name"
 	RayCronJobTimestampAnnotationKey = "ray.io/cronjob-scheduled-timestamp"
 	RayJobSubmissionModeLabelKey     = "ray.io/job-submission-mode"
-	// RayJobDisableProvisionedHeadNodeRestartLabelKey marks RayClusters created for sidecar-mode RayJobs to skip head Pod recreation after provisioning.
+	// DisableProvisionedHeadRestartAnnotationKey marks RayClusters created for sidecar-mode RayJobs to skip head Pod recreation after provisioning.
 	DisableProvisionedHeadRestartAnnotationKey = "ray.io/disable-provisioned-head-restart"

 	// Labels for feature RayMultihostIndexing

From 8d07eced7f2f5cd699a0e19635a7a970792df35d Mon Sep 17 00:00:00 2001
From: Ping
Date: Wed, 7 Jan 2026 20:55:28 +0800
Subject: [PATCH 30/34] Update ray-operator/controllers/ray/rayjob_controller.go

Co-authored-by: Rueian
Signed-off-by: Ping
---
 ray-operator/controllers/ray/rayjob_controller.go | 2 --
 1 file changed, 2 deletions(-)

diff --git a/ray-operator/controllers/ray/rayjob_controller.go b/ray-operator/controllers/ray/rayjob_controller.go
index ecd1c90c903..6df24df6e8c 100644
--- a/ray-operator/controllers/ray/rayjob_controller.go
+++ b/ray-operator/controllers/ray/rayjob_controller.go
@@ -938,8 +938,6 @@ func (r *RayJobReconciler) getOrCreateRayClusterInstance(ctx context.Context, ra
 }

 func (r *RayJobReconciler) constructRayClusterForRayJob(rayJobInstance *rayv1.RayJob, rayClusterName string) (*rayv1.RayCluster, error) {
 	labels := make(map[string]string, len(rayJobInstance.Labels))
 	maps.Copy(labels, rayJobInstance.Labels)
-	labels[utils.RayOriginatedFromCRNameLabelKey] = rayJobInstance.Name
-	labels[utils.RayOriginatedFromCRDLabelKey] = utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD)
 	if rayJobInstance.Spec.SubmissionMode == rayv1.SidecarMode {
 		annotations[utils.DisableProvisionedHeadRestartAnnotationKey] = "true"
 	}

From 04caf77bb9dfbb9791a8a6522918d79a58840067 Mon Sep 17 00:00:00 2001
From: 400Ping
Date: Wed, 7 Jan 2026 21:18:35 +0800
Subject: [PATCH 31/34] update

Signed-off-by: 400Ping
---
 .../controllers/ray/rayjob_controller.go           |  8 +++++--
 .../e2erayjob/rayjob_sidecar_mode_test.go          | 18 +++++++--------
 ray-operator/test/e2erayjob/rayjob_test.go         |  2 +-
 ray-operator/test/support/ray.go                   | 23 -------------------
 4 files changed, 16 insertions(+), 35 deletions(-)

diff --git a/ray-operator/controllers/ray/rayjob_controller.go b/ray-operator/controllers/ray/rayjob_controller.go
index 6df24df6e8c..5366a7a275e 100644
--- a/ray-operator/controllers/ray/rayjob_controller.go
+++ b/ray-operator/controllers/ray/rayjob_controller.go
@@ -936,8 +936,12 @@ func (r *RayJobReconciler) getOrCreateRayClusterInstance(ctx context.Context, ra
 }

 func (r *RayJobReconciler) constructRayClusterForRayJob(rayJobInstance *rayv1.RayJob, rayClusterName string) (*rayv1.RayCluster, error) {
-	labels := make(map[string]string, len(rayJobInstance.Labels))
+	labels := make(map[string]string, len(rayJobInstance.Labels)+1)
 	maps.Copy(labels, rayJobInstance.Labels)
+	labels[utils.RayJobSubmissionModeLabelKey] = string(rayJobInstance.Spec.SubmissionMode)
+
+	annotations := make(map[string]string, len(rayJobInstance.Annotations)+1)
+	maps.Copy(annotations, rayJobInstance.Annotations)
 	if rayJobInstance.Spec.SubmissionMode == rayv1.SidecarMode {
 		annotations[utils.DisableProvisionedHeadRestartAnnotationKey] = "true"
 	}
@@ -945,7 +949,7 @@ func (r *RayJobReconciler) constructRayClusterForRayJob(rayJobInstance *rayv1.Ra
 	rayCluster := &rayv1.RayCluster{
 		ObjectMeta: metav1.ObjectMeta{
 			Labels:      labels,
-			Annotations: rayJobInstance.Annotations,
+			Annotations: annotations,
 			Name:        rayClusterName,
 			Namespace:   rayJobInstance.Namespace,
 		},

diff --git a/ray-operator/test/e2erayjob/rayjob_sidecar_mode_test.go b/ray-operator/test/e2erayjob/rayjob_sidecar_mode_test.go
index fbc574c9668..62f6a3c7d33 100644
--- a/ray-operator/test/e2erayjob/rayjob_sidecar_mode_test.go
+++ b/ray-operator/test/e2erayjob/rayjob_sidecar_mode_test.go
@@ -181,7 +181,7 @@ env_vars:
 			rayCluster, err := GetRayCluster(test, rayJob.Namespace, rayJob.Status.RayClusterName)
 			g.Expect(err).NotTo(HaveOccurred())
 			g.Expect(rayCluster.Labels[utils.RayJobSubmissionModeLabelKey]).To(Equal(string(rayv1.SidecarMode)))
-			g.Expect(rayCluster.Labels[utils.RayJobDisableProvisionedHeadNodeRestartLabelKey]).To(Equal("true"))
+			g.Expect(rayCluster.Annotations[utils.DisableProvisionedHeadRestartAnnotationKey]).To(Equal("true"))
 			headPod, err := GetHeadPod(test, rayCluster)
 			g.Expect(err).NotTo(HaveOccurred())
 			LogWithTimestamp(test.T(), "Deleting head Pod %s/%s for RayCluster %s", headPod.Namespace, headPod.Name, rayCluster.Name)
@@ -189,14 +189,14 @@ env_vars:
 			g.Expect(err).NotTo(HaveOccurred())

 			// Head pod should NOT be recreated for sidecar modes.
-			// 1. use GetHeadPod function, I want to delete GetHeadPodOrNil function, and check the
-
-			g.Eventually(func() (*corev1.Pod, error) {
-				return GetHeadPodOrNil(test, rayCluster)
-			}, TestTimeoutMedium, 2*time.Second).Should(BeNil())
-			g.Consistently(func() (*corev1.Pod, error) {
-				return GetHeadPodOrNil(test, rayCluster)
-			}, TestTimeoutShort, 2*time.Second).Should(BeNil())
+			g.Eventually(func() error {
+				_, err := GetHeadPod(test, rayCluster)
+				return err
+			}, TestTimeoutMedium, 2*time.Second).Should(HaveOccurred())
+			g.Consistently(func() error {
+				_, err := GetHeadPod(test, rayCluster)
+				return err
+			}, TestTimeoutShort, 2*time.Second).Should(HaveOccurred())

 			// After head pod deletion, controller should mark RayJob as Failed with a specific message
 			g.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutMedium).
diff --git a/ray-operator/test/e2erayjob/rayjob_test.go b/ray-operator/test/e2erayjob/rayjob_test.go
index 46baf38d71f..26bfebbc924 100644
--- a/ray-operator/test/e2erayjob/rayjob_test.go
+++ b/ray-operator/test/e2erayjob/rayjob_test.go
@@ -301,7 +301,7 @@ env_vars:
 			rayCluster, err := GetRayCluster(test, rayJob.Namespace, rayJob.Status.RayClusterName)
 			g.Expect(err).NotTo(HaveOccurred())
 			g.Expect(rayCluster.Labels).To(HaveKeyWithValue(utils.RayJobSubmissionModeLabelKey, string(rayv1.K8sJobMode)))
-			g.Expect(rayCluster.Labels).To(HaveKeyWithValue(utils.RayJobDisableProvisionedHeadNodeRestartLabelKey, "false"))
+			g.Expect(rayCluster.Annotations[utils.DisableProvisionedHeadRestartAnnotationKey]).To(Equal(""))
 			headPod, err := GetHeadPod(test, rayCluster)
 			g.Expect(err).NotTo(HaveOccurred())
 			LogWithTimestamp(test.T(), "Deleting head Pod %s/%s for RayCluster %s", headPod.Namespace, headPod.Name, rayCluster.Name)

diff --git a/ray-operator/test/support/ray.go b/ray-operator/test/support/ray.go
index 692730c4b6d..717ffe2fc70 100644
--- a/ray-operator/test/support/ray.go
+++ b/ray-operator/test/support/ray.go
@@ -135,12 +135,6 @@ func HeadPod(t Test, rayCluster *rayv1.RayCluster) func() (*corev1.Pod, error) {
 	}
 }

-func HeadPodOrNil(t Test, rayCluster *rayv1.RayCluster) func() (*corev1.Pod, error) {
-	return func() (*corev1.Pod, error) {
-		return GetHeadPodOrNil(t, rayCluster)
-	}
-}
-
 func GetHeadPod(t Test, rayCluster *rayv1.RayCluster) (*corev1.Pod, error) {
 	pods, err := t.Client().Core().CoreV1().Pods(rayCluster.Namespace).List(
 		t.Ctx(),
@@ -155,23 +149,6 @@ func GetHeadPod(t Test, rayCluster *rayv1.RayCluster) (*corev1.Pod, error) {
 	return &pods.Items[0], nil
 }

-func GetHeadPodOrNil(t Test, rayCluster *rayv1.RayCluster) (*corev1.Pod, error) {
-	pods, err := t.Client().Core().CoreV1().Pods(rayCluster.Namespace).List(
-		t.Ctx(),
-		common.RayClusterHeadPodsAssociationOptions(rayCluster).ToMetaV1ListOptions(),
-	)
-	if err != nil {
-		return nil, err
-	}
-	if len(pods.Items) == 0 {
-		return nil, nil
-	}
-	if len(pods.Items) != 1 {
-		return nil, errors.New("number of head pods is not 1")
-	}
-	return &pods.Items[0], nil
-}
-
 func WorkerPods(t Test, rayCluster *rayv1.RayCluster) func() ([]corev1.Pod, error) {
 	return func() ([]corev1.Pod, error) {
 		return GetWorkerPods(t, rayCluster)

From c0d916de5d1d19a4590990e5192aeb996deeb8e6 Mon Sep 17 00:00:00 2001
From: 400Ping
Date: Wed, 7 Jan 2026 21:40:44 +0800
Subject: [PATCH 32/34] Add missing label

Signed-off-by: 400Ping
---
 ray-operator/controllers/ray/rayjob_controller.go | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/ray-operator/controllers/ray/rayjob_controller.go b/ray-operator/controllers/ray/rayjob_controller.go
index 5366a7a275e..42c70330cc5 100644
--- a/ray-operator/controllers/ray/rayjob_controller.go
+++ b/ray-operator/controllers/ray/rayjob_controller.go
@@ -936,8 +936,10 @@ func (r *RayJobReconciler) getOrCreateRayClusterInstance(ctx context.Context, ra
 }

 func (r *RayJobReconciler) constructRayClusterForRayJob(rayJobInstance *rayv1.RayJob, rayClusterName string) (*rayv1.RayCluster, error) {
-	labels := make(map[string]string, len(rayJobInstance.Labels)+1)
+	labels := make(map[string]string, len(rayJobInstance.Labels)+3)
 	maps.Copy(labels, rayJobInstance.Labels)
+	labels[utils.RayOriginatedFromCRNameLabelKey] = rayJobInstance.Name
+	labels[utils.RayOriginatedFromCRDLabelKey] = utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD)
 	labels[utils.RayJobSubmissionModeLabelKey] = string(rayJobInstance.Spec.SubmissionMode)

 	annotations := make(map[string]string, len(rayJobInstance.Annotations)+1)
From 0d43eee51c945dedfacfd86e08e65276969a61b8 Mon Sep 17 00:00:00 2001
From: 400Ping
Date: Mon, 12 Jan 2026 15:56:56 +0800
Subject: [PATCH 33/34] update

Signed-off-by: 400Ping
---
 ray-operator/controllers/ray/rayjob_controller.go       | 4 ++--
 ray-operator/test/e2erayjob/rayjob_sidecar_mode_test.go | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/ray-operator/controllers/ray/rayjob_controller.go b/ray-operator/controllers/ray/rayjob_controller.go
index 42c70330cc5..441af01edd9 100644
--- a/ray-operator/controllers/ray/rayjob_controller.go
+++ b/ray-operator/controllers/ray/rayjob_controller.go
@@ -936,13 +936,13 @@ func (r *RayJobReconciler) getOrCreateRayClusterInstance(ctx context.Context, ra
 }

 func (r *RayJobReconciler) constructRayClusterForRayJob(rayJobInstance *rayv1.RayJob, rayClusterName string) (*rayv1.RayCluster, error) {
-	labels := make(map[string]string, len(rayJobInstance.Labels)+3)
+	labels := make(map[string]string, len(rayJobInstance.Labels))
 	maps.Copy(labels, rayJobInstance.Labels)
 	labels[utils.RayOriginatedFromCRNameLabelKey] = rayJobInstance.Name
 	labels[utils.RayOriginatedFromCRDLabelKey] = utils.RayOriginatedFromCRDLabelValue(utils.RayJobCRD)
 	labels[utils.RayJobSubmissionModeLabelKey] = string(rayJobInstance.Spec.SubmissionMode)

-	annotations := make(map[string]string, len(rayJobInstance.Annotations)+1)
+	annotations := make(map[string]string, len(rayJobInstance.Annotations))
 	maps.Copy(annotations, rayJobInstance.Annotations)
 	if rayJobInstance.Spec.SubmissionMode == rayv1.SidecarMode {
 		annotations[utils.DisableProvisionedHeadRestartAnnotationKey] = "true"

diff --git a/ray-operator/test/e2erayjob/rayjob_sidecar_mode_test.go b/ray-operator/test/e2erayjob/rayjob_sidecar_mode_test.go
index 62f6a3c7d33..125b27660c9 100644
--- a/ray-operator/test/e2erayjob/rayjob_sidecar_mode_test.go
+++ b/ray-operator/test/e2erayjob/rayjob_sidecar_mode_test.go
@@ -180,7 +180,7 @@ env_vars:
 			g.Expect(err).NotTo(HaveOccurred())
 			rayCluster, err := GetRayCluster(test, rayJob.Namespace, rayJob.Status.RayClusterName)
 			g.Expect(err).NotTo(HaveOccurred())
-			g.Expect(rayCluster.Labels[utils.RayJobSubmissionModeLabelKey]).To(Equal(string(rayv1.SidecarMode)))
+			g.Expect(rayCluster.Labels).To(HaveKeyWithValue(utils.RayJobSubmissionModeLabelKey, string(rayv1.SidecarMode)))
 			g.Expect(rayCluster.Annotations[utils.DisableProvisionedHeadRestartAnnotationKey]).To(Equal("true"))
 			headPod, err := GetHeadPod(test, rayCluster)
 			g.Expect(err).NotTo(HaveOccurred())

From 03d10a4b8d1374ffed095dd5682d024910a8b993 Mon Sep 17 00:00:00 2001
From: 400Ping
Date: Mon, 12 Jan 2026 16:01:24 +0800
Subject: [PATCH 34/34] update

Signed-off-by: 400Ping
---
 ray-operator/test/e2erayjob/rayjob_sidecar_mode_test.go | 2 +-
 ray-operator/test/e2erayjob/rayjob_test.go              | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/ray-operator/test/e2erayjob/rayjob_sidecar_mode_test.go b/ray-operator/test/e2erayjob/rayjob_sidecar_mode_test.go
index 125b27660c9..62f6a3c7d33 100644
--- a/ray-operator/test/e2erayjob/rayjob_sidecar_mode_test.go
+++ b/ray-operator/test/e2erayjob/rayjob_sidecar_mode_test.go
@@ -180,7 +180,7 @@ env_vars:
 			g.Expect(err).NotTo(HaveOccurred())
 			rayCluster, err := GetRayCluster(test, rayJob.Namespace, rayJob.Status.RayClusterName)
 			g.Expect(err).NotTo(HaveOccurred())
-			g.Expect(rayCluster.Labels).To(HaveKeyWithValue(utils.RayJobSubmissionModeLabelKey, string(rayv1.SidecarMode)))
+			g.Expect(rayCluster.Labels[utils.RayJobSubmissionModeLabelKey]).To(Equal(string(rayv1.SidecarMode)))
 			g.Expect(rayCluster.Annotations[utils.DisableProvisionedHeadRestartAnnotationKey]).To(Equal("true"))
 			headPod, err := GetHeadPod(test, rayCluster)
 			g.Expect(err).NotTo(HaveOccurred())

diff --git a/ray-operator/test/e2erayjob/rayjob_test.go b/ray-operator/test/e2erayjob/rayjob_test.go
index 26bfebbc924..cb712a24fc7 100644
--- a/ray-operator/test/e2erayjob/rayjob_test.go
+++ b/ray-operator/test/e2erayjob/rayjob_test.go
@@ -300,7 +300,7 @@ env_vars:
 			g.Expect(err).NotTo(HaveOccurred())
 			rayCluster, err := GetRayCluster(test, rayJob.Namespace, rayJob.Status.RayClusterName)
 			g.Expect(err).NotTo(HaveOccurred())
-			g.Expect(rayCluster.Labels).To(HaveKeyWithValue(utils.RayJobSubmissionModeLabelKey, string(rayv1.K8sJobMode)))
+			g.Expect(rayCluster.Labels[utils.RayJobSubmissionModeLabelKey]).To(Equal(string(rayv1.K8sJobMode)))
 			g.Expect(rayCluster.Annotations[utils.DisableProvisionedHeadRestartAnnotationKey]).To(Equal(""))
 			headPod, err := GetHeadPod(test, rayCluster)
 			g.Expect(err).NotTo(HaveOccurred())
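
Note for reviewers following the series end to end: the net effect of patches 24-34 is that the RayJob controller stamps every RayCluster it creates with the ray.io/job-submission-mode label and, for sidecar-mode jobs only, adds the ray.io/disable-provisioned-head-restart: "true" annotation; the RayCluster controller then skips head Pod recreation based solely on that annotation. The Go program below is a minimal, self-contained sketch of that final state, not an excerpt from the repository: the cluster struct and the literal mode strings are simplified stand-ins for the real rayv1.RayCluster type and the rayv1 submission-mode constants.

package main

import "fmt"

// Stand-ins for the constants in ray-operator/controllers/ray/utils/constant.go.
const (
	rayJobSubmissionModeLabelKey               = "ray.io/job-submission-mode"
	disableProvisionedHeadRestartAnnotationKey = "ray.io/disable-provisioned-head-restart"
)

// cluster is a simplified stand-in for rayv1.RayCluster: only the metadata
// fields this logic reads and writes.
type cluster struct {
	Labels      map[string]string
	Annotations map[string]string
}

// tagClusterForRayJob mirrors the final shape of constructRayClusterForRayJob:
// copy the RayJob's labels and annotations, record the submission mode as a
// label, and set the head-restart opt-out annotation for sidecar mode only.
func tagClusterForRayJob(jobLabels, jobAnnotations map[string]string, submissionMode string) cluster {
	labels := make(map[string]string, len(jobLabels))
	for k, v := range jobLabels {
		labels[k] = v
	}
	labels[rayJobSubmissionModeLabelKey] = submissionMode

	annotations := make(map[string]string, len(jobAnnotations))
	for k, v := range jobAnnotations {
		annotations[k] = v
	}
	if submissionMode == "SidecarMode" { // stand-in for rayv1.SidecarMode
		annotations[disableProvisionedHeadRestartAnnotationKey] = "true"
	}
	return cluster{Labels: labels, Annotations: annotations}
}

// shouldSkipHeadPodRestart matches the final predicate in
// raycluster_controller.go: the decision is driven purely by the annotation.
func shouldSkipHeadPodRestart(c cluster) bool {
	return c.Annotations[disableProvisionedHeadRestartAnnotationKey] == "true"
}

func main() {
	sidecar := tagClusterForRayJob(nil, nil, "SidecarMode")
	k8sJob := tagClusterForRayJob(nil, nil, "K8sJobMode")
	fmt.Println(shouldSkipHeadPodRestart(sidecar)) // true: the head Pod is not recreated
	fmt.Println(shouldSkipHeadPodRestart(k8sJob))  // false: the normal recreation path applies
}

Replacing the earlier "true"/"false" label pair with a presence-only annotation keeps the marker out of label selectors and avoids writing a meaningless "false" on clusters that never opt out.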