Skip to content

Commit 864bb41

Browse files
sutaakaropenshift-merge-bot[bot]
authored andcommitted
Don't delete Ray head Pod when ImagePullSecret is provided
1 parent 1945b65 commit 864bb41

File tree

6 files changed

+116
-8
lines changed

6 files changed

+116
-8
lines changed

.github/workflows/e2e_tests.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ jobs:
8989
export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
9090
9191
set -euo pipefail
92-
go test -timeout 60m -v -skip "^Test.*Cpu$" ./test/e2e -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt
92+
go test -timeout 120m -v -skip "^Test.*Cpu$" ./test/e2e -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt
9393
9494
- name: Print CodeFlare operator logs
9595
if: always() && steps.deploy.outcome == 'success'

go.mod

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ require (
1111
github.com/openshift/api v0.0.0-20230823114715-5fdd7511b790
1212
github.com/openshift/client-go v0.0.0-20221019143426-16aed247da5c
1313
github.com/project-codeflare/appwrapper v0.30.0
14-
github.com/project-codeflare/codeflare-common v0.0.0-20241216183607-222395d38924
14+
github.com/project-codeflare/codeflare-common v0.0.0-20250117134355-5748d670cd4a
1515
github.com/ray-project/kuberay/ray-operator v1.2.1
1616
go.uber.org/zap v1.27.0
1717
golang.org/x/exp v0.0.0-20240613232115-7f521ea00fb8

go.sum

+2-2
Original file line numberDiff line numberDiff line change
@@ -226,8 +226,8 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb
226226
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
227227
github.com/project-codeflare/appwrapper v0.30.0 h1:tb9LJ/QmC2xyKdM0oVf+WAz9cKIGt3gllDrRgzySgyo=
228228
github.com/project-codeflare/appwrapper v0.30.0/go.mod h1:7FpO90DLv0BAq4rwZtXKS9aRRfkR9RvXsj3pgYF0HtQ=
229-
github.com/project-codeflare/codeflare-common v0.0.0-20241216183607-222395d38924 h1:jM+gYqn8eGmUoeQLGGYxlJgXZ1gbZgB2UtpKU9z0x9s=
230-
github.com/project-codeflare/codeflare-common v0.0.0-20241216183607-222395d38924/go.mod h1:DPSv5khRiRDFUD43SF8da+MrVQTWmxNhuKJmwSLOyO0=
229+
github.com/project-codeflare/codeflare-common v0.0.0-20250117134355-5748d670cd4a h1:1F5xsxncIL5Bpboup8d5osQ8iWy/hzkCTtGSBZM2tQM=
230+
github.com/project-codeflare/codeflare-common v0.0.0-20250117134355-5748d670cd4a/go.mod h1:DPSv5khRiRDFUD43SF8da+MrVQTWmxNhuKJmwSLOyO0=
231231
github.com/prometheus/client_golang v1.20.4 h1:Tgh3Yr67PaOv/uTqloMsCEdeuFTatm5zIq5+qNN23vI=
232232
github.com/prometheus/client_golang v1.20.4/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE=
233233
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=

pkg/controllers/raycluster_controller.go

+5-2
Original file line numberDiff line numberDiff line change
@@ -219,8 +219,11 @@ func (r *RayClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request)
219219
return ctrl.Result{RequeueAfter: requeueTime}, err
220220
}
221221

222-
if err := r.deleteHeadPodIfMissingImagePullSecrets(ctx, cluster); err != nil {
223-
return ctrl.Result{RequeueAfter: requeueTime}, err
222+
if len(cluster.Spec.HeadGroupSpec.Template.Spec.ImagePullSecrets) == 0 {
223+
// Delete head pod only if user doesn't specify own imagePullSecrets and imagePullSecrets from OAuth ServiceAccount are not present in the head Pod
224+
if err := r.deleteHeadPodIfMissingImagePullSecrets(ctx, cluster); err != nil {
225+
return ctrl.Result{RequeueAfter: requeueTime}, err
226+
}
224227
}
225228

226229
_, err = r.kubeClient.RbacV1().ClusterRoleBindings().Apply(ctx, desiredOAuthClusterRoleBinding(cluster), metav1.ApplyOptions{FieldManager: controllerName, Force: true})

pkg/controllers/raycluster_controller_test.go

+67
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,73 @@ var _ = Describe("RayCluster controller", func() {
192192
}).WithTimeout(time.Second * 10).Should(Satisfy(errors.IsNotFound))
193193
})
194194

195+
It("should not delete the head pod if RayCluster CR provides image pull secrets", func(ctx SpecContext) {
196+
By("creating an instance of the RayCluster CR with imagePullSecret")
197+
rayclusterWithPullSecret := &rayv1.RayCluster{
198+
ObjectMeta: metav1.ObjectMeta{
199+
Name: "pull-secret-cluster",
200+
Namespace: namespaceName,
201+
},
202+
Spec: rayv1.RayClusterSpec{
203+
HeadGroupSpec: rayv1.HeadGroupSpec{
204+
Template: corev1.PodTemplateSpec{
205+
Spec: corev1.PodSpec{
206+
ImagePullSecrets: []corev1.LocalObjectReference{{Name: "custom-pull-secret"}},
207+
Containers: []corev1.Container{},
208+
},
209+
},
210+
RayStartParams: map[string]string{},
211+
},
212+
},
213+
}
214+
_, err := rayClient.RayV1().RayClusters(namespaceName).Create(ctx, rayclusterWithPullSecret, metav1.CreateOptions{})
215+
Expect(err).To(Not(HaveOccurred()))
216+
217+
Eventually(func() (*corev1.ServiceAccount, error) {
218+
return k8sClient.CoreV1().ServiceAccounts(namespaceName).Get(ctx, oauthServiceAccountNameFromCluster(rayclusterWithPullSecret), metav1.GetOptions{})
219+
}).WithTimeout(time.Second * 10).Should(WithTransform(OwnerReferenceKind, Equal("RayCluster")))
220+
221+
headPodName := "head-pod"
222+
headPod := &corev1.Pod{
223+
ObjectMeta: metav1.ObjectMeta{
224+
Name: headPodName,
225+
Namespace: namespaceName,
226+
Labels: map[string]string{
227+
"ray.io/node-type": "head",
228+
"ray.io/cluster": rayclusterWithPullSecret.Name,
229+
},
230+
},
231+
Spec: corev1.PodSpec{
232+
ImagePullSecrets: []corev1.LocalObjectReference{
233+
{Name: "custom-pull-secret"},
234+
},
235+
Containers: []corev1.Container{
236+
{
237+
Name: "head-container",
238+
Image: "busybox",
239+
},
240+
},
241+
},
242+
}
243+
_, err = k8sClient.CoreV1().Pods(namespaceName).Create(ctx, headPod, metav1.CreateOptions{})
244+
Expect(err).To(Not(HaveOccurred()))
245+
246+
Eventually(func() (*corev1.Pod, error) {
247+
return k8sClient.CoreV1().Pods(namespaceName).Get(ctx, headPodName, metav1.GetOptions{})
248+
}).WithTimeout(time.Second * 10).ShouldNot(BeNil())
249+
250+
sa, err := k8sClient.CoreV1().ServiceAccounts(namespaceName).Get(ctx, oauthServiceAccountNameFromCluster(rayclusterWithPullSecret), metav1.GetOptions{})
251+
Expect(err).To(Not(HaveOccurred()))
252+
253+
sa.ImagePullSecrets = append(sa.ImagePullSecrets, corev1.LocalObjectReference{Name: "test-image-pull-secret"})
254+
_, err = k8sClient.CoreV1().ServiceAccounts(namespaceName).Update(ctx, sa, metav1.UpdateOptions{})
255+
Expect(err).To(Not(HaveOccurred()))
256+
257+
Consistently(func() (*corev1.Pod, error) {
258+
return k8sClient.CoreV1().Pods(namespaceName).Get(ctx, headPodName, metav1.GetOptions{})
259+
}).WithTimeout(time.Second * 5).Should(Not(BeNil()))
260+
})
261+
195262
It("should remove CRB when the RayCluster is deleted", func(ctx SpecContext) {
196263
foundRayCluster, err := rayClient.RayV1().RayClusters(namespaceName).Get(ctx, rayClusterName, metav1.GetOptions{})
197264
Expect(err).To(Not(HaveOccurred()))

test/e2e/mnist_rayjob_raycluster_test.go

+40-2
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,44 @@ func runMnistRayJobRayClusterAppWrapper(t *testing.T, accelerator string, number
211211
test.Eventually(AppWrappers(test, namespace), TestTimeoutShort).Should(BeEmpty())
212212
}
213213

214+
// Verifying https://github.com/project-codeflare/codeflare-operator/issues/649
215+
func TestRayClusterImagePullSecret(t *testing.T) {
216+
test := With(t)
217+
218+
// Create a namespace
219+
namespace := test.NewTestNamespace()
220+
221+
// Create Kueue resources
222+
resourceFlavor := CreateKueueResourceFlavor(test, v1beta1.ResourceFlavorSpec{})
223+
defer func() {
224+
_ = test.Client().Kueue().KueueV1beta1().ResourceFlavors().Delete(test.Ctx(), resourceFlavor.Name, metav1.DeleteOptions{})
225+
}()
226+
clusterQueue := createClusterQueue(test, resourceFlavor, 0)
227+
defer func() {
228+
_ = test.Client().Kueue().KueueV1beta1().ClusterQueues().Delete(test.Ctx(), clusterQueue.Name, metav1.DeleteOptions{})
229+
}()
230+
CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name, AsDefaultQueue)
231+
232+
// Create MNIST training script
233+
mnist := constructMNISTConfigMap(test, namespace)
234+
mnist, err := test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), mnist, metav1.CreateOptions{})
235+
test.Expect(err).NotTo(HaveOccurred())
236+
test.T().Logf("Created ConfigMap %s/%s successfully", mnist.Namespace, mnist.Name)
237+
238+
// Create RayCluster with imagePullSecret and assign it to the localqueue
239+
rayCluster := constructRayCluster(test, namespace, mnist, 0)
240+
rayCluster.Spec.HeadGroupSpec.Template.Spec.ImagePullSecrets = []corev1.LocalObjectReference{{Name: "custom-pull-secret"}}
241+
rayCluster, err = test.Client().Ray().RayV1().RayClusters(namespace.Name).Create(test.Ctx(), rayCluster, metav1.CreateOptions{})
242+
test.Expect(err).NotTo(HaveOccurred())
243+
test.T().Logf("Created RayCluster %s/%s successfully", rayCluster.Namespace, rayCluster.Name)
244+
245+
test.T().Logf("Waiting for RayCluster %s/%s to be running", rayCluster.Namespace, rayCluster.Name)
246+
test.Eventually(RayCluster(test, namespace.Name, rayCluster.Name), TestTimeoutMedium).
247+
Should(WithTransform(RayClusterState, Equal(rayv1.Ready)))
248+
}
249+
250+
// Helper functions
251+
214252
func constructMNISTConfigMap(test Test, namespace *corev1.Namespace) *corev1.ConfigMap {
215253
return &corev1.ConfigMap{
216254
TypeMeta: metav1.TypeMeta{
@@ -274,11 +312,11 @@ func constructRayCluster(_ Test, namespace *corev1.Namespace, mnist *corev1.Conf
274312
Resources: corev1.ResourceRequirements{
275313
Requests: corev1.ResourceList{
276314
corev1.ResourceCPU: resource.MustParse("250m"),
277-
corev1.ResourceMemory: resource.MustParse("512Mi"),
315+
corev1.ResourceMemory: resource.MustParse("2G"),
278316
},
279317
Limits: corev1.ResourceList{
280318
corev1.ResourceCPU: resource.MustParse("1"),
281-
corev1.ResourceMemory: resource.MustParse("2G"),
319+
corev1.ResourceMemory: resource.MustParse("4G"),
282320
},
283321
},
284322
},

0 commit comments

Comments
 (0)