Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
60 commits
Select commit Hold shift + click to select a range
805369a
[RayJob] background job info poc
fscnick Oct 28, 2025
73b14b5
[RayJob] add implement some methods
fscnick Oct 28, 2025
4ce2381
[RayJob] encapsulate the worker pool
fscnick Oct 29, 2025
e184e5c
[RayJob] replace concurrency map with lru cache
fscnick Oct 29, 2025
859f6a1
[RayJob] remove cache on stop and config flag
fscnick Oct 30, 2025
03ce0e9
[RayJob] expiry cache cleanup goroutine
fscnick Oct 30, 2025
ac275c2
[RayJob] code and comment minor fix
fscnick Oct 30, 2025
0923ef5
[RayJob] task check contain or not before add
fscnick Nov 1, 2025
9f87da6
[RayJob] remove delete cache from deleteClusterResources and add lock…
fscnick Dec 2, 2025
97ab407
[Helm] add argument for useBackgroundGoroutine
fscnick Dec 2, 2025
929a829
Merge remote-tracking branch 'upstream/master' into feat/background-g…
fscnick Dec 2, 2025
a2a0961
[RayJob] repeated error did not update
fscnick Dec 3, 2025
d2173bb
[RayJob] remove unused function and background goroutine observability
fscnick Dec 9, 2025
50c9b94
[RayJob] cache client support graceful shutdown
fscnick Dec 10, 2025
0bfd41e
Merge remote-tracking branch 'upstream/master' into feat/background-g…
fscnick Dec 10, 2025
1ab70fa
[RayJob] rename useBackgroundGoroutine to asyncJobInfoQuery
fscnick Dec 14, 2025
efb7d17
[RayJob] use ray job info in logger
fscnick Dec 14, 2025
9e71883
[RayJob] remove cacheStorage nil check
fscnick Dec 15, 2025
82085c0
[RayJob] bg goroutine uses operator context instead
fscnick Dec 15, 2025
bdb62b2
[RayJob] bg goroutine handle task queue full
fscnick Dec 15, 2025
75345e6
[RayJob] correct the comment
fscnick Dec 16, 2025
5d471c4
[RayJob] refactor initialize dashboard client for background goroutine
fscnick Dec 16, 2025
745e7a6
[RayJob] worker handle ctx.Done correctly
fscnick Dec 17, 2025
3172fbe
[RayJob] remove unnecessary putting task into queue
fscnick Dec 17, 2025
c3363c3
[RayJob] if queue is full, retry again
fscnick Dec 17, 2025
e114330
Merge remote-tracking branch 'upstream/master' into feat/background-g…
fscnick Dec 17, 2025
1fd0268
[RayJob] make cache immutable to avoid data race
fscnick Dec 19, 2025
dd6e750
[RayJob] remove unused function
fscnick Dec 20, 2025
6ec8372
[RayJob] remove cacheStorage lock
fscnick Dec 20, 2025
cf62d4d
[RayJob] update cache error
fscnick Dec 20, 2025
94729a6
[RayJob] If error on fetching job info, it removes from loop
fscnick Dec 22, 2025
88e6702
[RayJob] task queue is extendable
fscnick Dec 22, 2025
0d5dfe8
[RayJob] change slice to ring buffer
fscnick Dec 23, 2025
10ef540
Merge remote-tracking branch 'upstream/master' into feat/background-g…
fscnick Dec 23, 2025
03bc5b1
[RayJob] rename PutTask to AddTask
fscnick Dec 25, 2025
11db5a3
[RayJob] extendable channel use open source library
fscnick Dec 26, 2025
d2e13db
[RayJob] async job info query use feature gate instead
fscnick Dec 26, 2025
1f90762
[RayJob] add comment for task
fscnick Dec 26, 2025
12b6c40
Merge remote-tracking branch 'upstream/master' into feat/background-g…
fscnick Dec 26, 2025
2765c2a
[RayJob] rename function signature of worker pool init function
fscnick Dec 30, 2025
c3336f2
[RayJob] change ErrAgain error message
fscnick Dec 30, 2025
b7f649a
Merge remote-tracking branch 'upstream/master' into feat/background-g…
fscnick Dec 30, 2025
09b501c
[RayJob] fix lint error
fscnick Dec 30, 2025
5d5bbde
[RayJob] change back to EAGAIN
fscnick Dec 30, 2025
8475d1f
[RayJob] remove queue size from todo comment
fscnick Jan 1, 2026
a52f2a9
[RayJob] rename queue full error
fscnick Jan 1, 2026
4d4ed19
[RayJob] add lock to avoid data race
fscnick Jan 1, 2026
17b75d1
[RayJob] requeue check context has canceled or not
fscnick Jan 7, 2026
67df232
[RayJob] add cluster name on the cache key
fscnick Jan 7, 2026
314df45
[RayJob] check raycluster is nil or not when initializing the dashboa…
fscnick Jan 7, 2026
e3cbe9f
[RayJob] avoid send to a block channel when graceful shutdown
fscnick Jan 7, 2026
f0a3b80
[RayJob] use contain to check the placeholder at the beginning of task
fscnick Jan 7, 2026
e55c2e0
[RayJob] graceful shutdown avoid panic from a nil task
fscnick Jan 8, 2026
ae8bf77
[RayJob] fix channel receive condition
fscnick Jan 8, 2026
5c7a5bb
[RayJob] fix nil rayCluster in dashboard cache client
fscnick Jan 8, 2026
8964c93
[RayJob] remove with name from log for sharing purpose
fscnick Jan 9, 2026
90d2c30
[RayJob] remove checkname to avoid collision
fscnick Jan 10, 2026
7502bc8
[RayJob] add task with blocking send
fscnick Jan 10, 2026
886c07f
[RayJob] remove unused error
fscnick Jan 11, 2026
339adf3
[RayJob] provide raycluster name if it is absent for removing cache
fscnick Jan 11, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion apiserver/pkg/server/ray_job_submission_service_server.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ type RayJobSubmissionServiceServer struct {
// Create RayJobSubmissionServiceServer
func NewRayJobSubmissionServiceServer(clusterServer *ClusterServer, options *RayJobSubmissionServiceServerOptions) *RayJobSubmissionServiceServer {
zl := zerolog.New(os.Stdout).Level(zerolog.DebugLevel)
return &RayJobSubmissionServiceServer{clusterServer: clusterServer, options: options, log: zerologr.New(&zl).WithName("jobsubmissionservice"), dashboardClientFunc: utils.GetRayDashboardClientFunc(nil, false)}
return &RayJobSubmissionServiceServer{clusterServer: clusterServer, options: options, log: zerologr.New(&zl).WithName("jobsubmissionservice"), dashboardClientFunc: utils.GetRayDashboardClientFunc(context.Background(), nil, false)}
}

// Submit Ray job
Expand Down
2 changes: 2 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ require (
github.com/google/uuid v1.6.0 // indirect
github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 // indirect
github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 // indirect
github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect
github.com/inconshreveable/mousetrap v1.1.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de // indirect
Expand All @@ -99,6 +100,7 @@ require (
github.com/prometheus/procfs v0.19.2 // indirect
github.com/robfig/cron/v3 v3.0.1 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
github.com/smallnest/chanx v1.2.0 // indirect
github.com/x448/float16 v0.8.4 // indirect
github.com/xlab/treeprint v1.2.0 // indirect
go.opentelemetry.io/otel v1.39.0 // indirect
Expand Down
4 changes: 4 additions & 0 deletions go.sum

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 4 additions & 2 deletions ray-operator/apis/config/v1alpha1/configuration_types.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package v1alpha1

import (
"context"

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/controller-runtime/pkg/manager"
Expand Down Expand Up @@ -88,8 +90,8 @@ type Configuration struct {
EnableMetrics bool `json:"enableMetrics,omitempty"`
}

func (config Configuration) GetDashboardClient(mgr manager.Manager) func(rayCluster *rayv1.RayCluster, url string) (dashboardclient.RayDashboardClientInterface, error) {
return utils.GetRayDashboardClientFunc(mgr, config.UseKubernetesProxy)
func (config Configuration) GetDashboardClient(ctx context.Context, mgr manager.Manager) func(rayCluster *rayv1.RayCluster, url string) (dashboardclient.RayDashboardClientInterface, error) {
return utils.GetRayDashboardClientFunc(ctx, mgr, config.UseKubernetesProxy)
}

func (config Configuration) GetHttpProxyClient(mgr manager.Manager) func(hostIp, podNamespace, podName string, port int) utils.RayHttpProxyClientInterface {
Expand Down
23 changes: 17 additions & 6 deletions ray-operator/controllers/ray/rayjob_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package ray

import (
"context"
errs "errors"
"fmt"
"maps"
"os"
Expand Down Expand Up @@ -43,11 +44,10 @@ const (
// RayJobReconciler reconciles a RayJob object
type RayJobReconciler struct {
client.Client
Scheme *runtime.Scheme
Recorder record.EventRecorder

dashboardClientFunc func(rayCluster *rayv1.RayCluster, url string) (dashboardclient.RayDashboardClientInterface, error)
Recorder record.EventRecorder
options RayJobReconcilerOptions
Scheme *runtime.Scheme
dashboardClientFunc func(rayCluster *rayv1.RayCluster, url string) (dashboardclient.RayDashboardClientInterface, error)
}

type RayJobReconcilerOptions struct {
Expand All @@ -56,8 +56,8 @@ type RayJobReconcilerOptions struct {
}

// NewRayJobReconciler returns a new reconcile.Reconciler
func NewRayJobReconciler(_ context.Context, mgr manager.Manager, options RayJobReconcilerOptions, provider utils.ClientProvider) *RayJobReconciler {
dashboardClientFunc := provider.GetDashboardClient(mgr)
func NewRayJobReconciler(ctx context.Context, mgr manager.Manager, options RayJobReconcilerOptions, provider utils.ClientProvider) *RayJobReconciler {
dashboardClientFunc := provider.GetDashboardClient(ctx, mgr)
return &RayJobReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Expand Down Expand Up @@ -119,6 +119,13 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request)
rayClusterInstance := &rayv1.RayCluster{}
if err := r.Get(ctx, rayClusterNamespacedName, rayClusterInstance); err != nil {
logger.Error(err, "Failed to get RayCluster")

if features.Enabled(features.AsyncJobInfoQuery) {
// If the RayCluster is already deleted, we provide the name and namespace to the RayClusterInstance
// for the dashboard client to remove cache correctly.
rayClusterInstance.Name = rayClusterNamespacedName.Name
rayClusterInstance.Namespace = rayClusterNamespacedName.Namespace
}
}

rayDashboardClient, err := r.dashboardClientFunc(rayClusterInstance, rayJobInstance.Status.DashboardURL)
Expand Down Expand Up @@ -289,6 +296,10 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request)

jobInfo, err := rayDashboardClient.GetJobInfo(ctx, rayJobInstance.Status.JobId)
if err != nil {
if errs.Is(err, dashboardclient.ErrAgain) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think ErrAgain is a bit ambiguous; can we use NotFoundError or something else?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NotFoundError is probably not a good idea. It sounds like the JobInfo doesn't actually exist in the RayCluster.
I think ErrAgain is good enough, though.

logger.Info("The Ray job info was not ready. Try again next iteration.", "JobId", rayJobInstance.Status.JobId)
return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, nil
}
// If the Ray job was not found, GetJobInfo returns a BadRequest error.
if errors.IsBadRequest(err) {
if rayJobInstance.Spec.SubmissionMode == rayv1.HTTPMode {
Expand Down
4 changes: 2 additions & 2 deletions ray-operator/controllers/ray/rayservice_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,8 @@ type RayServiceReconciler struct {
}

// NewRayServiceReconciler returns a new reconcile.Reconciler
func NewRayServiceReconciler(_ context.Context, mgr manager.Manager, provider utils.ClientProvider) *RayServiceReconciler {
dashboardClientFunc := provider.GetDashboardClient(mgr)
func NewRayServiceReconciler(ctx context.Context, mgr manager.Manager, provider utils.ClientProvider) *RayServiceReconciler {
dashboardClientFunc := provider.GetDashboardClient(ctx, mgr)
httpProxyClientFunc := provider.GetHttpProxyClient(mgr)
return &RayServiceReconciler{
Client: mgr.GetClient(),
Expand Down
3 changes: 2 additions & 1 deletion ray-operator/controllers/ray/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ limitations under the License.
package ray

import (
"context"
"os"
"path/filepath"
"testing"
Expand Down Expand Up @@ -52,7 +53,7 @@ var (

type TestClientProvider struct{}

func (testProvider TestClientProvider) GetDashboardClient(_ manager.Manager) func(rayCluster *rayv1.RayCluster, url string) (dashboardclient.RayDashboardClientInterface, error) {
func (testProvider TestClientProvider) GetDashboardClient(_ context.Context, _ manager.Manager) func(rayCluster *rayv1.RayCluster, url string) (dashboardclient.RayDashboardClientInterface, error) {
return func(_ *rayv1.RayCluster, _ string) (dashboardclient.RayDashboardClientInterface, error) {
return fakeRayDashboardClient, nil
}
Expand Down
Loading
Loading