-
Notifications
You must be signed in to change notification settings - Fork 109
Add Parent Queue Quota Checking to Scheduler. #112
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,29 +1,49 @@ | ||
| // Copyright 2025 NVIDIA CORPORATION | ||
| // SPDX-License-Identifier: Apache-2.0 | ||
|
|
||
| // Package capacity_policy implements queue capacity and quota checking functionality | ||
| // for the KAI scheduler. It ensures that jobs do not exceed their queue's resource | ||
| // quotas, both at the direct queue level and parent queue levels. | ||
| package capacity_policy | ||
|
|
||
| import ( | ||
| "fmt" | ||
|
|
||
| "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api" | ||
| "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/common_info" | ||
| "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/node_info" | ||
| "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/pod_info" | ||
| "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/pod_status" | ||
| "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/podgroup_info" | ||
| "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/resource_info" | ||
| "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/constants" | ||
| "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/framework" | ||
| "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/log" | ||
| rs "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/plugins/proportion/resource_share" | ||
| ) | ||
|
|
||
| // capacityCheckFn is a function type that checks if a job's requested resources | ||
| // exceed capacity limits. It returns a SchedulableResult indicating whether the | ||
| // job can be scheduled. | ||
| type capacityCheckFn func(requestedShare rs.ResourceQuantities, job *podgroup_info.PodGroupInfo) *api.SchedulableResult | ||
|
|
||
| // CapacityPolicy implements queue capacity checking and quota enforcement. | ||
| // It tracks queue hierarchies and ensures jobs don't exceed resource quotas | ||
| // at any level in the hierarchy. | ||
| type CapacityPolicy struct { | ||
| queues map[common_info.QueueID]*rs.QueueAttributes | ||
| isInferencePreemptible bool | ||
| } | ||
|
|
||
| // New creates a new CapacityPolicy instance with the given queue attributes | ||
| // and inference preemption configuration. | ||
| func New(queues map[common_info.QueueID]*rs.QueueAttributes, isInferencePreemptible bool) *CapacityPolicy { | ||
| return &CapacityPolicy{queues, isInferencePreemptible} | ||
| } | ||
|
|
||
| // IsJobOverQueueCapacity checks if a job would exceed its queue's capacity | ||
| // when considering all tasks that need to be allocated. This includes both | ||
| // regular capacity limits and non-preemptible quota checks. | ||
| func (cp *CapacityPolicy) IsJobOverQueueCapacity(job *podgroup_info.PodGroupInfo, | ||
| tasksToAllocate []*pod_info.PodInfo) *api.SchedulableResult { | ||
| requiredQuota := getRequiredQuota(tasksToAllocate) | ||
|
|
@@ -36,6 +56,9 @@ func (cp *CapacityPolicy) IsJobOverQueueCapacity(job *podgroup_info.PodGroupInfo | |
| return cp.isJobOverCapacity(requestedShareQuantities, job, checkFns) | ||
| } | ||
|
|
||
| // IsNonPreemptibleJobOverQuota specifically checks if a non-preemptible job | ||
| // would exceed its queue's quota. This is a stricter check than regular | ||
| // capacity checking as non-preemptible jobs have dedicated resource quotas. | ||
| func (cp *CapacityPolicy) IsNonPreemptibleJobOverQuota(job *podgroup_info.PodGroupInfo, | ||
| tasksToAllocate []*pod_info.PodInfo) *api.SchedulableResult { | ||
|
|
||
|
|
@@ -49,6 +72,9 @@ func (cp *CapacityPolicy) IsNonPreemptibleJobOverQuota(job *podgroup_info.PodGro | |
| return cp.isJobOverCapacity(requestedShareQuantities, job, checkFns) | ||
| } | ||
|
|
||
| // IsTaskAllocationOnNodeOverCapacity checks if allocating a specific task | ||
| // to a node would exceed capacity limits. This considers both the node's | ||
| // resources and the queue's capacity constraints. | ||
| func (cp *CapacityPolicy) IsTaskAllocationOnNodeOverCapacity(task *pod_info.PodInfo, job *podgroup_info.PodGroupInfo, | ||
| node *node_info.NodeInfo) *api.SchedulableResult { | ||
| requiredInitQuota := node.GetRequiredInitQuota(task) | ||
|
|
@@ -61,6 +87,8 @@ func (cp *CapacityPolicy) IsTaskAllocationOnNodeOverCapacity(task *pod_info.PodI | |
| return cp.isJobOverCapacity(requestedShare, job, checkFns) | ||
| } | ||
|
|
||
| // isJobOverCapacity is an internal helper that runs a series of capacity | ||
| // check functions to determine if a job exceeds any resource limits. | ||
| func (cp *CapacityPolicy) isJobOverCapacity(requestedShare rs.ResourceQuantities, job *podgroup_info.PodGroupInfo, | ||
| checkFns []capacityCheckFn) *api.SchedulableResult { | ||
| for _, checkFn := range checkFns { | ||
|
|
@@ -74,6 +102,8 @@ func (cp *CapacityPolicy) isJobOverCapacity(requestedShare rs.ResourceQuantities | |
| return Schedulable() | ||
| } | ||
|
|
||
| // getRequiredQuota calculates the total resource requirements for a set of tasks. | ||
| // This includes CPU, Memory, and GPU resources. | ||
| func getRequiredQuota(tasksToAllocate []*pod_info.PodInfo) *podgroup_info.JobRequirement { | ||
| quota := podgroup_info.JobRequirement{} | ||
| for _, pod := range tasksToAllocate { | ||
|
|
@@ -83,3 +113,131 @@ func getRequiredQuota(tasksToAllocate []*pod_info.PodInfo) *podgroup_info.JobReq | |
| } | ||
| return &quota | ||
| } | ||
|
|
||
| // getFirstPendingPod returns the first pod in a job that is in Pending status. | ||
| // This is used to avoid duplicate quota checks for the same job. | ||
| func getFirstPendingPod(job *podgroup_info.PodGroupInfo) *pod_info.PodInfo { | ||
| for _, pod := range job.PodInfos { | ||
| if pod.Status == pod_status.Pending { | ||
| return pod | ||
| } | ||
| } | ||
| return nil | ||
| } | ||
|
|
||
| // OnSessionOpen is called when a new scheduling session begins. It registers | ||
| // the early quota checking function that prevents jobs from being considered | ||
| // for scheduling if they would exceed their parent queues' quotas. | ||
| func (cp *CapacityPolicy) OnSessionOpen(ssn *framework.Session) { | ||
|
||
| // Register early quota checks | ||
| ssn.AddPrePredicateFn(func(task *pod_info.PodInfo, job *podgroup_info.PodGroupInfo) error { | ||
| // Only check for the first pending pod to avoid duplicate checks | ||
| firstPending := getFirstPendingPod(job) | ||
|
||
| if firstPending == nil || task != firstPending { | ||
| return nil | ||
| } | ||
|
|
||
| // Check parent queue quotas | ||
| return cp.checkParentQueueQuotas(job, ssn) | ||
| }) | ||
| } | ||
|
|
||
| // checkParentQueueQuotas verifies that a job's resource requirements don't | ||
| // exceed quotas at any level in its queue hierarchy. This includes: | ||
| // - GPU quota checks | ||
| // - CPU quota checks | ||
| // - Memory quota checks | ||
| // | ||
| // The function traverses up the queue hierarchy starting from the job's | ||
| // immediate parent queue. If any quota would be exceeded, it returns an | ||
| // error with a detailed message. | ||
| // | ||
| // Note: Preemptible jobs (PriorityTrainNumber) are allowed to exceed parent | ||
| // queue quotas, while non-preemptible jobs must strictly adhere to quotas. | ||
| func (cp *CapacityPolicy) checkParentQueueQuotas(job *podgroup_info.PodGroupInfo, ssn *framework.Session) error { | ||
| // Skip quota checks for preemptible jobs | ||
| if job.Priority == constants.PriorityTrainNumber { | ||
|
||
| log.InfraLogger.V(5).Infof("Job: <%v/%v> is preemptible, skipping parent queue quota checks", job.Namespace, job.Name) | ||
| return nil | ||
| } | ||
|
|
||
| // Get queue info for this job | ||
| queue, found := ssn.Queues[job.Queue] | ||
| if !found { | ||
| return nil // Can't check quota without queue info | ||
| } | ||
|
|
||
| // Only check parent queues, not the job's direct queue | ||
| currentQueueID := queue.ParentQueue | ||
|
|
||
| for currentQueueID != "" { | ||
| parentQueue, found := ssn.Queues[currentQueueID] | ||
| if !found { | ||
| break | ||
| } | ||
|
||
|
|
||
| // Calculate job's total resource requirements | ||
| jobResources := resource_info.EmptyResource() | ||
| for _, pod := range job.PodInfos { | ||
| if pod.Status == pod_status.Pending { | ||
| jobResources.AddResourceRequirements(pod.ResReq) | ||
| } | ||
| } | ||
|
|
||
| // Check GPU quota | ||
|
||
| if parentQueue.Resources.GPU.Quota > 0 && jobResources.GPUs() > float64(parentQueue.Resources.GPU.Quota) { | ||
| errorMsg := fmt.Sprintf( | ||
| "parent queue '%s' quota has reached the allowable limit of GPUs. "+ | ||
|
||
| "Limit is %.0f GPUs, workload requested %.0f GPUs", | ||
| parentQueue.Name, | ||
| parentQueue.Resources.GPU.Quota, | ||
| jobResources.GPUs()) | ||
|
|
||
| // Record event | ||
| if firstPod := getFirstPendingPod(job); firstPod != nil { | ||
| log.InfraLogger.Warningf("Queue quota exceeded: %s", errorMsg) | ||
| } | ||
|
|
||
| return fmt.Errorf(errorMsg) | ||
| } | ||
|
|
||
| // Check CPU quota | ||
| if parentQueue.Resources.CPU.Quota > 0 && jobResources.Cpu() > float64(parentQueue.Resources.CPU.Quota) { | ||
| errorMsg := fmt.Sprintf( | ||
| "parent queue '%s' quota has reached the allowable limit of CPU. "+ | ||
| "Limit is %.0f CPU, workload requested %.0f CPU", | ||
| parentQueue.Name, | ||
| parentQueue.Resources.CPU.Quota, | ||
| jobResources.Cpu()) | ||
|
|
||
| // Record event | ||
| if firstPod := getFirstPendingPod(job); firstPod != nil { | ||
| log.InfraLogger.Warningf("Queue quota exceeded: %s", errorMsg) | ||
| } | ||
|
|
||
| return fmt.Errorf(errorMsg) | ||
| } | ||
|
|
||
| // Check Memory quota | ||
| if parentQueue.Resources.Memory.Quota > 0 && jobResources.Memory() > float64(parentQueue.Resources.Memory.Quota) { | ||
| errorMsg := fmt.Sprintf( | ||
| "parent queue '%s' quota has reached the allowable limit of Memory. "+ | ||
| "Limit is %.0f Memory, workload requested %.0f Memory", | ||
| parentQueue.Name, | ||
| parentQueue.Resources.Memory.Quota, | ||
| jobResources.Memory()) | ||
|
|
||
| // Record event | ||
| if firstPod := getFirstPendingPod(job); firstPod != nil { | ||
| log.InfraLogger.Warningf("Queue quota exceeded: %s", errorMsg) | ||
| } | ||
|
|
||
| return fmt.Errorf(errorMsg) | ||
| } | ||
|
|
||
| // Move up the hierarchy | ||
| currentQueueID = parentQueue.ParentQueue | ||
| } | ||
|
|
||
| return nil | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's a style comment, since this is a new project and all, but we don't usually write obvious comments here (this comment doesn't add anything that isn't already stated by the next line).