RHOAIENG-8098 - ClusterConfiguration should support tolerations

jiripetrlik · jiripetrlik · commit 72401c06f821 · 2025-01-06T14:13:45.000+01:00
diff --git a/src/codeflare_sdk/ray/cluster/build_ray_cluster.py b/src/codeflare_sdk/ray/cluster/build_ray_cluster.py
@@ -16,7 +16,7 @@
     This sub-module exists primarily to be used internally by the Cluster object
     (in the cluster sub-module) for RayCluster/AppWrapper generation.
 """
-from typing import Union, Tuple, Dict
+from typing import List, Union, Tuple, Dict
 from ...common import _kube_api_error_handling
 from ...common.kubernetes_cluster import get_api_client, config_check
 from kubernetes.client.exceptions import ApiException
@@ -40,6 +40,7 @@
     V1PodTemplateSpec,
     V1PodSpec,
     V1LocalObjectReference,
+    V1Toleration
 )
 
 import yaml
@@ -139,7 +140,8 @@ def build_ray_cluster(cluster: "codeflare_sdk.ray.cluster.Cluster"):
                     "resources": head_resources,
                 },
                 "template": {
-                    "spec": get_pod_spec(cluster, [get_head_container_spec(cluster)])
+                    "spec": get_pod_spec(cluster, [get_head_container_spec(cluster)],
+                                         cluster.config.head_tolerations)
                 },
             },
             "workerGroupSpecs": [
@@ -154,7 +156,8 @@ def build_ray_cluster(cluster: "codeflare_sdk.ray.cluster.Cluster"):
                         "resources": worker_resources,
                     },
                     "template": V1PodTemplateSpec(
-                        spec=get_pod_spec(cluster, [get_worker_container_spec(cluster)])
+                        spec=get_pod_spec(cluster, [get_worker_container_spec(cluster)],
+                                          cluster.config.tolerations)
                     ),
                 }
             ],
@@ -243,13 +246,14 @@ def update_image(image) -> str:
     return image
 
 
-def get_pod_spec(cluster: "codeflare_sdk.ray.cluster.Cluster", containers):
+def get_pod_spec(cluster: "codeflare_sdk.ray.cluster.Cluster", containers, tolerations):
     """
     The get_pod_spec() function generates a V1PodSpec for the head/worker containers
     """
     pod_spec = V1PodSpec(
         containers=containers,
         volumes=VOLUMES,
+        tolerations=tolerations
     )
     if cluster.config.image_pull_secrets != []:
         pod_spec.image_pull_secrets = generate_image_pull_secrets(cluster)
diff --git a/src/codeflare_sdk/ray/cluster/config.py b/src/codeflare_sdk/ray/cluster/config.py
@@ -22,6 +22,7 @@
 import warnings
 from dataclasses import dataclass, field, fields
 from typing import Dict, List, Optional, Union, get_args, get_origin
+from kubernetes.client import V1Toleration
 
 dir = pathlib.Path(__file__).parent.parent.resolve()
 
@@ -57,6 +58,8 @@ class ClusterConfiguration:
             The number of GPUs to allocate to the head node. (Deprecated, use head_extended_resource_requests)
         head_extended_resource_requests:
             A dictionary of extended resource requests for the head node. ex: {"nvidia.com/gpu": 1}
+        head_tolerations:
+            List of tolerations for head nodes.
         min_cpus:
             The minimum number of CPUs to allocate to each worker.
         max_cpus:
@@ -69,6 +72,8 @@ class ClusterConfiguration:
             The maximum amount of memory to allocate to each worker.
         num_gpus:
             The number of GPUs to allocate to each worker. (Deprecated, use worker_extended_resource_requests)
+        tolerations:
+            List of tolerations for worker nodes.
         appwrapper:
             A boolean indicating whether to use an AppWrapper.
         envs:
@@ -105,6 +110,7 @@ class ClusterConfiguration:
     head_extended_resource_requests: Dict[str, Union[str, int]] = field(
         default_factory=dict
     )
+    head_tolerations: Optional[List[V1Toleration]] = None  # default needed: follows defaulted dataclass fields
     worker_cpu_requests: Union[int, str] = 1
     worker_cpu_limits: Union[int, str] = 1
     min_cpus: Optional[Union[int, str]] = None  # Deprecating
@@ -115,6 +121,7 @@ class ClusterConfiguration:
     min_memory: Optional[Union[int, str]] = None  # Deprecating
     max_memory: Optional[Union[int, str]] = None  # Deprecating
     num_gpus: Optional[int] = None  # Deprecating
+    tolerations: Optional[List[V1Toleration]] = None  # None leaves the worker pod spec without tolerations
     appwrapper: bool = False
     envs: Dict[str, str] = field(default_factory=dict)
     image: str = ""