
Commit ba2021a

[https://nvbugs/5527655][feat] Add NUMA-aware CPU affinity autoconfiguration

Signed-off-by: Dan Hansen <[email protected]>

1 parent ec31363 commit ba2021a

4 files changed: +68 -26 lines changed

tensorrt_llm/executor/base_worker.py

Lines changed: 34 additions & 1 deletion
@@ -2,11 +2,13 @@
 import datetime
 import enum
 import json
+import os
 import weakref
 from pathlib import Path
 from queue import Queue
 from typing import Dict, List, Optional, Tuple, Union

+import psutil
 import torch

 from tensorrt_llm.logger import logger
@@ -19,7 +21,7 @@
 from ..llmapi.llm_args import BaseLlmArgs, PybindMirror
 from ..llmapi.tokenizer import TokenizerBase
 from ..llmapi.tracer import global_tracer
-from ..llmapi.utils import _SyncQueue, logger_debug
+from ..llmapi.utils import _SyncQueue, get_numa_aware_cpu_affinity, logger_debug
 from ..lora_manager import LoraManager
 from ..metrics import RequestEventTiming
 from ..prompt_adapter_manager import PromptAdapterManager
@@ -92,13 +94,44 @@ def __init__(
         if global_mpi_size() > 1:
             logger.set_rank(self.global_rank)

+    def _configure_affinity(self, device_id):
+        '''
+        Probe and configure the CPU affinity of the worker.
+        '''
+
+        # Get the current affinity setting
+        pid = os.getpid()
+        process = psutil.Process(pid)
+        cpu_affinity = process.cpu_affinity()
+
+        all_cpus = list(range(psutil.cpu_count()))
+
+        constrained_affinity = (cpu_affinity != all_cpus)
+
+        # If the process is affined to a constrained set of CPUs, warn the
+        # user so they can confirm that this is intended
+        if constrained_affinity:
+            logger.warning(
+                f"Worker process {pid} is affined to run on the following CPUs: "
+                f"{cpu_affinity} (subset of all logical CPUs). This may harm "
+                "performance if set incorrectly.")
+
+        # If affinity is unconstrained or the user has explicitly requested it,
+        # choose the optimal affinity based upon the NUMA topology
+        if not constrained_affinity or os.environ.get(
+                "TLLM_NUMA_AWARE_WORKER_AFFINITY", "0") == "1":
+            process.cpu_affinity(get_numa_aware_cpu_affinity(device_id))
+
     def _get_comm_ranks_device_id(self):
         device_id = self.global_rank % torch.cuda.device_count()
         torch.cuda.set_device(device_id)
         # Make sure C++ executor would use same devices/ranks as py_executor
         global_rank = global_mpi_rank()
         comm_ranks = mpi_comm().allgather(global_rank)
         device_ids = mpi_comm().allgather(device_id)
+
+        self._configure_affinity(device_id)
+
         return comm_ranks, device_ids

     def setup_engine(self):
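
The new _configure_affinity respects any deliberately constrained CPU mask and only re-pins the worker when the mask is unconstrained, or when the user forces the override with TLLM_NUMA_AWARE_WORKER_AFFINITY=1. A minimal standalone sketch of the same probe-and-override flow, assuming device id 0 (get_numa_aware_cpu_affinity is the helper added in tensorrt_llm/llmapi/utils.py below):

    import os

    import psutil

    from tensorrt_llm.llmapi.utils import get_numa_aware_cpu_affinity

    process = psutil.Process(os.getpid())
    constrained = process.cpu_affinity() != list(range(psutil.cpu_count()))

    # Mirror the worker's logic: honor a deliberate pinning unless overridden
    if not constrained or os.environ.get("TLLM_NUMA_AWARE_WORKER_AFFINITY",
                                         "0") == "1":
        process.cpu_affinity(get_numa_aware_cpu_affinity(0))

    print(process.cpu_affinity())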

tensorrt_llm/executor/ray_gpu_worker.py

Lines changed: 3 additions & 0 deletions
@@ -194,6 +194,9 @@ def _get_comm_ranks_device_id(self):

         torch.distributed.all_gather_object(comm_ranks, global_rank)
         torch.distributed.all_gather_object(device_ids, self.device_id)
+
+        self._configure_affinity(self.device_id)
+
         return comm_ranks, device_ids

     def enqueue_request(self,

tensorrt_llm/executor/worker.py

Lines changed: 1 addition & 11 deletions
@@ -18,8 +18,7 @@
 from ..llmapi.mpi_session import set_mpi_session_cpp
 from ..llmapi.tokenizer import TokenizerBase
 from ..llmapi.tracer import VizTracer, set_global_tracer
-from ..llmapi.utils import (AsyncQueue, ManagedThread, _SyncQueue,
-                            clear_sched_affinity, logger_debug,
+from ..llmapi.utils import (AsyncQueue, ManagedThread, _SyncQueue, logger_debug,
                             print_traceback_on_error)
 from ..sampling_params import BatchedLogitsProcessor
 from .base_worker import BaseWorker
@@ -245,15 +244,6 @@ def worker_main(
     mpi_comm().barrier()
     logger_debug(f"Worker {mpi_rank()} entering worker_main...\n", "green")

-    pid = os.getpid()
-    cpus = os.sched_getaffinity(pid)
-    if cpus:
-        logger.warning(
-            f"Found worker process {pid} was bound to {cpus}, this may harm "
-            "performance.", )
-        logger.warning(f"Will clear the cpu affinity")
-        clear_sched_affinity(pid)
-
     result_queue: Optional[IpcQueue] = None
     result_queues: Optional[List[IpcQueue]] = None
tensorrt_llm/llmapi/utils.py

Lines changed: 30 additions & 14 deletions
@@ -1,9 +1,11 @@
 import asyncio
 import collections
+import ctypes
 import datetime
 import hashlib
 import inspect
 import io
+import math
 import os
 import re
 import sys
@@ -513,24 +515,38 @@ def get(self, timeout=None):
         time.sleep(0.01)


-def set_sched_setaffinity(required_cores: int):
-    ''' Set the CPU affinity of the current process to the required number of
-    cores.
-
-    Known issue: This may race with other processes that also set the affinity.
+def get_numa_aware_cpu_affinity(device_id):
+    '''
+    Given the CUDA device_id, query NVML and return the ideal CPU affinity (a
+    list of CPU ids) based upon the NUMA topology.
     '''
-    cpu_percentages = psutil.cpu_percent(percpu=True)
-    # sort the cores by usage
-    free_cores = sorted(range(len(cpu_percentages)),
-                        key=lambda i: cpu_percentages[i])
+    cpu_count = psutil.cpu_count()
+
+    # Get the number of bits per ulong
+    c_ulong_bits = ctypes.sizeof(ctypes.c_ulong) * 8
+
+    # Determine how large our CPU set array from NVML needs to be
+    cpu_set_size = math.ceil(cpu_count / c_ulong_bits)
+
+    # Initialize NVML
+    import pynvml
+    pynvml.nvmlInit()
+
+    # Get the ideal CPU affinity for this device
+    handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
+    affinity_masks = pynvml.nvmlDeviceGetCpuAffinity(handle, cpu_set_size)

-    pid = os.getpid()
-    os.sched_setaffinity(pid, set(free_cores[:required_cores]))
+    # Convert the CPU masks to a Python list of CPU ids
+    cpu_affinity = []
+    for cpu_id in range(cpu_count):
+        mask_array_index = cpu_id // c_ulong_bits
+        mask_bit_index = cpu_id % c_ulong_bits
+        if affinity_masks[mask_array_index] & (1 << mask_bit_index):
+            cpu_affinity.append(cpu_id)

+    pynvml.nvmlShutdown()

-def clear_sched_affinity(pid: int):
-    ''' Clear the CPU affinity of the current process. '''
-    os.sched_setaffinity(pid, set(range(psutil.cpu_count())))
+    return cpu_affinity


 def generate_api_docs_as_docstring(model: Type[BaseModel],
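
nvmlDeviceGetCpuAffinity returns an array of C ulongs with one bit per logical CPU, which the loop above unpacks into a list of CPU ids. A small worked example of the same unpacking with fabricated masks (the values below are invented for illustration and assume a 64-bit ulong, describing a hypothetical 128-CPU machine where CPUs 0-3 and 64-67 are local to the GPU):

    import ctypes

    # Bits per C ulong (64 on typical Linux x86-64 builds)
    c_ulong_bits = ctypes.sizeof(ctypes.c_ulong) * 8

    # Two hypothetical masks: the low four bits set in each ulong
    affinity_masks = [0b1111, 0b1111]
    cpu_count = len(affinity_masks) * c_ulong_bits

    cpu_affinity = [
        cpu_id for cpu_id in range(cpu_count)
        if affinity_masks[cpu_id // c_ulong_bits] & (1 <<
                                                     (cpu_id % c_ulong_bits))
    ]
    print(cpu_affinity)  # [0, 1, 2, 3, 64, 65, 66, 67] when c_ulong_bits == 64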
