🧗 Add Ascend NPU support for vLLM server (#3286)

ji-huazhong · qgallouedec · shirinyamani · commit d42a6a4527d2 · 2025-04-16T05:09:05.000Z
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
diff --git a/trl/extras/vllm_client.py b/trl/extras/vllm_client.py
@@ -20,7 +20,7 @@
 import torch
 from torch import nn
 
-from ..import_utils import is_requests_available, is_vllm_available
+from ..import_utils import is_requests_available, is_vllm_ascend_available, is_vllm_available
 
 
 if is_requests_available():
@@ -32,6 +32,9 @@
     from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
     from vllm.distributed.utils import StatelessProcessGroup
 
+    if is_vllm_ascend_available():
+        from vllm_ascend.distributed.device_communicators.pyhccl import PyHcclCommunicator as PyNcclCommunicator
+
 
 logger = logging.getLogger(__name__)
 
@@ -212,7 +215,7 @@ def init_communicator(self):
 
         # Set up the communication group for weight broadcasting
         pg = StatelessProcessGroup.create(host=self.host, port=self.group_port, rank=self.rank, world_size=world_size)
-        self.pynccl_comm = PyNcclCommunicator(pg, device="cuda:0")
+        self.pynccl_comm = PyNcclCommunicator(pg, device=0)
 
     def update_named_param(self, name: str, weights: torch.Tensor):
         """
@@ -231,7 +234,7 @@ def update_named_param(self, name: str, weights: torch.Tensor):
             raise Exception(f"Request failed: {response.status_code}, {response.text}")
 
         # Broadcast the weights to the other processes
-        self.pynccl_comm.broadcast(weights, src=self.rank, stream=torch.cuda.current_stream())
+        self.pynccl_comm.broadcast(weights, src=self.rank)
         self.pynccl_comm.group.barrier()
 
     def update_model_params(self, model: nn.Module):
diff --git a/trl/import_utils.py b/trl/import_utils.py
@@ -37,6 +37,7 @@
 _unsloth_available = _is_package_available("unsloth")
 _uvicorn_available = _is_package_available("uvicorn")
 _vllm_available = _is_package_available("vllm")
+_vllm_ascend_available = _is_package_available("vllm_ascend")
 _joblib_available = _is_package_available("joblib")
 
 
@@ -88,6 +89,10 @@ def is_vllm_available() -> bool:
     return _vllm_available
 
 
+def is_vllm_ascend_available() -> bool:
+    return _vllm_ascend_available
+
+
 def is_joblib_available() -> bool:
     return _joblib_available
 
diff --git a/trl/scripts/vllm_serve.py b/trl/scripts/vllm_serve.py
@@ -22,7 +22,13 @@
 import torch
 
 from trl import TrlParser
-from trl.import_utils import is_fastapi_available, is_pydantic_available, is_uvicorn_available, is_vllm_available
+from trl.import_utils import (
+    is_fastapi_available,
+    is_pydantic_available,
+    is_uvicorn_available,
+    is_vllm_ascend_available,
+    is_vllm_available,
+)
 
 
 if is_fastapi_available():
@@ -44,6 +50,10 @@
     from vllm.distributed.utils import StatelessProcessGroup
     from vllm.sampling_params import GuidedDecodingParams
 
+    if is_vllm_ascend_available():
+        from vllm_ascend.distributed.device_communicators.pyhccl import PyHcclCommunicator as PyNcclCommunicator
+
+
 logger = logging.getLogger(__name__)
 
 # We use CUDA with multiprocessing, so we must use the 'spawn' start method. Otherwise, we will get the following
@@ -114,7 +124,7 @@ def update_named_param(self, name: str, dtype: torch.dtype, shape: Sequence[int]
         weight = torch.empty(shape, dtype=dtype, device=self.device)
 
         # Use NCCL to broadcast the updated weights from the client (src) to all workers.
-        self.pynccl_comm.broadcast(weight, src=self.client_rank, stream=torch.cuda.current_stream())
+        self.pynccl_comm.broadcast(weight, src=self.client_rank)
         self.pynccl_comm.group.barrier()
 
         # Load the received weights into the model.