Merged

Changes from 47 commits

Commits (49)
9f0ca6d
merge latest commit
1pikachu Nov 21, 2025
3899d93
adjust the code
1pikachu Nov 21, 2025
aecef41
add ROCm empty cache
1pikachu Nov 21, 2025
791b0cf
Merge branch 'main' into molly/ut_enabling_xpu
1pikachu Nov 24, 2025
c1be899
pre-commit reformatted
1pikachu Nov 24, 2025
ec82c87
Merge branch 'main' into molly/ut_enabling_xpu
1pikachu Nov 24, 2025
40968f4
Made the code simpler and fixed the import issue
1pikachu Nov 24, 2025
efde53e
Merge branch 'molly/ut_enabling_xpu' of https://github.com/DiweiSun/s…
1pikachu Nov 24, 2025
544f29c
Fix the issue where torch.accelerator doesn't support empty_cache whe…
1pikachu Nov 25, 2025
630c70d
fix parameter command error
1pikachu Nov 25, 2025
32c58b5
Merge branch 'main' into molly/ut_enabling_xpu
1pikachu Nov 25, 2025
4add2ca
fix parameter command error
1pikachu Nov 25, 2025
8c506d9
Merge branch 'main' into molly/ut_enabling_xpu
1pikachu Nov 25, 2025
8540ef3
Merge branch 'main' into molly/ut_enabling_xpu
1pikachu Nov 26, 2025
980079f
Merge branch 'main' into molly/ut_enabling_xpu
1pikachu Nov 26, 2025
27921d5
Merge branch 'main' into molly/ut_enabling_xpu
1pikachu Nov 27, 2025
b1cf749
Merge branch 'main' of https://github.com/sgl-project/sglang into mol…
1pikachu Dec 4, 2025
c65601f
Merge branch 'main' into molly/ut_enabling_xpu
1pikachu Dec 5, 2025
4a36917
remove the global device setting to avoid some unknown errors
1pikachu Dec 8, 2025
77ba062
Avoid falling back to CPU when the device is not found
1pikachu Dec 8, 2025
03649c7
Merge branch 'molly/ut_enabling_xpu' of https://github.com/DiweiSun/s…
1pikachu Dec 8, 2025
8a92401
Merge branch 'molly/ut_enabling_xpu' of https://github.com/DiweiSun/s…
1pikachu Dec 10, 2025
72f0c74
Merge branch 'main' into molly/ut_enabling_xpu
Kangyan-Zhou Dec 10, 2025
e146d0e
add xpu support for ut
gaopengff Dec 11, 2025
e66c545
Merge branch 'main' into molly/ut_enabling_xpu
1pikachu Dec 12, 2025
febe90c
Merge branch 'main' into molly/ut_enabling_xpu
1pikachu Dec 15, 2025
24fe8be
Merge branch 'main' into molly/ut_enabling_xpu
1pikachu Dec 16, 2025
c35d196
Merge branch 'main' into molly/ut_enabling_xpu
1pikachu Dec 18, 2025
5a0aad6
Merge branch 'main' into molly/ut_enabling_xpu
1pikachu Dec 19, 2025
f569fa8
Merge branch 'main' into molly/ut_enabling_xpu
1pikachu Dec 22, 2025
ad855f2
fix lint check issue
1pikachu Dec 22, 2025
9963009
fix conflicts
1pikachu Dec 24, 2025
7aeb46f
Merge branch 'main' into molly/ut_enabling_xpu
1pikachu Dec 26, 2025
bab4582
fix conflict
1pikachu Dec 29, 2025
89a71e3
Merge branch 'main' into molly/ut_enabling_xpu
1pikachu Dec 30, 2025
6d42b84
Merge branch 'main' into molly/ut_enabling_xpu
1pikachu Jan 3, 2026
f62d650
Merge pull request #21 from gaopengff/gaopengf/add_more_xpu_cases
1pikachu Jan 6, 2026
e7d6d53
fix conflict
1pikachu Jan 6, 2026
8df8a69
fix conflict
1pikachu Jan 9, 2026
a07633e
fix conflict
1pikachu Jan 13, 2026
1c62bba
fix conflict
1pikachu Jan 16, 2026
41dfd05
fix conflict
1pikachu Jan 20, 2026
38916b7
Merge branch 'main' into molly/ut_enabling_xpu
1pikachu Jan 21, 2026
f48ee60
Merge branch 'main' into molly/ut_enabling_xpu
1pikachu Jan 22, 2026
5da947b
Merge branch 'main' into molly/ut_enabling_xpu
1pikachu Jan 23, 2026
00ccb81
Merge branch 'main' into molly/ut_enabling_xpu
1pikachu Jan 28, 2026
be731eb
Merge branch 'main' into molly/ut_enabling_xpu
1pikachu Jan 30, 2026
1bd4d2e
fall back to automatic device detection
1pikachu Jan 30, 2026
1e216fb
fix conflict
1pikachu Feb 2, 2026
4 changes: 3 additions & 1 deletion python/sglang/srt/layers/moe/topk.py
@@ -57,6 +57,7 @@
is_cuda,
is_hip,
is_npu,
is_xpu,
)
from sglang.srt.utils.patch_torch import register_fake_if_exists

@@ -69,6 +70,7 @@
_is_hip = is_hip()
_is_cpu = is_cpu()
_is_cpu_amx_available = cpu_has_amx_support()
_is_xpu = is_xpu()
_is_npu = is_npu()
_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip

@@ -85,7 +87,7 @@
except ImportError as e:
pass

if _is_cuda or _is_hip:
if _is_cuda or _is_hip or _is_xpu:
from sgl_kernel import topk_softmax

try:
30 changes: 17 additions & 13 deletions python/sglang/test/runners.py
@@ -32,7 +32,7 @@

from sglang.srt.entrypoints.engine import Engine
from sglang.srt.model_loader.ci_weight_validation import ci_validate_and_clean_hf_cache
from sglang.srt.utils import is_npu, load_image
from sglang.srt.utils import get_device, is_npu, load_image
from sglang.srt.utils.hf_transformers_utils import get_tokenizer
from sglang.test.test_utils import DEFAULT_PORT_FOR_SRT_TEST_RUNNER, calculate_rouge_l

@@ -122,7 +122,7 @@ def _get_sentence_transformer_embedding_model(
modules=[word_embedding_model, pooling_model], truncate_dim=matryoshka_dim
)

return model.cuda()
return model.to(get_device())


@dataclass
@@ -271,18 +271,18 @@ def start_model_process(
torch_dtype=torch_dtype,
trust_remote_code=self.trust_remote_code,
low_cpu_mem_usage=True,
).cuda()
).to(get_device())
elif self.model_type == "embedding":
if "gme-qwen2-vl" in model_path.lower():
self.model = AutoModelForVision2Seq.from_pretrained(
model_path,
torch_dtype=torch_dtype,
trust_remote_code=False,
low_cpu_mem_usage=True,
).cuda()
).to(get_device())
self.processor = AutoProcessor.from_pretrained(model_path)
elif "clip" in model_path.lower():
self.model = AutoModel.from_pretrained(model_path).cuda()
self.model = AutoModel.from_pretrained(model_path).to(get_device())
self.processor = AutoProcessor.from_pretrained(model_path)
else:
self.model = _get_sentence_transformer_embedding_model(
@@ -295,7 +295,7 @@
model_path,
torch_dtype=torch_dtype,
trust_remote_code=self.needs_trust_remote_code(model_path),
).cuda()
).to(get_device())
else:
raise Exception(f"Unrecognized model type {self.model_type}")
self.tokenizer = get_tokenizer(
@@ -338,23 +338,27 @@ def start_model_process(
images=image[0], return_tensors="pt"
)
logits = self.model.get_image_features(
pixel_values=inputs.data["pixel_values"].cuda(),
pixel_values=inputs.data["pixel_values"].to(
get_device()
),
).tolist()
else:
inputs = self.tokenizer(
prompts, padding=True, return_tensors="pt"
)
logits = self.model.get_text_features(
input_ids=inputs.data["input_ids"].cuda(),
attention_mask=inputs.data["attention_mask"].cuda(),
input_ids=inputs.data["input_ids"].to(get_device()),
attention_mask=inputs.data["attention_mask"].to(
get_device()
),
).tolist()
else:
logits = self.model.encode(prompts).tolist()
out_queue.put(ModelOutput(embed_logits=logits))
elif self.model_type == "cross_encoder":
inputs = self.tokenizer(
prompts, padding=True, return_tensors="pt"
).to("cuda")
).to(get_device())
scores = self.model(**inputs).logits
scores = scores.squeeze().tolist()
if not isinstance(scores, list):
@@ -369,7 +373,7 @@
)
conv_tokenized = self.tokenizer(
conv_formatted, return_tensors="pt"
).to("cuda")
).to(get_device())
scores.append(
float(self.model(**conv_tokenized).logits[0][0].item())
)
@@ -426,9 +430,9 @@ def forward_generation_raw(

for i, p in enumerate(prompts):
if isinstance(p, str):
input_ids = tokenizer.encode(p, return_tensors="pt").cuda()
input_ids = tokenizer.encode(p, return_tensors="pt").to(get_device())
else:
input_ids = torch.tensor([p], device="cuda")
input_ids = torch.tensor([p], device=get_device())

if lora_paths is not None and lora_paths[i] is not None:
from peft import PeftModel
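Throughout runners.py the hard-coded .cuda() calls become .to(get_device()). For readers unfamiliar with the helper, here is a stand-in sketch of the selection logic these call sites assume; the real sglang.srt.utils.get_device may differ, and the preference order shown (CUDA/ROCm, then XPU, then CPU) is an assumption for illustration only.

```python
import torch


def get_device_sketch() -> str:
    """Illustrative stand-in for a device-selection helper (not the sglang implementation)."""
    if torch.cuda.is_available():  # covers CUDA, and ROCm builds that expose HIP via torch.cuda
        return "cuda"
    if hasattr(torch, "xpu") and torch.xpu.is_available():  # Intel XPU
        return "xpu"
    return "cpu"


# Usage mirroring the diff: device-agnostic placement instead of hard-coding .cuda()
# model = model.to(get_device_sketch())
```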
45 changes: 40 additions & 5 deletions python/sglang/test/test_utils.py
@@ -37,7 +37,9 @@
from sglang.srt.utils import (
get_bool_env_var,
get_device,
is_cuda,
is_port_available,
is_xpu,
kill_process_tree,
retry,
)
@@ -1474,11 +1476,6 @@ def run_bench_one_batch(model, other_args):
device: Device type ("auto", "cuda", "rocm" or "cpu").
If "auto", will detect available platforms automatically.
"""
# Auto-detect device if needed
Collaborator:

Why is this removed?

Contributor:

The main issue is here:

except (RuntimeError, ImportError) as e:

This change here may not be ideal, but I think we should not fall back to CPU and should raise an error directly.
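A minimal sketch of the strict behavior argued for here: detect an accelerator for --device, but raise instead of silently falling back to CPU when none is found. The function name and error text are illustrative, not the PR's actual helper (the removed auto_config_device lines appear just below).

```python
import torch


def auto_config_device_strict() -> str:
    """Hypothetical strict variant of device auto-detection: never fall back to CPU silently."""
    if torch.cuda.is_available():  # CUDA, or ROCm builds routed through torch.cuda
        return "cuda"
    if hasattr(torch, "xpu") and torch.xpu.is_available():  # Intel XPU
        return "xpu"
    raise RuntimeError(
        "No accelerator (CUDA/ROCm/XPU) detected; refusing to fall back to CPU."
    )
```

Failing loudly here means a misconfigured CI runner surfaces as an error rather than a silently slow CPU run.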


device = auto_config_device()
print(f"Auto-configed device: {device}", flush=True)
other_args += ["--device", str(device)]

command = [
"python3",
@@ -2243,6 +2240,44 @@ def wrapper(self):
return decorator


def get_gpu_count():
if get_device() == "cpu":
gpu_count = 0
else:
gpu_count = torch.accelerator.device_count()
return gpu_count


def empty_gpu_cache():
"""
Unified empty_cache for PyTorch 2.8 (no torch.accelerator)
and PyTorch 2.9+ (where torch.accelerator.empty_cache() exists).
"""
if hasattr(torch, "accelerator") and hasattr(torch.accelerator, "empty_cache"):
return torch.accelerator.empty_cache()

# CUDA
if hasattr(torch, "cuda") and torch.cuda.is_available():
torch.cuda.empty_cache()
Collaborator, on lines 2256 to 2266:

Do we need to have ROCm here?

And also it needs a final else.

return

# XPU (Intel)
if hasattr(torch, "xpu") and torch.xpu.is_available():
torch.xpu.empty_cache()
return

return


def get_gpu_memory_gb():
if is_cuda():
return torch.cuda.device_memory_used() / 1024**3
elif is_xpu():
return torch.xpu.memory_allocated() / 1024**3
else:
return 0


def run_doctests(obj: Callable[..., Any] | ModuleType):
mod = inspect.getmodule(obj)
globals = dict(mod.__dict__)
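On the empty_gpu_cache helper added above, the reviewer asks whether a ROCm branch is needed and notes the chain lacks a final else. A minimal sketch of one way to address both points (illustrative only, not part of this PR): ROCm builds of PyTorch expose the HIP device through the torch.cuda namespace, so the existing CUDA branch already covers ROCm, and the final branch can be an explicit no-op for CPU-only runs.

```python
import logging

import torch

logger = logging.getLogger(__name__)


def empty_gpu_cache_sketch() -> None:
    """Sketch of a cache-release helper with an explicit final branch."""
    # On PyTorch versions that provide it, torch.accelerator.empty_cache() picks the active backend.
    if hasattr(torch, "accelerator") and hasattr(torch.accelerator, "empty_cache"):
        torch.accelerator.empty_cache()
        return

    # CUDA and ROCm: ROCm builds route HIP through torch.cuda, so no separate ROCm branch is needed.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        return

    # Intel XPU.
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        torch.xpu.empty_cache()
        return

    # Final else: nothing to release on CPU-only setups.
    logger.debug("No accelerator cache to empty; running on CPU.")
```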
5 changes: 4 additions & 1 deletion test/manual/test_expert_location_updater.py
@@ -10,6 +10,7 @@
from torch.multiprocessing import Process

from sglang.srt.eplb import expert_location_updater
from sglang.srt.utils import get_device
from sglang.test.test_utils import CustomTestCase, find_available_port
from sglang.utils import is_in_ci

@@ -61,7 +62,7 @@ def test_cpu_slow(self):
def test_gpu(self):
if is_in_ci():
return
self._test_common(device="cuda")
self._test_common(device=get_device())

def _test_common(self, device):
infos = []
@@ -135,6 +136,8 @@ def _run_subprocess(
)
if device == "cuda":
torch.cuda.set_device(f"cuda:{rank}")
if device == "xpu":
torch.xpu.set_device(f"xpu:{rank}")

for info in infos:
_execute_test(info, rank=rank, num_gpus=num_gpus, device=device)
3 changes: 2 additions & 1 deletion test/manual/test_forward_split_prefill.py
@@ -20,6 +20,7 @@
from sglang.srt.sampling.sampling_params import SamplingParams
from sglang.srt.server_args import PortArgs, ServerArgs
from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
from sglang.srt.utils import get_device
from sglang.srt.utils.hf_transformers_utils import get_tokenizer
from sglang.test.test_utils import DEFAULT_SMALL_MODEL_NAME_FOR_TEST, CustomTestCase

@@ -32,7 +33,7 @@ def setUpClass(cls):
"""Set up the test environment once for all tests."""
cls.model_path = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
cls.tp_size = 1
cls.device = "cuda"
cls.device = get_device()

# Initialize server args
cls.server_args = ServerArgs(
12 changes: 7 additions & 5 deletions test/manual/test_get_weights_by_name.py
@@ -3,16 +3,18 @@

import numpy as np
import requests
import torch
from transformers import AutoModelForCausalLM

import sglang as sgl
from sglang.srt.utils import get_device
from sglang.test.test_utils import (
DEFAULT_MODEL_NAME_FOR_TEST,
DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
empty_gpu_cache,
get_gpu_count,
is_in_ci,
popen_launch_server,
)
@@ -32,7 +34,7 @@ class TestGetWeightsByName(CustomTestCase):
def init_hf_model(self, model_name, tie_word_embeddings):
self.hf_model = AutoModelForCausalLM.from_pretrained(
model_name, torch_dtype="bfloat16", tie_word_embeddings=tie_word_embeddings
).to("cuda:0")
).to(get_device())

def init_backend(self, backend, dp, tp, model_name):
self.backend = backend
@@ -61,7 +63,7 @@ def init_backend(self, backend, dp, tp, model_name):
def clean_up(self):
del self.hf_model
gc.collect()
torch.cuda.empty_cache()
empty_gpu_cache()
if self.backend == "Engine":
self.engine.shutdown()
else:
@@ -132,11 +134,11 @@ def test_get_weights_by_name(self):
("Runtime", 1, 1, DEFAULT_SMALL_MODEL_NAME_FOR_TEST),
("Engine", 1, 1, DEFAULT_MODEL_NAME_FOR_TEST),
]
if torch.cuda.device_count() >= 2:
if get_gpu_count() >= 2:
test_suits.append(("Engine", 1, 2, DEFAULT_SMALL_MODEL_NAME_FOR_TEST))
test_suits.append(("Runtime", 2, 1, DEFAULT_MODEL_NAME_FOR_TEST))

if torch.cuda.device_count() >= 4:
if get_gpu_count() >= 4:
test_suits.extend(
[
("Engine", 2, 2, DEFAULT_SMALL_MODEL_NAME_FOR_TEST),
27 changes: 17 additions & 10 deletions test/manual/test_triton_moe_wna16.py
@@ -7,6 +7,7 @@
from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_moe
from sglang.srt.layers.moe.topk import TopKConfig, select_experts
from sglang.srt.server_args import ServerArgs, set_global_server_args_for_scheduler
from sglang.srt.utils import get_device

NUM_EXPERTS = [8, 64]
TOP_KS = [2, 6]
@@ -159,10 +160,10 @@ def test_fused_moe_wn16(
weight_bits: int,
):
print(m, n, k, e, topk, dtype, group_size, has_zp, weight_bits)
a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
score = torch.randn((m, e), device="cuda", dtype=dtype)
a = torch.randn((m, k), device=get_device(), dtype=dtype) / 10
w1 = torch.randn((e, 2 * n, k), device=get_device(), dtype=dtype) / 10
w2 = torch.randn((e, k, n), device=get_device(), dtype=dtype) / 10
score = torch.randn((m, e), device=get_device(), dtype=dtype)

if weight_bits == 4:
pack_factor = 2
@@ -174,16 +175,22 @@
w1_ref = w1.clone()
w2_ref = w2.clone()
w1_qweight = torch.empty(
(e, 2 * n, k // pack_factor), device="cuda", dtype=torch.uint8
(e, 2 * n, k // pack_factor), device=get_device(), dtype=torch.uint8
)
w2_qweight = torch.empty((e, k, n // pack_factor), device="cuda", dtype=torch.uint8)
w1_scales = torch.empty((e, 2 * n, k // group_size), device="cuda", dtype=dtype)
w2_scales = torch.empty((e, k, n // group_size), device="cuda", dtype=dtype)
w2_qweight = torch.empty(
(e, k, n // pack_factor), device=get_device(), dtype=torch.uint8
)
w1_scales = torch.empty(
(e, 2 * n, k // group_size), device=get_device(), dtype=dtype
)
w2_scales = torch.empty((e, k, n // group_size), device=get_device(), dtype=dtype)
w1_qzeros = torch.empty(
(e, 2 * n // pack_factor, k // group_size), device="cuda", dtype=torch.uint8
(e, 2 * n // pack_factor, k // group_size),
device=get_device(),
dtype=torch.uint8,
)
w2_qzeros = torch.empty(
(e, k // pack_factor, n // group_size), device="cuda", dtype=torch.uint8
(e, k // pack_factor, n // group_size), device=get_device(), dtype=torch.uint8
)

for i in range(e * 2):
17 changes: 9 additions & 8 deletions test/registered/attention/test_create_kvindices.py
@@ -4,6 +4,7 @@
import torch

from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton
from sglang.srt.utils import get_device
from sglang.test.ci.ci_register import register_amd_ci, register_cuda_ci
from sglang.test.test_utils import CustomTestCase

@@ -15,30 +16,28 @@
class TestCreateKvIndices(CustomTestCase):
@classmethod
def setUpClass(cls):
if not torch.cuda.is_available():
raise unittest.SkipTest("CUDA is not available")
Contributor: ditto

torch.set_default_device("cuda")
torch.set_default_device(get_device())

def _run_test(self, batch, max_batch, max_context_len):
req_to_token = torch.arange(
max_batch * max_context_len, dtype=torch.int32, device="cuda"
max_batch * max_context_len, dtype=torch.int32, device=get_device()
).reshape((max_batch, max_context_len))
req_pool_indices = torch.tensor(
torch.from_numpy(
np.random.choice(range(max_batch), size=batch, replace=False)
),
dtype=torch.int32,
device="cuda",
device=get_device(),
)
paged_kernel_lens = torch.tensor(
torch.from_numpy(
np.random.choice(range(max_context_len), size=batch, replace=False)
),
dtype=torch.int32,
device="cuda",
device=get_device(),
)

kv_indptr = torch.zeros((batch + 1,), dtype=torch.int32, device="cuda")
kv_indptr = torch.zeros((batch + 1,), dtype=torch.int32, device=get_device())
kv_indptr[1:] = torch.cumsum(paged_kernel_lens, dim=0)

# ref
@@ -53,7 +52,9 @@ def _run_test(self, batch, max_batch, max_context_len):
).contiguous()

# triton
kv_indices_triton = torch.empty(kv_indptr[-1], dtype=torch.int32, device="cuda")
kv_indices_triton = torch.empty(
kv_indptr[-1], dtype=torch.int32, device=get_device()
)
create_flashinfer_kv_indices_triton[(batch,)](
req_to_token,
req_pool_indices,