Skip to content

Commit 3bdecdb

Browse files
author
XLC127
committed
add support for Metax GPU
Signed-off-by: XLC127 <[email protected]>
1 parent edc130a commit 3bdecdb

File tree

7 files changed

+254
-1
lines changed

7 files changed

+254
-1
lines changed

doc/source/ray-core/scheduling/accelerators.rst

Lines changed: 71 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,9 @@ The accelerators natively supported by Ray Core are:
3939
* - Rebellions RBLN
4040
- RBLN
4141
- Experimental, supported by the community
42-
42+
* - METAX GPU
43+
- GPU
44+
- Experimental, supported by the community
4345
Starting Ray nodes with accelerators
4446
------------------------------------
4547

@@ -130,6 +132,15 @@ If you need to, you can :ref:`override <specify-node-resources>` this.
130132
For example, ``RBLN_DEVICES=1,3 ray start --head --resources='{"RBLN": 2}'``
131133
lets Ray only see devices 1 and 3.
132134

135+
.. tab-item:: METAX GPU
136+
:sync: METAX GPU
137+
138+
.. tip::
139+
140+
You can set the ``CUDA_VISIBLE_DEVICES`` environment variable before starting a Ray node
141+
to limit the METAX GPUs that are visible to Ray.
142+
For example, ``CUDA_VISIBLE_DEVICES=1,3 ray start --head --num-gpus=2``
143+
lets Ray only see devices 1 and 3.
133144
.. note::
134145

135146
There's nothing preventing you from specifying a larger number of
@@ -457,6 +468,44 @@ and assign accelerators to the task or actor by setting the corresponding enviro
457468
(rbln_task pid=51830) RBLN IDs: [1]
458469
(rbln_task pid=51830) RBLN_DEVICES: 1
459470

471+
.. tab-item:: METAX GPU
472+
:sync: METAX GPU
473+
474+
.. testcode::
475+
:hide:
476+
477+
ray.shutdown()
478+
479+
.. testcode::
480+
481+
import os
482+
import ray
483+
484+
ray.init(num_gpus=2)
485+
486+
@ray.remote(num_gpus=1)
487+
class GPUActor:
488+
def ping(self):
489+
print("GPU IDs: {}".format(ray.get_runtime_context().get_accelerator_ids()["GPU"]))
490+
print("CUDA_VISIBLE_DEVICES: {}".format(os.environ["CUDA_VISIBLE_DEVICES"]))
491+
492+
@ray.remote(num_gpus=1)
493+
def gpu_task():
494+
print("GPU IDs: {}".format(ray.get_runtime_context().get_accelerator_ids()["GPU"]))
495+
print("CUDA_VISIBLE_DEVICES: {}".format(os.environ["CUDA_VISIBLE_DEVICES"]))
496+
497+
gpu_actor = GPUActor.remote()
498+
ray.get(gpu_actor.ping.remote())
499+
# The actor uses the first GPU so the task uses the second one.
500+
ray.get(gpu_task.remote())
501+
502+
.. testoutput::
503+
:options: +MOCK
504+
505+
(GPUActor pid=52420) GPU IDs: [0]
506+
(GPUActor pid=52420) CUDA_VISIBLE_DEVICES: 0
507+
(gpu_task pid=51830) GPU IDs: [1]
508+
(gpu_task pid=51830) CUDA_VISIBLE_DEVICES: 1
460509

461510
Inside a task or actor, :func:`ray.get_runtime_context().get_accelerator_ids() <ray.runtime_context.RuntimeContext.get_accelerator_ids>` returns a
462511
list of accelerator IDs that are available to the task or actor.
@@ -606,6 +655,27 @@ so multiple tasks and actors can share the same accelerator.
606655

607656
Rebellions RBLN doesn't support fractional resources.
608657

658+
.. tab-item:: METAX GPU
659+
:sync: METAX GPU
660+
661+
.. testcode::
662+
:hide:
663+
664+
ray.shutdown()
665+
666+
.. testcode::
667+
668+
ray.init(num_cpus=4, num_gpus=1)
669+
670+
@ray.remote(num_gpus=0.25)
671+
def f():
672+
import time
673+
674+
time.sleep(1)
675+
676+
# The four tasks created here can execute concurrently
677+
# and share the same GPU.
678+
ray.get([f.remote() for _ in range(4)])
609679

610680
**Note:** It is the user's responsibility to make sure that the individual tasks
611681
don't use more than their share of the accelerator memory.

python/ray/_private/accelerators/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from ray._private.accelerators.amd_gpu import AMDGPUAcceleratorManager
88
from ray._private.accelerators.hpu import HPUAcceleratorManager
99
from ray._private.accelerators.intel_gpu import IntelGPUAcceleratorManager
10+
from ray._private.accelerators.metax_gpu import MetaxGPUAcceleratorManager
1011
from ray._private.accelerators.neuron import NeuronAcceleratorManager
1112
from ray._private.accelerators.npu import NPUAcceleratorManager
1213
from ray._private.accelerators.nvidia_gpu import NvidiaGPUAcceleratorManager
@@ -25,6 +26,7 @@ def get_all_accelerator_managers() -> Set[AcceleratorManager]:
2526
HPUAcceleratorManager,
2627
NPUAcceleratorManager,
2728
RBLNAcceleratorManager,
29+
MetaxGPUAcceleratorManager,
2830
}
2931

3032

@@ -60,6 +62,8 @@ def get_accelerator_manager_for_resource(
6062
resource_name_to_accelerator_manager["GPU"] = AMDGPUAcceleratorManager
6163
elif IntelGPUAcceleratorManager.get_current_node_num_accelerators() > 0:
6264
resource_name_to_accelerator_manager["GPU"] = IntelGPUAcceleratorManager
65+
elif MetaxGPUAcceleratorManager.get_current_node_num_accelerators() > 0:
66+
resource_name_to_accelerator_manager["GPU"] = MetaxGPUAcceleratorManager
6367
else:
6468
resource_name_to_accelerator_manager["GPU"] = NvidiaGPUAcceleratorManager
6569
get_accelerator_manager_for_resource._resource_name_to_accelerator_manager = (
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
import logging
2+
import os
3+
from typing import List, Optional, Tuple
4+
5+
from ray._private.accelerators.accelerator import AcceleratorManager
6+
7+
logger = logging.getLogger(__name__)

# Metax GPUs reuse the CUDA-compatible visibility env var, so existing
# CUDA-style tooling and Ray's GPU scheduling work unchanged.
CUDA_VISIBLE_DEVICES_ENV_VAR = "CUDA_VISIBLE_DEVICES"
# When set, Ray does not rewrite CUDA_VISIBLE_DEVICES for worker processes.
NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES"
11+
12+
13+
class MetaxGPUAcceleratorManager(AcceleratorManager):
    """Accelerator manager for Metax GPUs.

    Metax GPUs are exposed to Ray as the generic ``"GPU"`` resource and
    reuse the CUDA-style ``CUDA_VISIBLE_DEVICES`` environment variable for
    device visibility. Device detection goes through the ``pymxsml``
    management library when it is installed.
    """

    @staticmethod
    def get_resource_name() -> str:
        """Return the Ray resource name Metax GPUs are scheduled under."""
        return "GPU"

    @staticmethod
    def get_visible_accelerator_ids_env_var() -> str:
        """Return the env var that restricts device visibility for a process."""
        return CUDA_VISIBLE_DEVICES_ENV_VAR

    @staticmethod
    def get_current_process_visible_accelerator_ids() -> Optional[List[str]]:
        """Return the device IDs visible to the current process.

        Returns:
            ``None`` if ``CUDA_VISIBLE_DEVICES`` is unset (no restriction),
            an empty list if it is set but names no devices, otherwise the
            list of device ID strings in the order they appear.
        """
        cuda_visible_devices = os.environ.get(
            MetaxGPUAcceleratorManager.get_visible_accelerator_ids_env_var(), None
        )
        if cuda_visible_devices is None:
            return None

        # Both the empty string and the driver's "NoDevFiles" sentinel mean
        # "no devices visible".
        if cuda_visible_devices in ("", "NoDevFiles"):
            return []

        return cuda_visible_devices.split(",")

    @staticmethod
    def get_current_node_num_accelerators() -> int:
        """Return the number of Metax GPUs on this node.

        Returns:
            The device count, or 0 when ``pymxsml`` is missing or the
            management library cannot be initialized.
        """
        try:
            import pymxsml.mxsml_extension as pymxsml

            try:
                pymxsml.mxSmlExInit()
            except pymxsml.MXSMLEXError:
                return 0
            try:
                return pymxsml.mxSmlExDeviceGetCount()
            finally:
                # Always release the management library once init succeeded,
                # even if the device-count query raises.
                pymxsml.mxSmlExShutdown()
        except Exception as e:
            logger.debug("Could not import pymxsml: %s", e)
            return 0

    @staticmethod
    def get_current_node_accelerator_type() -> Optional[str]:
        """Return the model name of the first Metax GPU on this node.

        Returns:
            The device name string (e.g. ``"MXC500"``), or ``None`` when no
            device is present or detection fails.
        """
        try:
            import pymxsml.mxsml_extension as pymxsml

            try:
                pymxsml.mxSmlExInit()
            except pymxsml.MXSMLEXError:
                return None
            try:
                device_name = None
                if pymxsml.mxSmlExDeviceGetCount() > 0:
                    handle = pymxsml.mxSmlExDeviceGetHandleByIndex(0)
                    device_name = pymxsml.mxSmlExDeviceGetName(handle)
                    # Some driver versions return bytes; normalize to str.
                    if isinstance(device_name, bytes):
                        device_name = device_name.decode("utf-8")
                return device_name
            finally:
                # Shut down even if a query above raised, so the library is
                # not left initialized.
                pymxsml.mxSmlExShutdown()
        except Exception:
            logger.exception("Failed to detect GPU type.")
            return None

    @staticmethod
    def validate_resource_request_quantity(
        quantity: float,
    ) -> Tuple[bool, Optional[str]]:
        """Metax GPUs support fractional requests; any quantity is valid."""
        return (True, None)

    @staticmethod
    def set_current_process_visible_accelerator_ids(
        visible_cuda_devices: List[str],
    ) -> None:
        """Export ``CUDA_VISIBLE_DEVICES`` for the current process.

        Args:
            visible_cuda_devices: Device IDs this process may use.

        No-op when ``RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES`` is set,
        so users can opt out of Ray managing device visibility.
        """
        if os.environ.get(NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR):
            return

        os.environ[
            MetaxGPUAcceleratorManager.get_visible_accelerator_ids_env_var()
        ] = ",".join(str(i) for i in visible_cuda_devices)

python/ray/tests/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -526,6 +526,7 @@ py_test_module_list(
526526
"accelerators/test_accelerators.py",
527527
"accelerators/test_amd_gpu.py",
528528
"accelerators/test_intel_gpu.py",
529+
"accelerators/test_metax_gpu.py",
529530
"accelerators/test_npu.py",
530531
"accelerators/test_nvidia_gpu.py",
531532
"accelerators/test_rbln.py",
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
import os
2+
import sys
3+
from unittest.mock import patch
4+
5+
import pytest
6+
7+
import ray
8+
from ray._private.accelerators import (
9+
MetaxGPUAcceleratorManager,
10+
get_accelerator_manager_for_resource,
11+
)
12+
13+
14+
@patch(
    "ray._private.accelerators.MetaxGPUAcceleratorManager.get_current_node_num_accelerators",
    return_value=4,
)
def test_visible_metax_gpu_ids(mock_get_num_accelerators, monkeypatch, shutdown_only):
    """Ray should only count the Metax GPUs named in CUDA_VISIBLE_DEVICES."""
    monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "0,1,2")
    # Drop the cached resource-name -> manager mapping so detection reruns
    # with the patched device count.
    del get_accelerator_manager_for_resource._resource_name_to_accelerator_manager
    ray.init()
    mock_get_num_accelerators.assert_called()
    # The node reports 4 devices, but only 3 are visible to the process.
    gpu_total = ray.available_resources()["GPU"]
    assert gpu_total == 3
24+
25+
26+
def test_metax_gpu_type(shutdown_only):
    # Patch detection so the test runs on machines without Metax hardware.
    with patch(
        "ray._private.accelerators.MetaxGPUAcceleratorManager.get_current_node_accelerator_type",
        return_value="MXC500",
    ):
        from ray.util import accelerators

        ray.init()
        result = MetaxGPUAcceleratorManager.get_current_node_accelerator_type()
        # NOTE(review): get_current_node_accelerator_type is patched above, so
        # this effectively checks that the METAX_C500 constant equals the
        # driver-reported string "MXC500" — confirm that is the intent rather
        # than exercising real detection.
        assert result == accelerators.METAX_C500
36+
37+
38+
def test_get_current_process_visible_accelerator_ids(monkeypatch):
    """get_current_process_visible_accelerator_ids parses CUDA_VISIBLE_DEVICES."""
    # A single visible device.
    monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "0")
    assert MetaxGPUAcceleratorManager.get_current_process_visible_accelerator_ids() == [
        "0"
    ]

    # Multiple non-contiguous device IDs keep their order.
    monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "0,4,7")
    assert MetaxGPUAcceleratorManager.get_current_process_visible_accelerator_ids() == [
        "0",
        "4",
        "7",
    ]

    # Set-but-empty means "no devices visible".
    monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "")
    assert (
        MetaxGPUAcceleratorManager.get_current_process_visible_accelerator_ids() == []
    )

    # Unset means "no restriction". Use monkeypatch.delenv instead of a raw
    # `del os.environ[...]` so the fixture restores the variable afterwards
    # and this test cannot leak env-state into other tests.
    monkeypatch.delenv("CUDA_VISIBLE_DEVICES")
    assert (
        MetaxGPUAcceleratorManager.get_current_process_visible_accelerator_ids() is None
    )
60+
61+
62+
def test_set_current_process_visible_accelerator_ids():
    """set_current_process_visible_accelerator_ids writes CUDA_VISIBLE_DEVICES."""
    # Each call overwrites the variable with the comma-joined ID list.
    for device_ids in (["0"], ["0", "1"], ["0", "1", "7"]):
        MetaxGPUAcceleratorManager.set_current_process_visible_accelerator_ids(
            device_ids
        )
        assert os.environ["CUDA_VISIBLE_DEVICES"] == ",".join(device_ids)
73+
74+
75+
if __name__ == "__main__":
    # Under PARALLEL_CI, run tests in parallel, each in its own subprocess.
    pytest_args = (
        ["-n", "auto", "--boxed", "-vs", __file__]
        if os.environ.get("PARALLEL_CI")
        else ["-sv", __file__]
    )
    sys.exit(pytest.main(pytest_args))

python/ray/util/accelerators/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
INTEL_GAUDI,
1818
INTEL_MAX_1100,
1919
INTEL_MAX_1550,
20+
METAX_C500,
21+
METAX_C550,
2022
NVIDIA_A100,
2123
NVIDIA_H100,
2224
NVIDIA_L4,
@@ -60,6 +62,8 @@
6062
"GOOGLE_TPU_V5P",
6163
"GOOGLE_TPU_V5LITEPOD",
6264
"GOOGLE_TPU_V6E",
65+
"METAX_C500",
66+
"METAX_C550",
6367
# Deprecated
6468
"NVIDIA_TESLA_A100",
6569
]

python/ray/util/accelerators/accelerators.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@
3636
GOOGLE_TPU_V6E = "TPU-V6E"
3737
HUAWEI_NPU_910B = "Ascend910B"
3838
HUAWEI_NPU_910B4 = "Ascend910B4"
39+
METAX_C500 = "MXC500"
40+
METAX_C550 = "MXC550"
3941

4042
# Use these instead of NVIDIA_A100 if you need a specific accelerator size. Note that
4143
# these labels are not auto-added to nodes, you'll have to add them manually in

0 commit comments

Comments
 (0)