diff --git a/python/paddle/utils/cpp_extension/__init__.py b/python/paddle/utils/cpp_extension/__init__.py index 267323cf39273d..4229ae482e1444 100644 --- a/python/paddle/utils/cpp_extension/__init__.py +++ b/python/paddle/utils/cpp_extension/__init__.py @@ -19,13 +19,13 @@ CppExtension, CUDAExtension, _compute_worker_number, # noqa: F401 - _get_cuda_arch_flags, # noqa: F401 _get_num_workers, # noqa: F401 _get_pybind11_abi_build_flags, # noqa: F401 load, setup, ) from .extension_utils import ( + _get_cuda_arch_flags, # noqa: F401 get_build_directory, load_op_meta_info_and_register_op, # noqa: F401 parse_op_info, # noqa: F401 diff --git a/python/paddle/utils/cpp_extension/cpp_extension.py b/python/paddle/utils/cpp_extension/cpp_extension.py index 7dfdaea3580db0..3de67dc81a49eb 100644 --- a/python/paddle/utils/cpp_extension/cpp_extension.py +++ b/python/paddle/utils/cpp_extension/cpp_extension.py @@ -21,8 +21,6 @@ import concurrent import functools import re -import warnings -import collections import setuptools import sys import paddle @@ -1478,116 +1476,6 @@ def load( return custom_op_api -def _get_cuda_arch_flags(cflags: list[str] | None = None) -> list[str]: - """ - Determine CUDA arch flags to use. - - For an arch, say "6.1", the added compile flag will be - ``-gencode=arch=compute_61,code=sm_61``. - For an added "+PTX", an additional - ``-gencode=arch=compute_xx,code=compute_xx`` is added. 
- """ - # If cflags is given, there may already be user-provided arch flags in it - if cflags is not None: - for flag in cflags: - if any(x in flag for x in ['PADDLE_EXTENSION_NAME']): - continue - if 'arch' in flag: - return [] - - named_arches = collections.OrderedDict( - [ - ('Pascal', '6.0;6.1+PTX'), - ('Volta+Tegra', '7.2'), - ('Volta', '7.0+PTX'), - ('Turing', '7.5+PTX'), - ('Ampere+Tegra', '8.7'), - ('Ampere', '8.0;8.6+PTX'), - ('Ada', '8.9+PTX'), - ('Hopper', '9.0+PTX'), - ('Blackwell+Tegra', '10.1'), - ('Blackwell', '10.0;12.0+PTX'), - ] - ) - - supported_arches = [ - '6.0', - '6.1', - '6.2', - '7.0', - '7.2', - '7.5', - '8.0', - '8.6', - '8.7', - '8.9', - '9.0', - '9.0a', - '10.0', - '10.0a', - '10.1', - '10.1a', - '12.0', - '12.0a', - ] - valid_arch_strings = supported_arches + [ - s + "+PTX" for s in supported_arches - ] - - _arch_list = os.environ.get("PADDLE_CUDA_ARCH_LIST") - - if not _arch_list: - warnings.warn( - "PADDLE_CUDA_ARCH_LIST are not set, all archs for visible cards are included for compilation. \n" - "If this is not desired, please set os.environ['PADDLE_CUDA_ARCH_LIST']." 
- ) - arch_list = [] - dev_types = core.get_all_custom_device_type() - if core.is_compiled_with_cuda(): - for dev_id in range(paddle.device.cuda.device_count()): - capability = paddle.device.cuda.get_device_capability( - dev_id - ) # (major, minor) - arch = f"{capability[0]}.{capability[1]}" - if arch not in arch_list: - arch_list.append(arch) - arch_list = sorted(arch_list) - if arch_list: - arch_list[-1] += '+PTX' - elif dev_types and core.is_compiled_with_custom_device(dev_types[0]): - for dev_id in range(paddle.device.device_count()): - capability = paddle.device.get_device_capability( - dev_types[0], dev_id - ) - arch = f"{capability[0]}.{capability[1]}" - if arch not in arch_list: - arch_list.append(arch) - arch_list = sorted(arch_list) - if arch_list: - arch_list[-1] += '+PTX' - else: - raise RuntimeError( - "Paddle is not compiled with CUDA or Custom Device, cannot determine CUDA arch." - ) - else: - _arch_list = _arch_list.replace(' ', ';') - for named_arch, archival in named_arches.items(): - _arch_list = _arch_list.replace(named_arch, archival) - arch_list = _arch_list.split(';') - - flags = [] - for arch in arch_list: - if arch not in valid_arch_strings: - raise ValueError(f"Unknown CUDA arch ({arch}) or GPU not supported") - version = arch.split('+')[0] - major, minor = version.split('.') - num = f"{major}{minor}" - flags.append(f"-gencode=arch=compute_{num},code=sm_{num}") - if arch.endswith('+PTX'): - flags.append(f"-gencode=arch=compute_{num},code=compute_{num}") - return sorted(set(flags)) - - def _get_pybind11_abi_build_flags(): abi_cflags = [] for pname in ["COMPILER_TYPE", "STDLIB", "BUILD_ABI"]: diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py index 399db358d9a391..61e5b9a66630c4 100644 --- a/python/paddle/utils/cpp_extension/extension_utils.py +++ b/python/paddle/utils/cpp_extension/extension_utils.py @@ -16,7 +16,6 @@ import atexit import collections -import copy import 
def _get_cuda_arch_flags(cflags: list[str] | None = None) -> list[str]:
    """
    Determine the CUDA arch (``-gencode``) flags to pass to nvcc.

    For an arch, say "6.1", the added compile flag will be
    ``-gencode=arch=compute_61,code=sm_61``.
    For an added "+PTX", an additional
    ``-gencode=arch=compute_xx,code=compute_xx`` is added.

    The arch list is read from the ``PADDLE_CUDA_ARCH_LIST`` environment
    variable. Entries may be separated by ``;``, ``,`` or whitespace and are
    either version strings ("8.0", "9.0a", "8.6+PTX") or one of the
    architecture names listed in ``named_arches`` below ("Ampere", ...).
    When the variable is unset, empty, or contains only separators, the
    compute capability of every visible device is used instead and "+PTX"
    is attached to the highest one.

    Args:
        cflags: Compile flags already chosen by the caller. If any of them
            (other than the ``PADDLE_EXTENSION_NAME`` define) contains the
            substring "arch", the caller is assumed to have selected the
            architectures already and no flags are added.

    Returns:
        Sorted, de-duplicated list of ``-gencode=...`` flags. Empty when
        ``cflags`` already carries an arch flag or no device is visible.

    Raises:
        ValueError: An entry is not one of the supported architectures.
        RuntimeError: ``PADDLE_CUDA_ARCH_LIST`` is not set and Paddle is
            compiled with neither CUDA nor a custom device.
    """
    # If cflags is given, there may already be user-provided arch flags in it
    if cflags is not None:
        for flag in cflags:
            # The extension-name define can contain "arch" by coincidence.
            if 'PADDLE_EXTENSION_NAME' in flag:
                continue
            if 'arch' in flag:
                return []

    # Order matters: "<Name>+Tegra" has to be substituted before "<Name>",
    # otherwise the shorter name would match inside the longer one.
    named_arches = (
        ('Pascal', '6.0;6.1+PTX'),
        ('Volta+Tegra', '7.2'),
        ('Volta', '7.0+PTX'),
        ('Turing', '7.5+PTX'),
        ('Ampere+Tegra', '8.7'),
        ('Ampere', '8.0;8.6+PTX'),
        ('Ada', '8.9+PTX'),
        ('Hopper', '9.0+PTX'),
        ('Blackwell+Tegra', '10.1'),
        ('Blackwell', '10.0;12.0+PTX'),
    )

    supported_arches = (
        '6.0',
        '6.1',
        '6.2',
        '7.0',
        '7.2',
        '7.5',
        '8.0',
        '8.6',
        '8.7',
        '8.9',
        '9.0',
        '9.0a',
        '10.0',
        '10.0a',
        '10.1',
        '10.1a',
        '12.0',
        '12.0a',
    )
    valid_arch_strings = set(supported_arches) | {
        s + "+PTX" for s in supported_arches
    }

    _arch_list = os.environ.get("PADDLE_CUDA_ARCH_LIST") or ""
    for named_arch, archival in named_arches:
        _arch_list = _arch_list.replace(named_arch, archival)
    # Accept ';', ',' and any whitespace as separators. str.split() without
    # arguments also drops the empty tokens left by trailing or doubled
    # separators, which would otherwise be reported as an unknown arch "()".
    arch_list = _arch_list.replace(';', ' ').replace(',', ' ').split()

    if not arch_list:
        warnings.warn(
            "PADDLE_CUDA_ARCH_LIST are not set, all archs for visible cards are included for compilation. \n"
            "If this is not desired, please set os.environ['PADDLE_CUDA_ARCH_LIST']."
        )
        dev_types = core.get_all_custom_device_type()
        if core.is_compiled_with_cuda():
            capabilities = [
                paddle.device.cuda.get_device_capability(dev_id)
                for dev_id in range(paddle.device.cuda.device_count())
            ]
        elif dev_types and core.is_compiled_with_custom_device(dev_types[0]):
            capabilities = [
                paddle.device.get_device_capability(dev_types[0], dev_id)
                for dev_id in range(paddle.device.device_count())
            ]
        else:
            raise RuntimeError(
                "Paddle is not compiled with CUDA or Custom Device, cannot determine CUDA arch."
            )
        arch_list = _format_detected_arches(capabilities)

    flags = []
    for arch in arch_list:
        if arch not in valid_arch_strings:
            raise ValueError(f"Unknown CUDA arch ({arch}) or GPU not supported")
        version = arch.split('+')[0]
        major, minor = version.split('.')
        # A trailing "a" (e.g. "9.0a") stays part of the number: "90a".
        num = f"{major}{minor}"
        flags.append(f"-gencode=arch=compute_{num},code=sm_{num}")
        if arch.endswith('+PTX'):
            flags.append(f"-gencode=arch=compute_{num},code=compute_{num}")
    return sorted(set(flags))


def _format_detected_arches(capabilities):
    """Turn device capabilities into arch strings, highest one with "+PTX".

    Args:
        capabilities: Iterable of indexable ``(major, minor)`` pairs, one per
            device (assumed to be integers, as returned by the
            ``get_device_capability`` calls above -- TODO confirm for custom
            devices).

    Returns:
        De-duplicated ``"major.minor"`` strings in ascending numeric order,
        with "+PTX" appended to the last (highest) entry; empty if no
        capabilities were given.
    """
    # Sort the numeric pairs, not the formatted strings: as strings "10.0"
    # sorts before "8.0", which would attach "+PTX" to the wrong arch.
    unique_caps = sorted({(cap[0], cap[1]) for cap in capabilities})
    arches = [f"{major}.{minor}" for major, minor in unique_caps]
    if arches:
        arches[-1] += '+PTX'
    return arches
list(extra_compile_args.get("nvcc", [])) - sms = [] - for s in arch_list: - sm = [int(ss) for ss in s.split(".") if ss] - assert len(sm) in [1, 2], f"invalid sm format: {s}" - if len(sm) == 2: - sm = sm[0] * 10 + sm[1] - else: - sm = sm[0] - sms.append(sm) - - sms = sorted(set(sms)) - for sm in sms: - nvcc_options.extend( - ["-gencode", f"arch=compute_{sm},code=sm_{sm}"] - ) - extra_compile_args = copy.deepcopy(extra_compile_args) - extra_compile_args["nvcc"] = nvcc_options kwargs['extra_compile_args'] = extra_compile_args