Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion python/paddle/utils/cpp_extension/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,13 @@
CppExtension,
CUDAExtension,
_compute_worker_number, # noqa: F401
_get_cuda_arch_flags, # noqa: F401
_get_num_workers, # noqa: F401
_get_pybind11_abi_build_flags, # noqa: F401
load,
setup,
)
from .extension_utils import (
_get_cuda_arch_flags, # noqa: F401
get_build_directory,
load_op_meta_info_and_register_op, # noqa: F401
parse_op_info, # noqa: F401
Expand Down
112 changes: 0 additions & 112 deletions python/paddle/utils/cpp_extension/cpp_extension.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@
import concurrent
import functools
import re
import warnings
import collections
import setuptools
import sys
import paddle
Expand Down Expand Up @@ -1478,116 +1476,6 @@ def load(
return custom_op_api


def _get_cuda_arch_flags(cflags: list[str] | None = None) -> list[str]:
"""
Determine CUDA arch flags to use.

For an arch, say "6.1", the added compile flag will be
``-gencode=arch=compute_61,code=sm_61``.
For an added "+PTX", an additional
``-gencode=arch=compute_xx,code=compute_xx`` is added.
"""
# If cflags is given, there may already be user-provided arch flags in it
if cflags is not None:
for flag in cflags:
if any(x in flag for x in ['PADDLE_EXTENSION_NAME']):
continue
if 'arch' in flag:
return []

named_arches = collections.OrderedDict(
[
('Pascal', '6.0;6.1+PTX'),
('Volta+Tegra', '7.2'),
('Volta', '7.0+PTX'),
('Turing', '7.5+PTX'),
('Ampere+Tegra', '8.7'),
('Ampere', '8.0;8.6+PTX'),
('Ada', '8.9+PTX'),
('Hopper', '9.0+PTX'),
('Blackwell+Tegra', '10.1'),
('Blackwell', '10.0;12.0+PTX'),
]
)

supported_arches = [
'6.0',
'6.1',
'6.2',
'7.0',
'7.2',
'7.5',
'8.0',
'8.6',
'8.7',
'8.9',
'9.0',
'9.0a',
'10.0',
'10.0a',
'10.1',
'10.1a',
'12.0',
'12.0a',
]
valid_arch_strings = supported_arches + [
s + "+PTX" for s in supported_arches
]

_arch_list = os.environ.get("PADDLE_CUDA_ARCH_LIST")

if not _arch_list:
warnings.warn(
"PADDLE_CUDA_ARCH_LIST are not set, all archs for visible cards are included for compilation. \n"
"If this is not desired, please set os.environ['PADDLE_CUDA_ARCH_LIST']."
)
arch_list = []
dev_types = core.get_all_custom_device_type()
if core.is_compiled_with_cuda():
for dev_id in range(paddle.device.cuda.device_count()):
capability = paddle.device.cuda.get_device_capability(
dev_id
) # (major, minor)
arch = f"{capability[0]}.{capability[1]}"
if arch not in arch_list:
arch_list.append(arch)
arch_list = sorted(arch_list)
if arch_list:
arch_list[-1] += '+PTX'
elif dev_types and core.is_compiled_with_custom_device(dev_types[0]):
for dev_id in range(paddle.device.device_count()):
capability = paddle.device.get_device_capability(
dev_types[0], dev_id
)
arch = f"{capability[0]}.{capability[1]}"
if arch not in arch_list:
arch_list.append(arch)
arch_list = sorted(arch_list)
if arch_list:
arch_list[-1] += '+PTX'
else:
raise RuntimeError(
"Paddle is not compiled with CUDA or Custom Device, cannot determine CUDA arch."
)
else:
_arch_list = _arch_list.replace(' ', ';')
for named_arch, archival in named_arches.items():
_arch_list = _arch_list.replace(named_arch, archival)
arch_list = _arch_list.split(';')

flags = []
for arch in arch_list:
if arch not in valid_arch_strings:
raise ValueError(f"Unknown CUDA arch ({arch}) or GPU not supported")
version = arch.split('+')[0]
major, minor = version.split('.')
num = f"{major}{minor}"
flags.append(f"-gencode=arch=compute_{num},code=sm_{num}")
if arch.endswith('+PTX'):
flags.append(f"-gencode=arch=compute_{num},code=compute_{num}")
return sorted(set(flags))


def _get_pybind11_abi_build_flags():
abi_cflags = []
for pname in ["COMPILER_TYPE", "STDLIB", "BUILD_ABI"]:
Expand Down
136 changes: 106 additions & 30 deletions python/paddle/utils/cpp_extension/extension_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

import atexit
import collections
import copy
import glob
import hashlib
import importlib.abc
Expand All @@ -33,6 +32,8 @@
import warnings
from importlib import machinery

import paddle

try:
from subprocess import DEVNULL # py3
except ImportError:
Expand Down Expand Up @@ -393,7 +394,7 @@ def prepare_unix_cudaflags(cflags):
'--expt-relaxed-constexpr',
'-DNVCC',
*cflags,
*get_cuda_arch_flags(cflags),
*_get_cuda_arch_flags(cflags),
]

return cflags
Expand All @@ -403,7 +404,7 @@ def prepare_win_cudaflags(cflags):
"""
Prepare all necessary compiled flags for nvcc compiling CUDA files.
"""
cflags = [*COMMON_NVCC_FLAGS, '-w', *cflags, *get_cuda_arch_flags(cflags)]
cflags = [*COMMON_NVCC_FLAGS, '-w', *cflags, *_get_cuda_arch_flags(cflags)]

return cflags

Expand All @@ -419,15 +420,114 @@ def add_std_without_repeat(cflags, compiler_type, use_std17=False):
cflags.append(cpp_flag)


def get_cuda_arch_flags(cflags):
def _get_cuda_arch_flags(cflags: list[str] | None = None) -> list[str]:
"""
Determine CUDA arch flags to use.

For an arch, say "6.1", the added compile flag will be
``-gencode=arch=compute_61,code=sm_61``.
For an added "+PTX", an additional
``-gencode=arch=compute_xx,code=compute_xx`` is added.
"""
# TODO(Aurelius84):
return []
# If cflags is given, there may already be user-provided arch flags in it
if cflags is not None:
for flag in cflags:
if any(x in flag for x in ['PADDLE_EXTENSION_NAME']):
continue
if 'arch' in flag:
return []

named_arches = collections.OrderedDict(
[
('Pascal', '6.0;6.1+PTX'),
('Volta+Tegra', '7.2'),
('Volta', '7.0+PTX'),
('Turing', '7.5+PTX'),
('Ampere+Tegra', '8.7'),
('Ampere', '8.0;8.6+PTX'),
('Ada', '8.9+PTX'),
('Hopper', '9.0+PTX'),
('Blackwell+Tegra', '10.1'),
('Blackwell', '10.0;12.0+PTX'),
]
)

supported_arches = [
'6.0',
'6.1',
'6.2',
'7.0',
'7.2',
'7.5',
'8.0',
'8.6',
'8.7',
'8.9',
'9.0',
'9.0a',
'10.0',
'10.0a',
'10.1',
'10.1a',
'12.0',
'12.0a',
]
valid_arch_strings = supported_arches + [
s + "+PTX" for s in supported_arches
]

_arch_list = os.environ.get("PADDLE_CUDA_ARCH_LIST")

if not _arch_list:
warnings.warn(
"PADDLE_CUDA_ARCH_LIST are not set, all archs for visible cards are included for compilation. \n"
"If this is not desired, please set os.environ['PADDLE_CUDA_ARCH_LIST']."
)
arch_list = []
dev_types = core.get_all_custom_device_type()
if core.is_compiled_with_cuda():
for dev_id in range(paddle.device.cuda.device_count()):
capability = paddle.device.cuda.get_device_capability(
dev_id
) # (major, minor)
arch = f"{capability[0]}.{capability[1]}"
if arch not in arch_list:
arch_list.append(arch)
arch_list = sorted(arch_list)
if arch_list:
arch_list[-1] += '+PTX'
elif dev_types and core.is_compiled_with_custom_device(dev_types[0]):
for dev_id in range(paddle.device.device_count()):
capability = paddle.device.get_device_capability(
dev_types[0], dev_id
)
arch = f"{capability[0]}.{capability[1]}"
if arch not in arch_list:
arch_list.append(arch)
arch_list = sorted(arch_list)
if arch_list:
arch_list[-1] += '+PTX'
else:
raise RuntimeError(
"Paddle is not compiled with CUDA or Custom Device, cannot determine CUDA arch."
)
else:
_arch_list = _arch_list.replace(' ', ';')
for named_arch, archival in named_arches.items():
_arch_list = _arch_list.replace(named_arch, archival)
arch_list = _arch_list.split(';')

flags = []
for arch in arch_list:
if arch not in valid_arch_strings:
raise ValueError(f"Unknown CUDA arch ({arch}) or GPU not supported")
version = arch.split('+')[0]
major, minor = version.split('.')
num = f"{major}{minor}"
flags.append(f"-gencode=arch=compute_{num},code=sm_{num}")
if arch.endswith('+PTX'):
flags.append(f"-gencode=arch=compute_{num},code=compute_{num}")
return sorted(set(flags))


def get_rocm_arch_flags(cflags):
Expand Down Expand Up @@ -612,30 +712,6 @@ def normalize_extension_kwargs(kwargs, use_cuda=False):
if compile_dir is None:
# Add this compile option to isolate base headers
add_compile_flag(extra_compile_args, ['-DPADDLE_WITH_CUSTOM_KERNEL'])
if core.is_compiled_with_cuda():
arch_list = os.getenv("PADDLE_CUDA_ARCH_LIST")
if arch_list:
arch_list = [
s.strip() for s in re.split(r";|\s|\,", arch_list) if s.strip()
]
nvcc_options = list(extra_compile_args.get("nvcc", []))
sms = []
for s in arch_list:
sm = [int(ss) for ss in s.split(".") if ss]
assert len(sm) in [1, 2], f"invalid sm format: {s}"
if len(sm) == 2:
sm = sm[0] * 10 + sm[1]
else:
sm = sm[0]
sms.append(sm)

sms = sorted(set(sms))
for sm in sms:
nvcc_options.extend(
["-gencode", f"arch=compute_{sm},code=sm_{sm}"]
)
extra_compile_args = copy.deepcopy(extra_compile_args)
extra_compile_args["nvcc"] = nvcc_options

kwargs['extra_compile_args'] = extra_compile_args

Expand Down
Loading