Merge pull request #7894 from obilaniu/cudaupdates

Update unstable CUDA Module to support newest toolkits and drivers.
pull/8005/head
Jussi Pakkanen 4 years ago committed by GitHub
commit 913374834c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. docs/markdown/Cuda-module.md (21 changed lines)
  2. mesonbuild/modules/unstable_cuda.py (136 changed lines)
  3. test cases/cuda/3 cudamodule/meson.build (60 changed lines)

@@ -35,14 +35,14 @@ It offers several useful functions that are enumerated below.
_Since: 0.50.0_
``` meson
cuda.nvcc_arch_flags(nvcc_or_version, ...,
cuda.nvcc_arch_flags(cuda_version_string, ...,
detected: string_or_array)
```
Returns a list of `-gencode` flags that should be passed to `cuda_args:` in
order to compile a "fat binary" for the architectures/compute capabilities
enumerated in the positional argument(s). The flags shall be acceptable to
the NVCC compiler object `nvcc_or_version`, or its version string.
an NVCC with CUDA Toolkit version string `cuda_version_string`.
A set of architectures and/or compute capabilities may be specified by:
@@ -71,14 +71,6 @@ mixed with architecture names or compute capabilities. Their interpretation is:
| `'Common'` | Relatively common CCs supported by given NVCC compiler. Generally excludes Tegra and Tesla devices. |
| `'Auto'` | The CCs provided by the `detected:` keyword, filtered for support by given NVCC compiler. |
As a special case, when `nvcc_arch_flags()` is invoked with
- an NVCC `compiler` object `nvcc`,
- `'Auto'` mode and
- no `detected:` keyword,
Meson uses `nvcc`'s architecture auto-detection results.
The supported architecture names and their corresponding compute capabilities
are:
@@ -95,6 +87,7 @@ are:
| `'Volta'` | 7.0 |
| `'Xavier'` | 7.2 |
| `'Turing'` | 7.5 |
| `'Ampere'` | 8.0, 8.6 |
Examples:
@@ -152,7 +145,7 @@ function `CUDA_SELECT_NVCC_ARCH_FLAGS(out_variable, [list of CUDA compute architectures])`
_Since: 0.50.0_
``` meson
cuda.nvcc_arch_readable(nvcc_or_version, ...,
cuda.nvcc_arch_readable(cuda_version_string, ...,
detected: string_or_array)
```
@@ -162,7 +155,7 @@ architectures that will be compiled for. The output of this function is solely
intended for informative message printing.
archs = '3.0 3.5 5.0+PTX'
readable = cuda.nvcc_arch_readable(nvcc, archs)
readable = cuda.nvcc_arch_readable('10.0', archs)
message('Building for architectures ' + ' '.join(readable))
This will print
@@ -178,11 +171,11 @@ _Note:_ This function is intended to closely replicate CMake's FindCUDA module f
_Since: 0.50.0_
``` meson
cuda.min_driver_version(nvcc_or_version)
cuda.min_driver_version(cuda_version_string)
```
Returns the minimum NVIDIA proprietary driver version required, on the host
system, by kernels compiled with the given NVCC compiler or its version string.
system, by kernels compiled with a CUDA Toolkit with the given version string.
The output of this function is generally intended for informative message
printing, but could be used for assertions or to conditionally enable

@@ -34,16 +34,19 @@ class CudaModule(ExtensionModule):
@noKwargs
def min_driver_version(self, state, args, kwargs):
argerror = InvalidArguments('min_driver_version must have exactly one positional argument: ' +
'an NVCC compiler object, or its version string.')
'a CUDA Toolkit version string. Beware that, since CUDA 11.0, ' +
'the CUDA Toolkit\'s components (including NVCC) are versioned ' +
'independently from each other (and the CUDA Toolkit as a whole).')
if len(args) != 1:
raise argerror
else:
cuda_version = self._version_from_compiler(args[0])
if cuda_version == 'unknown':
if len(args) != 1 or not isinstance(args[0], str):
raise argerror
cuda_version = args[0]
driver_version_table = [
{'cuda_version': '>=11.1.0', 'windows': '456.38', 'linux': '455.23'},
{'cuda_version': '>=11.0.3', 'windows': '451.82', 'linux': '450.51.06'},
{'cuda_version': '>=11.0.2', 'windows': '451.48', 'linux': '450.51.05'},
{'cuda_version': '>=11.0.1', 'windows': '451.22', 'linux': '450.36.06'},
{'cuda_version': '>=10.2.89', 'windows': '441.22', 'linux': '440.33'},
{'cuda_version': '>=10.1.105', 'windows': '418.96', 'linux': '418.39'},
{'cuda_version': '>=10.0.130', 'windows': '411.31', 'linux': '410.48'},
@@ -128,19 +131,45 @@ class CudaModule(ExtensionModule):
return cuda_version, arch_list, detected
def _filter_cuda_arch_list(self, cuda_arch_list, lo=None, hi=None, saturate=None):
    """
    Filter a CUDA architecture list (compute capabilities only, no
    codenames) against lower/upper toolkit bounds and deduplicate.

    Entries comparing < lo are dropped.  Entries comparing >= hi are
    dropped as well, unless `saturate` is given, in which case they are
    replaced (clamped) by `saturate`.  Input order is preserved and the
    first occurrence of each value wins.
    """
    kept = []
    for cap in cuda_arch_list:
        if not cap:
            # Ignore empty entries left over from arch-string splitting.
            continue
        if lo and version_compare(cap, '<' + lo):
            # Older than anything this toolkit can still target.
            continue
        if hi and version_compare(cap, '>=' + hi):
            if not saturate:
                continue
            # Clamp too-new architectures to the given fallback.
            cap = saturate
        if cap not in kept:
            kept.append(cap)
    return kept
def _nvcc_arch_flags(self, cuda_version, cuda_arch_list='Auto', detected=''):
"""
Using the CUDA Toolkit version (the NVCC version) and the target
architectures, compute the NVCC architecture flags.
Using the CUDA Toolkit version and the target architectures, compute
the NVCC architecture flags.
"""
# Replicates much of the logic of
# https://github.com/Kitware/CMake/blob/master/Modules/FindCUDA/select_compute_arch.cmake
# except that a bug with cuda_arch_list="All" is worked around by
# tracking both lower and upper limits on GPU architectures.
cuda_known_gpu_architectures = ['Fermi', 'Kepler', 'Maxwell'] # noqa: E221
cuda_common_gpu_architectures = ['3.0', '3.5', '5.0'] # noqa: E221
cuda_limit_gpu_architecture = None # noqa: E221
cuda_hi_limit_gpu_architecture = None # noqa: E221
cuda_lo_limit_gpu_architecture = '2.0' # noqa: E221
cuda_all_gpu_architectures = ['3.0', '3.2', '3.5', '5.0'] # noqa: E221
if version_compare(cuda_version, '<7.0'):
cuda_limit_gpu_architecture = '5.2'
cuda_hi_limit_gpu_architecture = '5.2'
if version_compare(cuda_version, '>=7.0'):
cuda_known_gpu_architectures += ['Kepler+Tegra', 'Kepler+Tesla', 'Maxwell+Tegra'] # noqa: E221
@@ -148,7 +177,7 @@ class CudaModule(ExtensionModule):
if version_compare(cuda_version, '<8.0'):
cuda_common_gpu_architectures += ['5.2+PTX'] # noqa: E221
cuda_limit_gpu_architecture = '6.0' # noqa: E221
cuda_hi_limit_gpu_architecture = '6.0' # noqa: E221
if version_compare(cuda_version, '>=8.0'):
cuda_known_gpu_architectures += ['Pascal', 'Pascal+Tegra'] # noqa: E221
@@ -157,23 +186,45 @@ class CudaModule(ExtensionModule):
if version_compare(cuda_version, '<9.0'):
cuda_common_gpu_architectures += ['6.1+PTX'] # noqa: E221
cuda_limit_gpu_architecture = '7.0' # noqa: E221
cuda_hi_limit_gpu_architecture = '7.0' # noqa: E221
if version_compare(cuda_version, '>=9.0'):
cuda_known_gpu_architectures += ['Volta', 'Xavier'] # noqa: E221
cuda_common_gpu_architectures += ['7.0', '7.0+PTX'] # noqa: E221
cuda_all_gpu_architectures += ['7.0', '7.0+PTX', '7.2', '7.2+PTX'] # noqa: E221
cuda_common_gpu_architectures += ['7.0'] # noqa: E221
cuda_all_gpu_architectures += ['7.0', '7.2'] # noqa: E221
# https://docs.nvidia.com/cuda/archive/9.0/cuda-toolkit-release-notes/index.html#unsupported-features
cuda_lo_limit_gpu_architecture = '3.0' # noqa: E221
if version_compare(cuda_version, '<10.0'):
cuda_limit_gpu_architecture = '7.5'
cuda_common_gpu_architectures += ['7.2+PTX'] # noqa: E221
cuda_hi_limit_gpu_architecture = '8.0' # noqa: E221
if version_compare(cuda_version, '>=10.0'):
cuda_known_gpu_architectures += ['Turing'] # noqa: E221
cuda_common_gpu_architectures += ['7.5', '7.5+PTX'] # noqa: E221
cuda_all_gpu_architectures += ['7.5', '7.5+PTX'] # noqa: E221
cuda_common_gpu_architectures += ['7.5'] # noqa: E221
cuda_all_gpu_architectures += ['7.5'] # noqa: E221
if version_compare(cuda_version, '<11.0'):
cuda_limit_gpu_architecture = '8.0'
cuda_common_gpu_architectures += ['7.5+PTX'] # noqa: E221
cuda_hi_limit_gpu_architecture = '8.0' # noqa: E221
if version_compare(cuda_version, '>=11.0'):
cuda_known_gpu_architectures += ['Ampere'] # noqa: E221
cuda_common_gpu_architectures += ['8.0'] # noqa: E221
cuda_all_gpu_architectures += ['8.0'] # noqa: E221
# https://docs.nvidia.com/cuda/archive/11.0/cuda-toolkit-release-notes/index.html#deprecated-features
cuda_lo_limit_gpu_architecture = '3.5' # noqa: E221
if version_compare(cuda_version, '<11.1'):
cuda_common_gpu_architectures += ['8.0+PTX'] # noqa: E221
cuda_hi_limit_gpu_architecture = '8.6' # noqa: E221
if version_compare(cuda_version, '>=11.1'):
cuda_common_gpu_architectures += ['8.6', '8.6+PTX'] # noqa: E221
cuda_all_gpu_architectures += ['8.6'] # noqa: E221
if version_compare(cuda_version, '<12.0'):
cuda_hi_limit_gpu_architecture = '9.0' # noqa: E221
if not cuda_arch_list:
cuda_arch_list = 'Auto'
@@ -188,16 +239,10 @@ class CudaModule(ExtensionModule):
cuda_arch_list = detected
else:
cuda_arch_list = self._break_arch_string(detected)
if cuda_limit_gpu_architecture:
filtered_cuda_arch_list = []
for arch in cuda_arch_list:
if arch:
if version_compare(arch, '>=' + cuda_limit_gpu_architecture):
arch = cuda_common_gpu_architectures[-1]
if arch not in filtered_cuda_arch_list:
filtered_cuda_arch_list.append(arch)
cuda_arch_list = filtered_cuda_arch_list
cuda_arch_list = self._filter_cuda_arch_list(cuda_arch_list,
cuda_lo_limit_gpu_architecture,
cuda_hi_limit_gpu_architecture,
cuda_common_gpu_architectures[-1])
else:
cuda_arch_list = cuda_common_gpu_architectures
elif isinstance(cuda_arch_list, str):
@@ -229,6 +274,7 @@ class CudaModule(ExtensionModule):
'Volta': (['7.0'], ['7.0']),
'Xavier': (['7.2'], []),
'Turing': (['7.5'], ['7.5']),
'Ampere': (['8.0'], ['8.0']),
}.get(arch_name, (None, None))
if arch_bin is None:
@@ -242,10 +288,6 @@ class CudaModule(ExtensionModule):
arch_ptx = arch_bin
cuda_arch_ptx += arch_ptx
cuda_arch_bin = re.sub('\\.', '', ' '.join(cuda_arch_bin))
cuda_arch_ptx = re.sub('\\.', '', ' '.join(cuda_arch_ptx))
cuda_arch_bin = re.findall('[0-9()]+', cuda_arch_bin)
cuda_arch_ptx = re.findall('[0-9]+', cuda_arch_ptx)
cuda_arch_bin = sorted(list(set(cuda_arch_bin)))
cuda_arch_ptx = sorted(list(set(cuda_arch_ptx)))
@@ -253,15 +295,37 @@ class CudaModule(ExtensionModule):
nvcc_archs_readable = []
for arch in cuda_arch_bin:
m = re.match('([0-9]+)\\(([0-9]+)\\)', arch)
if m:
nvcc_flags += ['-gencode', 'arch=compute_' + m[2] + ',code=sm_' + m[1]]
nvcc_archs_readable += ['sm_' + m[1]]
arch, codev = re.fullmatch(
'([0-9]+\\.[0-9])(?:\\(([0-9]+\\.[0-9])\\))?', arch).groups()
if version_compare(arch, '<' + cuda_lo_limit_gpu_architecture):
continue
if version_compare(arch, '>=' + cuda_hi_limit_gpu_architecture):
continue
if codev:
arch = arch.replace('.', '')
codev = codev.replace('.', '')
nvcc_flags += ['-gencode', 'arch=compute_' + codev + ',code=sm_' + arch]
nvcc_archs_readable += ['sm_' + arch]
else:
arch = arch.replace('.', '')
nvcc_flags += ['-gencode', 'arch=compute_' + arch + ',code=sm_' + arch]
nvcc_archs_readable += ['sm_' + arch]
for arch in cuda_arch_ptx:
arch, codev = re.fullmatch(
'([0-9]+\\.[0-9])(?:\\(([0-9]+\\.[0-9])\\))?', arch).groups()
if codev:
arch = codev
if version_compare(arch, '<' + cuda_lo_limit_gpu_architecture):
continue
if version_compare(arch, '>=' + cuda_hi_limit_gpu_architecture):
continue
arch = arch.replace('.', '')
nvcc_flags += ['-gencode', 'arch=compute_' + arch + ',code=compute_' + arch]
nvcc_archs_readable += ['compute_' + arch]

@@ -3,9 +3,9 @@ project('cudamodule', 'cuda', version : '1.0.0')
nvcc = meson.get_compiler('cuda')
cuda = import('unstable-cuda')
arch_flags = cuda.nvcc_arch_flags(nvcc, 'Auto', detected: ['6.0'])
arch_readable = cuda.nvcc_arch_readable(nvcc, 'Auto', detected: ['6.0'])
driver_version = cuda.min_driver_version(nvcc)
arch_flags = cuda.nvcc_arch_flags(nvcc.version(), 'Auto', detected: ['6.0'])
arch_readable = cuda.nvcc_arch_readable(nvcc.version(), 'Auto', detected: ['6.0'])
driver_version = cuda.min_driver_version(nvcc.version())
message('NVCC version: ' + nvcc.version())
message('NVCC flags: ' + ' '.join(arch_flags))
@@ -14,3 +14,57 @@ message('Driver version: >=' + driver_version)
# Build and run the sample CUDA program with the flags computed above.
exe = executable('prog', 'prog.cu', cuda_args: arch_flags)
test('cudatest', exe)

#
# Assert Series
#
# Each assert pins the exact '-gencode' output of cuda.nvcc_arch_flags()
# for a given CUDA Toolkit version string and architecture selection.

# Sanity test.
assert(' '.join(cuda.nvcc_arch_flags('11.1', '8.6')) ==
       '-gencode arch=compute_86,code=sm_86')

# CUDA Toolkit too old, flag filtered out.
# (An empty flag list joins to the empty string.)
assert(' '.join(cuda.nvcc_arch_flags('11.0', '8.6')) ==
       '')

# Named architectures.
assert(' '.join(cuda.nvcc_arch_flags('11.0', 'Ampere')) ==
       '-gencode arch=compute_80,code=sm_80')

# Splitting & deduplication.
assert(' '.join(cuda.nvcc_arch_flags('11.0', 'Ampere;8.0,8.0')) ==
       '-gencode arch=compute_80,code=sm_80')

# Same, but list supplied as list.
assert(' '.join(cuda.nvcc_arch_flags('11.0', ['Ampere', '8.0', '8.0'])) ==
       '-gencode arch=compute_80,code=sm_80')

# Same, but mode set to Auto with detected set to a string with a variety of separators.
assert(' '.join(cuda.nvcc_arch_flags('11.0', 'Auto', detected: 'Ampere;8.0,8.0')) ==
       '-gencode arch=compute_80,code=sm_80')

# Same, but detected set to a list.
assert(' '.join(cuda.nvcc_arch_flags('11.0', 'Auto', detected: ['Ampere', '8.0', '8.0'])) ==
       '-gencode arch=compute_80,code=sm_80')

# Ask for 8.6 binary with 8.0-level PTX.
assert(' '.join(cuda.nvcc_arch_flags('11.1', '8.6(8.0)')) ==
       '-gencode arch=compute_80,code=sm_86')

# Same, but keep the 8.0 PTX.
assert(' '.join(cuda.nvcc_arch_flags('11.1', '8.6(8.0)+PTX')) ==
       '-gencode arch=compute_80,code=sm_86 -gencode arch=compute_80,code=compute_80')

# Detected Ampere RTX 3090 on CUDA 10.2, saturate to 7.5+PTX
assert(' '.join(cuda.nvcc_arch_flags('10.2', 'Auto', detected: ['8.0'])) ==
       '-gencode arch=compute_75,code=sm_75 -gencode arch=compute_75,code=compute_75')

# Failed to auto-detect with CUDA 10.2, default to common GPUs (3.0;3.5;5.0;5.2;6.0;6.1;7.0;7.5+PTX)
assert(' '.join(cuda.nvcc_arch_flags('10.2', 'Auto', detected: [])) ==
       '-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 '+
       '-gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 '+
       '-gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 '+
       '-gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 '+
       '-gencode arch=compute_75,code=compute_75')

Loading…
Cancel
Save