meson/mesonbuild/compilers/cuda.py

# SPDX-License-Identifier: Apache-2.0
# Copyright 2012-2017 The Meson development team

from __future__ import annotations

import enum
import os.path
import string
import typing as T

from .. import coredata
from .. import options
from .. import mlog
from ..mesonlib import (
    EnvironmentException, Popen_safe,
    is_windows, LibType, version_compare
)
from ..options import OptionKey
from .compilers import Compiler

if T.TYPE_CHECKING:
    from .compilers import CompileCheckMode
    from ..build import BuildTarget
    from ..coredata import MutableKeyedOptionDictType, KeyedOptionDictType
    from ..dependencies import Dependency
    from ..environment import Environment  # noqa: F401
    from ..envconfig import MachineInfo
    from ..linkers.linkers import DynamicLinker
    from ..mesonlib import MachineChoice


cuda_optimization_args: T.Dict[str, T.List[str]] = {
    'plain': [],
    '0': ['-G'],
    'g': ['-O0'],
    '1': ['-O1'],
    '2': ['-O2', '-lineinfo'],
    '3': ['-O3'],
    's': ['-O3']
}

cuda_debug_args: T.Dict[bool, T.List[str]] = {
    False: [],
    True: ['-g']
}


class Phase(enum.Enum):

    COMPILER = 'compiler'
    LINKER = 'linker'


class CudaCompiler(Compiler):

    LINKER_PREFIX = '-Xlinker='
    language = 'cuda'

    # NVCC flags taking no arguments.
    _FLAG_PASSTHRU_NOARGS = {
        # NVCC --long-option,                   NVCC -short-option              CUDA Toolkit 11.2.1 Reference
        '--objdir-as-tempdir',                  '-objtemp',                     # 4.2.1.2
        '--generate-dependency-targets',        '-MP',                          # 4.2.1.12
        '--allow-unsupported-compiler',         '-allow-unsupported-compiler',  # 4.2.1.14
        '--link',                                                               # 4.2.2.1
        '--lib',                                '-lib',                         # 4.2.2.2
        '--device-link',                        '-dlink',                       # 4.2.2.3
        '--device-c',                           '-dc',                          # 4.2.2.4
        '--device-w',                           '-dw',                          # 4.2.2.5
        '--cuda',                               '-cuda',                        # 4.2.2.6
        '--compile',                            '-c',                           # 4.2.2.7
        '--fatbin',                             '-fatbin',                      # 4.2.2.8
        '--cubin',                              '-cubin',                       # 4.2.2.9
        '--ptx',                                '-ptx',                         # 4.2.2.10
        '--preprocess',                         '-E',                           # 4.2.2.11
        '--generate-dependencies',              '-M',                           # 4.2.2.12
        '--generate-nonsystem-dependencies',    '-MM',                          # 4.2.2.13
        '--generate-dependencies-with-compile', '-MD',                          # 4.2.2.14
        '--generate-nonsystem-dependencies-with-compile', '-MMD',               # 4.2.2.15
        '--run',                                                                # 4.2.2.16
        '--profile',                            '-pg',                          # 4.2.3.1
        '--debug',                              '-g',                           # 4.2.3.2
        '--device-debug',                       '-G',                           # 4.2.3.3
        '--extensible-whole-program',           '-ewp',                         # 4.2.3.4
        '--generate-line-info',                 '-lineinfo',                    # 4.2.3.5
        '--dlink-time-opt',                     '-dlto',                        # 4.2.3.8
        '--no-exceptions',                      '-noeh',                        # 4.2.3.11
        '--shared',                             '-shared',                      # 4.2.3.12
        '--no-host-device-initializer-list',    '-nohdinitlist',                # 4.2.3.15
        '--expt-relaxed-constexpr',             '-expt-relaxed-constexpr',      # 4.2.3.16
        '--extended-lambda',                    '-extended-lambda',             # 4.2.3.17
        '--expt-extended-lambda',               '-expt-extended-lambda',        # 4.2.3.18
        '--m32',                                '-m32',                         # 4.2.3.20
        '--m64',                                '-m64',                         # 4.2.3.21
        '--forward-unknown-to-host-compiler',   '-forward-unknown-to-host-compiler', # 4.2.5.1
        '--forward-unknown-to-host-linker',     '-forward-unknown-to-host-linker',   # 4.2.5.2
        '--dont-use-profile',                   '-noprof',                      # 4.2.5.3
        '--dryrun',                             '-dryrun',                      # 4.2.5.5
        '--verbose',                            '-v',                           # 4.2.5.6
        '--keep',                               '-keep',                        # 4.2.5.7
        '--save-temps',                         '-save-temps',                  # 4.2.5.9
        '--clean-targets',                      '-clean',                       # 4.2.5.10
        '--no-align-double',                                                    # 4.2.5.16
        '--no-device-link',                     '-nodlink',                     # 4.2.5.17
        '--allow-unsupported-compiler',         '-allow-unsupported-compiler',  # 4.2.5.18
        '--use_fast_math',                      '-use_fast_math',               # 4.2.7.7
        '--extra-device-vectorization',         '-extra-device-vectorization',  # 4.2.7.12
        '--compile-as-tools-patch',             '-astoolspatch',                # 4.2.7.13
        '--keep-device-functions',              '-keep-device-functions',       # 4.2.7.14
        '--disable-warnings',                   '-w',                           # 4.2.8.1
        '--source-in-ptx',                      '-src-in-ptx',                  # 4.2.8.2
        '--restrict',                           '-restrict',                    # 4.2.8.3
        '--Wno-deprecated-gpu-targets',         '-Wno-deprecated-gpu-targets',  # 4.2.8.4
        '--Wno-deprecated-declarations',        '-Wno-deprecated-declarations', # 4.2.8.5
        '--Wreorder',                           '-Wreorder',                    # 4.2.8.6
        '--Wdefault-stream-launch',             '-Wdefault-stream-launch',      # 4.2.8.7
        '--Wext-lambda-captures-this',          '-Wext-lambda-captures-this',   # 4.2.8.8
        '--display-error-number',               '-err-no',                      # 4.2.8.10
        '--resource-usage',                     '-res-usage',                   # 4.2.8.14
        '--help',                               '-h',                           # 4.2.8.15
        '--version',                            '-V',                           # 4.2.8.16
        '--list-gpu-code',                      '-code-ls',                     # 4.2.8.20
        '--list-gpu-arch',                      '-arch-ls',                     # 4.2.8.21
    }
    # Dictionary of NVCC flags taking either one argument or a comma-separated list.
    # Maps --long to -short options, because the short options are more GCC-like.
    _FLAG_LONG2SHORT_WITHARGS = {
        '--output-file':                        '-o',                           # 4.2.1.1
        '--pre-include':                        '-include',                     # 4.2.1.3
        '--library':                            '-l',                           # 4.2.1.4
        '--define-macro':                       '-D',                           # 4.2.1.5
        '--undefine-macro':                     '-U',                           # 4.2.1.6
        '--include-path':                       '-I',                           # 4.2.1.7
        '--system-include':                     '-isystem',                     # 4.2.1.8
        '--library-path':                       '-L',                           # 4.2.1.9
        '--output-directory':                   '-odir',                        # 4.2.1.10
        '--dependency-output':                  '-MF',                          # 4.2.1.11
        '--compiler-bindir':                    '-ccbin',                       # 4.2.1.13
        '--archiver-binary':                    '-arbin',                       # 4.2.1.15
        '--cudart':                             '-cudart',                      # 4.2.1.16
        '--cudadevrt':                          '-cudadevrt',                   # 4.2.1.17
        '--libdevice-directory':                '-ldir',                        # 4.2.1.18
        '--target-directory':                   '-target-dir',                  # 4.2.1.19
        '--optimization-info':                  '-opt-info',                    # 4.2.3.6
        '--optimize':                           '-O',                           # 4.2.3.7
        '--ftemplate-backtrace-limit':          '-ftemplate-backtrace-limit',   # 4.2.3.9
        '--ftemplate-depth':                    '-ftemplate-depth',             # 4.2.3.10
        '--x':                                  '-x',                           # 4.2.3.13
        '--std':                                '-std',                         # 4.2.3.14
        '--machine':                            '-m',                           # 4.2.3.19
        '--compiler-options':                   '-Xcompiler',                   # 4.2.4.1
        '--linker-options':                     '-Xlinker',                     # 4.2.4.2
        '--archive-options':                    '-Xarchive',                    # 4.2.4.3
        '--ptxas-options':                      '-Xptxas',                      # 4.2.4.4
        '--nvlink-options':                     '-Xnvlink',                     # 4.2.4.5
        '--threads':                            '-t',                           # 4.2.5.4
        '--keep-dir':                           '-keep-dir',                    # 4.2.5.8
        '--run-args':                           '-run-args',                    # 4.2.5.11
        '--input-drive-prefix':                 '-idp',                         # 4.2.5.12
        '--dependency-drive-prefix':            '-ddp',                         # 4.2.5.13
        '--drive-prefix':                       '-dp',                          # 4.2.5.14
        '--dependency-target-name':             '-MT',                          # 4.2.5.15
        '--default-stream':                     '-default-stream',              # 4.2.6.1
        '--gpu-architecture':                   '-arch',                        # 4.2.7.1
        '--gpu-code':                           '-code',                        # 4.2.7.2
        '--generate-code':                      '-gencode',                     # 4.2.7.3
        '--relocatable-device-code':            '-rdc',                         # 4.2.7.4
        '--entries':                            '-e',                           # 4.2.7.5
        '--maxrregcount':                       '-maxrregcount',                # 4.2.7.6
        '--ftz':                                '-ftz',                         # 4.2.7.8
        '--prec-div':                           '-prec-div',                    # 4.2.7.9
        '--prec-sqrt':                          '-prec-sqrt',                   # 4.2.7.10
        '--fmad':                               '-fmad',                        # 4.2.7.11
        '--Werror':                             '-Werror',                      # 4.2.8.9
        '--diag-error':                         '-diag-error',                  # 4.2.8.11
        '--diag-suppress':                      '-diag-suppress',               # 4.2.8.12
        '--diag-warn':                          '-diag-warn',                   # 4.2.8.13
        '--options-file':                       '-optf',                        # 4.2.8.17
        '--time':                               '-time',                        # 4.2.8.18
        '--qpp-config':                         '-qpp-config',                  # 4.2.8.19
    }
    # Reverse map -short to --long options.
    _FLAG_SHORT2LONG_WITHARGS = {v: k for k, v in _FLAG_LONG2SHORT_WITHARGS.items()}

    id = 'nvcc'

    def __init__(self, ccache: T.List[str], exelist: T.List[str], version: str, for_machine: MachineChoice,
                 is_cross: bool,
                 host_compiler: Compiler, info: 'MachineInfo',
                 linker: T.Optional['DynamicLinker'] = None,
                 full_version: T.Optional[str] = None):
        super().__init__(ccache, exelist, version, for_machine, info, linker=linker, full_version=full_version, is_cross=is_cross)
        self.host_compiler = host_compiler
        self.base_options = host_compiler.base_options
        # -Wpedantic generates useless churn due to nvcc's dual compilation model producing
        # a temporary host C++ file that includes gcc-style line directives:
        # https://stackoverflow.com/a/31001220
        self.warn_args = {
            level: self._to_host_flags(list(f for f in flags if f != '-Wpedantic'))
            for level, flags in host_compiler.warn_args.items()
        }
        self.host_werror_args = ['-Xcompiler=' + x for x in self.host_compiler.get_werror_args()]

    @classmethod
    def _shield_nvcc_list_arg(cls, arg: str, listmode: bool = True) -> str:
        r"""
        Shield an argument against both splitting by NVCC's list-argument
        parse logic, and interpretation by any shell.

        NVCC seems to consider every comma , that is neither escaped by \ nor inside
        a double-quoted string a split-point. Single-quotes do not provide protection
        against splitting; In fact, after splitting they are \-escaped. Unfortunately,
        double-quotes don't protect against shell expansion. What follows is a
        complex dance to accommodate everybody.
        """

        SQ = "'"
        DQ = '"'
        CM = ","
        BS = "\\"
        DQSQ = DQ+SQ+DQ
        quotable = set(string.whitespace+'"$`\\')

        if CM not in arg or not listmode:
            if SQ not in arg:
                # If any of the special characters "$`\ or whitespace are present, single-quote.
                # Otherwise return bare.
                if set(arg).intersection(quotable):
                    return SQ+arg+SQ
                else:
                    return arg # Easy case: no splits, no quoting.
            else:
                # There are single quotes. Double-quote them, and single-quote the
                # strings between them.
                l = [cls._shield_nvcc_list_arg(s) for s in arg.split(SQ)]
                l = sum([[s, DQSQ] for s in l][:-1], [])  # Interleave l with DQSQs
                return ''.join(l)
        else:
            # A comma is present, and list mode was active.
            # We apply (what we guess is) the (primitive) NVCC splitting rule:
            l = ['']
            instring = False
            argit = iter(arg)
            for c in argit:
                if c == CM and not instring:
                    l.append('')
                elif c == DQ:
                    l[-1] += c
                    instring = not instring
                elif c == BS:
                    try:
                        l[-1] += next(argit)
                    except StopIteration:
                        break
                else:
                    l[-1] += c

            # Shield individual strings, without listmode, then return them with
            # escaped commas between them.
            l = [cls._shield_nvcc_list_arg(s, listmode=False) for s in l]
            return r'\,'.join(l)

    @classmethod
    def _merge_flags(cls, flags: T.List[str]) -> T.List[str]:
        r"""
        The flags to NVCC gets exceedingly verbose and unreadable when too many of them
        are shielded with -Xcompiler. Merge consecutive -Xcompiler-wrapped arguments
        into one.
        """
        if len(flags) <= 1:
            return flags
        flagit = iter(flags)
        xflags = []

        def is_xcompiler_flag_isolated(flag: str) -> bool:
            return flag == '-Xcompiler'

        def is_xcompiler_flag_glued(flag: str) -> bool:
            return flag.startswith('-Xcompiler=')

        def is_xcompiler_flag(flag: str) -> bool:
            return is_xcompiler_flag_isolated(flag) or is_xcompiler_flag_glued(flag)

        def get_xcompiler_val(flag: str, flagit: T.Iterator[str]) -> str:
            if is_xcompiler_flag_glued(flag):
                return flag[len('-Xcompiler='):]
            else:
                try:
                    return next(flagit)
                except StopIteration:
                    return ""

        ingroup = False
        for flag in flagit:
            if not is_xcompiler_flag(flag):
                ingroup = False
                xflags.append(flag)
            elif ingroup:
                xflags[-1] += ','
                xflags[-1] += get_xcompiler_val(flag, flagit)
            elif is_xcompiler_flag_isolated(flag):
                ingroup = True
                xflags.append(flag)
                xflags.append(get_xcompiler_val(flag, flagit))
            elif is_xcompiler_flag_glued(flag):
                ingroup = True
                xflags.append(flag)
            else:
                raise ValueError("-Xcompiler flag merging failed, unknown argument form!")
        return xflags

    @classmethod
    def to_host_flags_base(cls, flags: T.List[str], phase: Phase = Phase.COMPILER, default_include_dirs: T.Optional[T.List[str]] = None) -> T.List[str]:
        """
        Translate generic "GCC-speak" plus particular "NVCC-speak" flags to NVCC flags.

        NVCC's "short" flags have broad similarities to the GCC standard, but have
        gratuitous, irritating differences.
        """
        xflags = []
        flagit = iter(flags)

        for flag in flagit:
            # The CUDA Toolkit Documentation, in 4.1. Command Option Types and Notation,
            # specifies that NVCC does not parse the standard flags as GCC does. It has
            # its own strategy, to wit:
            #
            #     nvcc recognizes three types of command options: boolean options, single
            #     value options, and list options.
            #
            #     Boolean options do not have an argument; they are either specified on a
            #     command line or not. Single value options must be specified at most once,
            #     and list options may be repeated. Examples of each of these option types
            #     are, respectively: --verbose (switch to verbose mode), --output-file
            #     (specify output file), and --include-path (specify include path).
            #
            #     Single value options and list options must have arguments, which must
            #     follow the name of the option itself by either one of more spaces or an
            #     equals character. When a one-character short name such as -I, -l, and -L
            #     is used, the value of the option may also immediately follow the option
            #     itself without being separated by spaces or an equal character. The
            #     individual values of list options may be separated by commas in a single
            #     instance of the option, or the option may be repeated, or any
            #     combination of these two cases.
            #
            # One strange consequence of this choice is that directory and filenames that
            # contain commas (',') cannot be passed to NVCC (at least, not as easily as
            # in GCC). Another strange consequence is that it is legal to supply flags
            # such as
            #
            #     -lpthread,rt,dl,util
            #     -l pthread,rt,dl,util
            #     -l=pthread,rt,dl,util
            #
            # and each of the above alternatives is equivalent to GCC-speak
            #
            #     -lpthread -lrt -ldl -lutil
            #     -l pthread -l rt -l dl -l util
            #     -l=pthread -l=rt -l=dl -l=util
            #
            # *With the exception of commas in the name*, GCC-speak for these list flags
            # is a strict subset of NVCC-speak, so we passthrough those flags.
            #
            # The -D macro-define flag is documented as somehow shielding commas from
            # splitting a definition. Balanced parentheses, braces and single-quotes
            # around the comma are not sufficient, but balanced double-quotes are. The
            # shielding appears to work with -l, -I, -L flags as well, for instance.
            #
            # Since our goal is to replicate GCC-speak as much as possible, we check for
            # commas in all list-arguments and shield them with double-quotes. We make
            # an exception for -D (where this would be value-changing) and -U (because
            # it isn't possible to define a macro with a comma in the name).

            if flag in cls._FLAG_PASSTHRU_NOARGS:
                xflags.append(flag)
                continue

            # Handle breakup of flag-values into a flag-part and value-part.
            if flag[:1] not in '-/':
                # This is not a flag. It's probably a file input. Pass it through.
                xflags.append(flag)
                continue
            elif flag[:1] == '/':
                # This is ambiguously either an MVSC-style /switch or an absolute path
                # to a file. For some magical reason the following works acceptably in
                # both cases.
                # We only want to prefix arguments that are NOT static archives, since
                # the latter could contain relocatable device code (-dc/-rdc=true).
                prefix = '' if flag.endswith('.a') else f'-X{phase.value}='
                wrap = '"' if ',' in flag else ''
                xflags.append(f'{prefix}{wrap}{flag}{wrap}')
                continue
            elif len(flag) >= 2 and flag[0] == '-' and flag[1] in 'IDULlmOxmte':
                # This is a single-letter short option. These options (with the
                # exception of -o) are allowed to receive their argument with neither
                # space nor = sign before them. Detect and separate them in that event.
                if flag[2:3] == '':            # -I something
                    try:
                        val = next(flagit)
                    except StopIteration:
                        pass
                elif flag[2:3] == '=':           # -I=something
                    val = flag[3:]
                else:                            # -Isomething
                    val = flag[2:]
                flag = flag[:2]                  # -I
            elif flag in cls._FLAG_LONG2SHORT_WITHARGS or \
                    flag in cls._FLAG_SHORT2LONG_WITHARGS:
                # This is either -o or a multi-letter flag, and it is receiving its
                # value isolated.
                try:
                    val = next(flagit)           # -o something
                except StopIteration:
                    pass
            elif flag.split('=', 1)[0] in cls._FLAG_LONG2SHORT_WITHARGS or \
                    flag.split('=', 1)[0] in cls._FLAG_SHORT2LONG_WITHARGS:
                # This is either -o or a multi-letter flag, and it is receiving its
                # value after an = sign.
                flag, val = flag.split('=', 1)    # -o=something
            # Some dependencies (e.g., BoostDependency) add unspaced "-isystem/usr/include" arguments
            elif flag.startswith('-isystem'):
                val = flag[8:].strip()
                flag = flag[:8]
            else:
                # This is a flag, and it's foreign to NVCC.
                #
                # We do not know whether this GCC-speak flag takes an isolated
                # argument. Assuming it does not (the vast majority indeed don't),
                # wrap this argument in an -Xcompiler flag and send it down to NVCC.
                if flag == '-ffast-math':
                    xflags.append('-use_fast_math')
                    xflags.append('-Xcompiler='+flag)
                elif flag == '-fno-fast-math':
                    xflags.append('-ftz=false')
                    xflags.append('-prec-div=true')
                    xflags.append('-prec-sqrt=true')
                    xflags.append('-Xcompiler='+flag)
                elif flag == '-freciprocal-math':
                    xflags.append('-prec-div=false')
                    xflags.append('-Xcompiler='+flag)
                elif flag == '-fno-reciprocal-math':
                    xflags.append('-prec-div=true')
                    xflags.append('-Xcompiler='+flag)
                else:
                    xflags.append('-Xcompiler='+cls._shield_nvcc_list_arg(flag))
                    # The above should securely handle GCC's -Wl, -Wa, -Wp, arguments.
                continue

            assert val is not None  # Should only trip if there is a missing argument.

            # Take care of the various NVCC-supported flags that need special handling.
            flag = cls._FLAG_LONG2SHORT_WITHARGS.get(flag, flag)

            if flag in {'-include', '-isystem', '-I', '-L', '-l'}:
                # These flags are known to GCC, but list-valued in NVCC. They potentially
                # require double-quoting to prevent NVCC interpreting the flags as lists
                # when GCC would not have done so.
                #
                # We avoid doing this quoting for -D to avoid redefining macros and for
                # -U because it isn't possible to define a macro with a comma in the name.
                # -U with comma arguments is impossible in GCC-speak (and thus unambiguous
                #in NVCC-speak, albeit unportable).
                if len(flag) == 2:
                    xflags.append(flag+cls._shield_nvcc_list_arg(val))
                elif flag == '-isystem' and default_include_dirs is not None and val in default_include_dirs:
                    # like GnuLikeCompiler, we have to filter out include directories specified
                    # with -isystem that overlap with the host compiler's search path
                    pass
                else:
                    xflags.append(flag)
                    xflags.append(cls._shield_nvcc_list_arg(val))
            elif flag == '-O':
                # Handle optimization levels GCC knows about that NVCC does not.
                if val == 'fast':
                    xflags.append('-O3')
                    xflags.append('-use_fast_math')
                    xflags.append('-Xcompiler')
                    xflags.append(flag+val)
                elif val in {'s', 'g', 'z'}:
                    xflags.append('-Xcompiler')
                    xflags.append(flag+val)
                else:
                    xflags.append(flag+val)
            elif flag in {'-D', '-U', '-m', '-t'}:
                xflags.append(flag+val)       # For style, keep glued.
            elif flag in {'-std'}:
                xflags.append(flag+'='+val)   # For style, keep glued.
            else:
                xflags.append(flag)
                xflags.append(val)

        return cls._merge_flags(xflags)

    def _to_host_flags(self, flags: T.List[str], phase: Phase = Phase.COMPILER) -> T.List[str]:
        return self.to_host_flags_base(flags, phase, self.host_compiler.get_default_include_dirs())

    def needs_static_linker(self) -> bool:
        return False

    def thread_link_flags(self, environment: 'Environment') -> T.List[str]:
        return self._to_host_flags(self.host_compiler.thread_link_flags(environment), Phase.LINKER)

    def sanity_check(self, work_dir: str, env: 'Environment') -> None:
        mlog.debug('Sanity testing ' + self.get_display_language() + ' compiler:', ' '.join(self.exelist))
        mlog.debug('Is cross compiler: %s.' % str(self.is_cross))

        sname = 'sanitycheckcuda.cu'
        code = r'''
        #include <cuda_runtime.h>
        #include <stdio.h>

        __global__ void kernel (void) {}

        int main(void){
            struct cudaDeviceProp prop;
            int count, i;
            cudaError_t ret = cudaGetDeviceCount(&count);
            if(ret != cudaSuccess){
                fprintf(stderr, "%d\n", (int)ret);
            }else{
                for(i=0;i<count;i++){
                    if(cudaGetDeviceProperties(&prop, i) == cudaSuccess){
                        fprintf(stdout, "%d.%d\n", prop.major, prop.minor);
                    }
                }
            }
            fflush(stderr);
            fflush(stdout);
            return 0;
        }
        '''
        binname = sname.rsplit('.', 1)[0]
        binname += '_cross' if self.is_cross else ''
        source_name = os.path.join(work_dir, sname)
        binary_name = os.path.join(work_dir, binname + '.exe')
        with open(source_name, 'w', encoding='utf-8') as ofile:
            ofile.write(code)

        # The Sanity Test for CUDA language will serve as both a sanity test
        # and a native-build GPU architecture detection test, useful later.
        #
        # For this second purpose, NVCC has very handy flags, --run and
        # --run-args, that allow one to run an application with the
        # environment set up properly. Of course, this only works for native
        # builds; For cross builds we must still use the exe_wrapper (if any).
        self.detected_cc = ''
        flags = []

        # Disable warnings, compile with statically-linked runtime for minimum
        # reliance on the system.
        flags += ['-w', '-cudart', 'static', source_name]

        # Use the -ccbin option, if available, even during sanity checking.
        # Otherwise, on systems where CUDA does not support the default compiler,
        # NVCC becomes unusable.
        flags += self.get_ccbin_args(env.coredata.optstore)

        # If cross-compiling, we can't run the sanity check, only compile it.
        if self.is_cross and not env.has_exe_wrapper():
            # Linking cross built apps is painful. You can't really
            # tell if you should use -nostdlib or not and for example
            # on OSX the compiler binary is the same but you need
            # a ton of compiler flags to differentiate between
            # arm and x86_64. So just compile.
            flags += self.get_compile_only_args()
        flags += self.get_output_args(binary_name)

        # Compile sanity check
        cmdlist = self.exelist + flags
        mlog.debug('Sanity check compiler command line: ', ' '.join(cmdlist))
        pc, stdo, stde = Popen_safe(cmdlist, cwd=work_dir)
        mlog.debug('Sanity check compile stdout: ')
        mlog.debug(stdo)
        mlog.debug('-----\nSanity check compile stderr:')
        mlog.debug(stde)
        mlog.debug('-----')
        if pc.returncode != 0:
            raise EnvironmentException(f'Compiler {self.name_string()} cannot compile programs.')

        # Run sanity check (if possible)
        if self.is_cross:
            if not env.has_exe_wrapper():
                return
            else:
                cmdlist = env.exe_wrapper.get_command() + [binary_name]
        else:
            cmdlist = self.exelist + ['--run', '"' + binary_name + '"']
        mlog.debug('Sanity check run command line: ', ' '.join(cmdlist))
        pe, stdo, stde = Popen_safe(cmdlist, cwd=work_dir)
        mlog.debug('Sanity check run stdout: ')
        mlog.debug(stdo)
        mlog.debug('-----\nSanity check run stderr:')
        mlog.debug(stde)
        mlog.debug('-----')
        pe.wait()
        if pe.returncode != 0:
            raise EnvironmentException(f'Executables created by {self.language} compiler {self.name_string()} are not runnable.')

        # Interpret the result of the sanity test.
        # As mentioned above, it is not only a sanity test but also a GPU
        # architecture detection test.
        if stde == '':
            self.detected_cc = stdo
        else:
            mlog.debug('cudaGetDeviceCount() returned ' + stde)

    def has_header_symbol(self, hname: str, symbol: str, prefix: str,
                          env: 'Environment', *,
                          extra_args: T.Union[None, T.List[str], T.Callable[[CompileCheckMode], T.List[str]]] = None,
                          dependencies: T.Optional[T.List['Dependency']] = None) -> T.Tuple[bool, bool]:
        if extra_args is None:
            extra_args = []
        fargs = {'prefix': prefix, 'header': hname, 'symbol': symbol}
        # Check if it's a C-like symbol
        t = '''{prefix}
        #include <{header}>
        int main(void) {{
            /* If it's not defined as a macro, try to use as a symbol */
            #ifndef {symbol}
                {symbol};
            #endif
            return 0;
        }}'''
        found, cached = self.compiles(t.format_map(fargs), env, extra_args=extra_args, dependencies=dependencies)
        if found:
            return True, cached
        # Check if it's a class or a template
        t = '''{prefix}
        #include <{header}>
        using {symbol};
        int main(void) {{
            return 0;
        }}'''
        return self.compiles(t.format_map(fargs), env, extra_args=extra_args, dependencies=dependencies)

    _CPP14_VERSION = '>=9.0'
    _CPP17_VERSION = '>=11.0'
    _CPP20_VERSION = '>=12.0'

    def get_options(self) -> 'MutableKeyedOptionDictType':
        cpp_stds = ['none', 'c++03', 'c++11']
        if version_compare(self.version, self._CPP14_VERSION):
            cpp_stds += ['c++14']
        if version_compare(self.version, self._CPP17_VERSION):
            cpp_stds += ['c++17']
        if version_compare(self.version, self._CPP20_VERSION):
            cpp_stds += ['c++20']

        return self.update_options(
            super().get_options(),
            self.create_option(options.UserComboOption,
                               self.form_compileropt_key('std'),
                               'C++ language standard to use with CUDA',
                               cpp_stds,
                               'none'),
            self.create_option(options.UserStringOption,
                               self.form_compileropt_key('ccbindir'),
                               'CUDA non-default toolchain directory to use (-ccbin)',
                               ''),
        )

    def _to_host_compiler_options(self, master_options: 'KeyedOptionDictType') -> 'KeyedOptionDictType':
        """
        Convert an NVCC Option set to a host compiler's option set.
        """

        # We must strip the -std option from the host compiler option set, as NVCC has
        # its own -std flag that may not agree with the host compiler's.
        host_options = {key: master_options.get(key, opt) for key, opt in self.host_compiler.get_options().items()}
        std_key = OptionKey(f'{self.host_compiler.language}_std', machine=self.for_machine)
        overrides = {std_key: 'none'}
        # To shut up mypy.
        return coredata.OptionsView(host_options, overrides=overrides)

    def get_option_compile_args(self, options: 'KeyedOptionDictType') -> T.List[str]:
        args = self.get_ccbin_args(options)
        # On Windows, the version of the C++ standard used by nvcc is dictated by
        # the combination of CUDA version and MSVC version; the --std= is thus ignored
        # and attempting to use it will result in a warning: https://stackoverflow.com/a/51272091/741027
        if not is_windows():
            key = self.form_compileropt_key('std')
            std = options.get_value(key)
            if std != 'none':
                args.append('--std=' + std)

        return args + self._to_host_flags(self.host_compiler.get_option_compile_args(self._to_host_compiler_options(options)))

    def get_option_link_args(self, options: 'KeyedOptionDictType') -> T.List[str]:
        args = self.get_ccbin_args(options)
        return args + self._to_host_flags(self.host_compiler.get_option_link_args(self._to_host_compiler_options(options)), Phase.LINKER)

    def get_soname_args(self, env: 'Environment', prefix: str, shlib_name: str,
                        suffix: str, soversion: str,
                        darwin_versions: T.Tuple[str, str]) -> T.List[str]:
        return self._to_host_flags(self.host_compiler.get_soname_args(
            env, prefix, shlib_name, suffix, soversion, darwin_versions), Phase.LINKER)

    def get_compile_only_args(self) -> T.List[str]:
        return ['-c']

    def get_no_optimization_args(self) -> T.List[str]:
        return ['-O0']

    def get_optimization_args(self, optimization_level: str) -> T.List[str]:
        # alternatively, consider simply redirecting this to the host compiler, which would
        # give us more control over options like "optimize for space" (which nvcc doesn't support):
        # return self._to_host_flags(self.host_compiler.get_optimization_args(optimization_level))
        return cuda_optimization_args[optimization_level]

    def sanitizer_compile_args(self, value: str) -> T.List[str]:
        return self._to_host_flags(self.host_compiler.sanitizer_compile_args(value))

    def sanitizer_link_args(self, value: str) -> T.List[str]:
        return self._to_host_flags(self.host_compiler.sanitizer_link_args(value))

    def get_debug_args(self, is_debug: bool) -> T.List[str]:
        return cuda_debug_args[is_debug]

    def get_werror_args(self) -> T.List[str]:
        device_werror_args = ['-Werror=cross-execution-space-call,deprecated-declarations,reorder']
        return device_werror_args + self.host_werror_args

    def get_warn_args(self, level: str) -> T.List[str]:
        return self.warn_args[level]

    def get_include_args(self, path: str, is_system: bool) -> T.List[str]:
        if path == '':
            path = '.'
        return ['-isystem=' + path] if is_system else ['-I' + path]

    def get_compile_debugfile_args(self, rel_obj: str, pch: bool = False) -> T.List[str]:
        return self._to_host_flags(self.host_compiler.get_compile_debugfile_args(rel_obj, pch))

    def get_link_debugfile_args(self, targetfile: str) -> T.List[str]:
        return self._to_host_flags(self.host_compiler.get_link_debugfile_args(targetfile), Phase.LINKER)

    def get_depfile_suffix(self) -> str:
        return 'd'

    def get_optimization_link_args(self, optimization_level: str) -> T.List[str]:
        return self._to_host_flags(self.host_compiler.get_optimization_link_args(optimization_level), Phase.LINKER)

    def build_rpath_args(self, env: 'Environment', build_dir: str, from_dir: str,
                         rpath_paths: T.Tuple[str, ...], build_rpath: str,
                         install_rpath: str) -> T.Tuple[T.List[str], T.Set[bytes]]:
        (rpath_args, rpath_dirs_to_remove) = self.host_compiler.build_rpath_args(
            env, build_dir, from_dir, rpath_paths, build_rpath, install_rpath)
        return (self._to_host_flags(rpath_args, Phase.LINKER), rpath_dirs_to_remove)

    def linker_to_compiler_args(self, args: T.List[str]) -> T.List[str]:
        return args

    def get_pic_args(self) -> T.List[str]:
        return self._to_host_flags(self.host_compiler.get_pic_args())

    def compute_parameters_with_absolute_paths(self, parameter_list: T.List[str],
                                               build_dir: str) -> T.List[str]:
        return []

    def get_output_args(self, target: str) -> T.List[str]:
        return ['-o', target]

    def get_dependency_gen_args(self, outtarget: str, outfile: str) -> T.List[str]:
        if version_compare(self.version, '>= 10.2'):
            # According to nvcc Documentation, `-MD` option is added after 10.2
            # Reference: [CUDA 10.1](https://docs.nvidia.com/cuda/archive/10.1/cuda-compiler-driver-nvcc/index.html#options-for-specifying-compilation-phase-generate-nonsystem-dependencies)
            # Reference: [CUDA 10.2](https://docs.nvidia.com/cuda/archive/10.2/cuda-compiler-driver-nvcc/index.html#options-for-specifying-compilation-phase-generate-nonsystem-dependencies)
            return ['-MD', '-MT', outtarget, '-MF', outfile]
        else:
            return []

    def get_std_exe_link_args(self) -> T.List[str]:
        return self._to_host_flags(self.host_compiler.get_std_exe_link_args(), Phase.LINKER)

    def find_library(self, libname: str, env: 'Environment', extra_dirs: T.List[str],
                     libtype: LibType = LibType.PREFER_SHARED, lib_prefix_warning: bool = True) -> T.Optional[T.List[str]]:
        return self.host_compiler.find_library(libname, env, extra_dirs, libtype, lib_prefix_warning)

    def get_crt_compile_args(self, crt_val: str, buildtype: str) -> T.List[str]:
        return self._to_host_flags(self.host_compiler.get_crt_compile_args(crt_val, buildtype))

    def get_crt_link_args(self, crt_val: str, buildtype: str) -> T.List[str]:
        # nvcc defaults to static, release version of msvc runtime and provides no
        # native option to override it; override it with /NODEFAULTLIB
        host_link_arg_overrides = []
        host_crt_compile_args = self.host_compiler.get_crt_compile_args(crt_val, buildtype)
        if any(arg in {'/MDd', '/MD', '/MTd'} for arg in host_crt_compile_args):
            host_link_arg_overrides += ['/NODEFAULTLIB:LIBCMT.lib']
        return self._to_host_flags(host_link_arg_overrides + self.host_compiler.get_crt_link_args(crt_val, buildtype), Phase.LINKER)

    def get_target_link_args(self, target: 'BuildTarget') -> T.List[str]:
        return self._to_host_flags(super().get_target_link_args(target), Phase.LINKER)

    def get_dependency_compile_args(self, dep: 'Dependency') -> T.List[str]:
        return self._to_host_flags(super().get_dependency_compile_args(dep))

    def get_dependency_link_args(self, dep: 'Dependency') -> T.List[str]:
        return self._to_host_flags(super().get_dependency_link_args(dep), Phase.LINKER)

    def get_ccbin_args(self, ccoptions: 'KeyedOptionDictType') -> T.List[str]:
        key = self.form_compileropt_key('ccbindir')
        ccbindir = ccoptions.get_value(key)
        if isinstance(ccbindir, str) and ccbindir != '':
            return [self._shield_nvcc_list_arg('-ccbin='+ccbindir, False)]
        else:
            return []

    def get_profile_generate_args(self) -> T.List[str]:
        return ['-Xcompiler=' + x for x in self.host_compiler.get_profile_generate_args()]

    def get_profile_use_args(self) -> T.List[str]:
        return ['-Xcompiler=' + x for x in self.host_compiler.get_profile_use_args()]

    def get_assert_args(self, disable: bool, env: 'Environment') -> T.List[str]:
        return self.host_compiler.get_assert_args(disable, env)