Merge pull request #4972 from obilaniu/cudafixes

CUDA fixes
Merged by Jussi Pakkanen (committed via GitHub)
commit 41fb5c2960
3 changed files:

1. docs/markdown/Cuda-module.md (10 lines changed)
2. mesonbuild/compilers/cuda.py (152 lines changed)
3. mesonbuild/modules/unstable_cuda.py (20 lines changed)

@@ -71,6 +71,14 @@ mixed with architecture names or compute capabilities. Their interpretation is:
 | `'Common'` | Relatively common CCs supported by given NVCC compiler. Generally excludes Tegra and Tesla devices. |
 | `'Auto'`   | The CCs provided by the `detected:` keyword, filtered for support by given NVCC compiler. |
 
+As a special case, when `nvcc_arch_flags()` is invoked with
+
+- an NVCC `compiler` object `nvcc`,
+- `'Auto'` mode and
+- no `detected:` keyword,
+
+Meson uses `nvcc`'s architecture auto-detection results.
+
 The supported architecture names and their corresponding compute capabilities
 are:
@@ -85,7 +93,7 @@ are:
 | `'Pascal'`       | 6.0, 6.1 |
 | `'Pascal+Tegra'` | 6.2      |
 | `'Volta'`        | 7.0      |
-| `'Volta+Tegra'`  | 7.2      |
+| `'Xavier'`       | 7.2      |
 | `'Turing'`       | 7.5      |
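
Reading the documented fallback end to end: an explicit `detected:` keyword wins, otherwise `'Auto'` falls back to whatever compute capabilities the NVCC compiler object auto-detected, and the result is filtered against the CCs that NVCC release supports. A minimal Python sketch of that order (the helper name `resolve_auto_ccs` and the sample CC lists are illustrative, not part of the module):

```python
import re

def resolve_auto_ccs(detected, compiler_detected_cc, known_ccs):
    # Illustrative only: the documented 'Auto' fallback.
    # 1. an explicit detected: value wins,
    # 2. otherwise use the CCs the CUDA compiler auto-detected at
    #    sanity-check time (e.g. '6.1\n7.5\n'),
    # 3. keep only CCs the given NVCC release actually supports.
    if detected is None:
        detected = compiler_detected_cc
    ccs = [c for c in re.split(r'[ \t\r\n,;]+', detected) if c]
    return [c for c in ccs if c in known_ccs]

# An NVCC that supports CCs up to 7.2, on a machine reporting 6.1 and 7.5 devices:
print(resolve_auto_ccs(None, '6.1\n7.5\n', ['3.0', '5.0', '6.0', '6.1', '7.0', '7.2']))  # ['6.1']
```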

@@ -47,35 +47,97 @@ class CudaCompiler(Compiler):
         return []
 
     def sanity_check(self, work_dir, environment):
-        source_name = os.path.join(work_dir, 'sanitycheckcuda.cu')
-        binary_name = os.path.join(work_dir, 'sanitycheckcuda')
-        extra_flags = self.get_cross_extra_flags(environment, link=False)
-        if self.is_cross:
-            extra_flags += self.get_compile_only_args()
-
-        code = '''
-__global__ void kernel (void) {
-}
-
-int main(int argc,char** argv){
-    return 0;
-}
-'''
+        mlog.debug('Sanity testing ' + self.get_display_language() + ' compiler:', ' '.join(self.exelist))
+        mlog.debug('Is cross compiler: %s.' % str(self.is_cross))
+
+        sname = 'sanitycheckcuda.cu'
+        code = r'''
+        #include <cuda_runtime.h>
+        #include <stdio.h>
+
+        __global__ void kernel (void) {}
+
+        int main(void){
+            struct cudaDeviceProp prop;
+            int count, i;
+            cudaError_t ret = cudaGetDeviceCount(&count);
+
+            if(ret != cudaSuccess){
+                fprintf(stderr, "%d\n", (int)ret);
+            }else{
+                for(i=0;i<count;i++){
+                    if(cudaGetDeviceProperties(&prop, i) == cudaSuccess){
+                        fprintf(stdout, "%d.%d\n", prop.major, prop.minor);
+                    }
+                }
+            }
+
+            fflush(stderr);
+            fflush(stdout);
+
+            return 0;
+        }
+        '''
+
+        binname = sname.rsplit('.', 1)[0]
+        binname += '_cross' if self.is_cross else ''
+        source_name = os.path.join(work_dir, sname)
+        binary_name = os.path.join(work_dir, binname + '.exe')
         with open(source_name, 'w') as ofile:
             ofile.write(code)
-        pc = subprocess.Popen(self.exelist + extra_flags + [source_name, '-o', binary_name])
-        pc.wait()
+
+        # The Sanity Test for CUDA language will serve as both a sanity test
+        # and a native-build GPU architecture detection test, useful later.
+        #
+        # For this second purpose, NVCC has very handy flags, --run and
+        # --run-args, that allow one to run an application with the
+        # environment set up properly. Of course, this only works for native
+        # builds; for cross builds we must still use the exe_wrapper (if any).
+        self.detected_cc = ''
+        flags = ['-w', '-cudart', 'static', source_name]
+        if self.is_cross and self.exe_wrapper is None:
+            # Linking cross built apps is painful. You can't really
+            # tell if you should use -nostdlib or not and for example
+            # on OSX the compiler binary is the same but you need
+            # a ton of compiler flags to differentiate between
+            # arm and x86_64. So just compile.
+            flags += self.get_compile_only_args()
+        flags += self.get_output_args(binary_name)
+
+        # Compile sanity check
+        cmdlist = self.exelist + flags
+        mlog.debug('Sanity check compiler command line: ', ' '.join(cmdlist))
+        pc, stdo, stde = Popen_safe(cmdlist, cwd=work_dir)
+        mlog.debug('Sanity check compile stdout: ')
+        mlog.debug(stdo)
+        mlog.debug('-----\nSanity check compile stderr:')
+        mlog.debug(stde)
+        mlog.debug('-----')
         if pc.returncode != 0:
-            raise EnvironmentException('Cuda compiler %s can not compile programs.' % self.name_string())
+            raise EnvironmentException('Compiler {0} can not compile programs.'.format(self.name_string()))
+
+        # Run sanity check (if possible)
         if self.is_cross:
-            # Can't check if the binaries run so we have to assume they do
-            return
-        pe = subprocess.Popen(binary_name)
+            if self.exe_wrapper is None:
+                return
+            else:
+                cmdlist = self.exe_wrapper + [binary_name]
+        else:
+            cmdlist = self.exelist + ['--run', '"' + binary_name + '"']
+        mlog.debug('Sanity check run command line: ', ' '.join(cmdlist))
+        pe, stdo, stde = Popen_safe(cmdlist, cwd=work_dir)
+        mlog.debug('Sanity check run stdout: ')
+        mlog.debug(stdo)
+        mlog.debug('-----\nSanity check run stderr:')
+        mlog.debug(stde)
+        mlog.debug('-----')
         pe.wait()
         if pe.returncode != 0:
-            raise EnvironmentException('Executables created by Cuda compiler %s are not runnable.' % self.name_string())
+            raise EnvironmentException('Executables created by {0} compiler {1} are not runnable.'.format(self.language, self.name_string()))
+
+        # Interpret the result of the sanity test.
+        # As mentioned above, it is not only a sanity test but also a GPU
+        # architecture detection test.
+        if stde == '':
+            self.detected_cc = stdo
+        else:
+            mlog.debug('cudaGetDeviceCount() returned ' + stde)
 
     def get_compiler_check_args(self):
         return super().get_compiler_check_args() + []
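
For a native build, the sequence above reduces to two NVCC invocations: compile the probe with warnings suppressed against the static CUDA runtime, then execute it through NVCC's `--run` so the environment is set up correctly. A rough sketch of the resulting command lines and of how the probe's output is interpreted (the plain `nvcc` executable name and the `-o <file>` output arguments are assumed for illustration):

```python
# Illustrative only: roughly what the new sanity_check() runs on a native
# build, assuming a plain 'nvcc' exelist and '-o <file>' output arguments.
exelist = ['nvcc']
compile_cmd = exelist + ['-w', '-cudart', 'static', 'sanitycheckcuda.cu',
                         '-o', 'sanitycheckcuda.exe']
run_cmd = exelist + ['--run', '"sanitycheckcuda.exe"']
print(' '.join(compile_cmd))  # nvcc -w -cudart static sanitycheckcuda.cu -o sanitycheckcuda.exe
print(' '.join(run_cmd))      # nvcc --run "sanitycheckcuda.exe"

# The probe prints one "major.minor" line per visible GPU on stdout, or a
# nonzero cudaError_t code on stderr; only a clean stderr populates detected_cc.
def interpret(stdout, stderr):
    return stdout if stderr == '' else ''

print(repr(interpret('6.1\n7.5\n', '')))  # '6.1\n7.5\n'
```
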
@@ -92,56 +154,6 @@ __global__ void kernel (void) {
         int main () {{ return 0; }}'''
         return self.compiles(t.format(**fargs), env, extra_args, dependencies)
 
-    def sanity_check_impl(self, work_dir, environment, sname, code):
-        mlog.debug('Sanity testing ' + self.get_display_language() + ' compiler:', ' '.join(self.exelist))
-        mlog.debug('Is cross compiler: %s.' % str(self.is_cross))
-
-        extra_flags = []
-        source_name = os.path.join(work_dir, sname)
-        binname = sname.rsplit('.', 1)[0]
-        if self.is_cross:
-            binname += '_cross'
-            if self.exe_wrapper is None:
-                # Linking cross built apps is painful. You can't really
-                # tell if you should use -nostdlib or not and for example
-                # on OSX the compiler binary is the same but you need
-                # a ton of compiler flags to differentiate between
-                # arm and x86_64. So just compile.
-                extra_flags += self.get_cross_extra_flags(environment, link=False)
-                extra_flags += self.get_compile_only_args()
-            else:
-                extra_flags += self.get_cross_extra_flags(environment, link=True)
-        # Is a valid executable output for all toolchains and platforms
-        binname += '.exe'
-        # Write binary check source
-        binary_name = os.path.join(work_dir, binname)
-        with open(source_name, 'w') as ofile:
-            ofile.write(code)
-        # Compile sanity check
-        cmdlist = self.exelist + extra_flags + [source_name] + self.get_output_args(binary_name)
-        pc, stdo, stde = Popen_safe(cmdlist, cwd=work_dir)
-        mlog.debug('Sanity check compiler command line:', ' '.join(cmdlist))
-        mlog.debug('Sanity check compile stdout:')
-        mlog.debug(stdo)
-        mlog.debug('-----\nSanity check compile stderr:')
-        mlog.debug(stde)
-        mlog.debug('-----')
-        if pc.returncode != 0:
-            raise EnvironmentException('Compiler {0} can not compile programs.'.format(self.name_string()))
-        # Run sanity check
-        if self.is_cross:
-            if self.exe_wrapper is None:
-                # Can't check if the binaries run so we have to assume they do
-                return
-            cmdlist = self.exe_wrapper + [binary_name]
-        else:
-            cmdlist = [binary_name]
-        mlog.debug('Running test binary command: ' + ' '.join(cmdlist))
-        pe = subprocess.Popen(cmdlist)
-        pe.wait()
-        if pe.returncode != 0:
-            raise EnvironmentException('Executables created by {0} compiler {1} are not runnable.'.format(self.language, self.name_string()))
-
     @staticmethod
     def _cook_link_args(args):
         """
@@ -176,7 +188,7 @@ __global__ void kernel (void) {
         return cuda_debug_args[is_debug]
 
     def get_werror_args(self):
-        return ['-Werror']
+        return ['-Werror=cross-execution-space-call,deprecated-declarations,reorder']
 
     def get_linker_exelist(self):
         return self.exelist[:]

@@ -77,10 +77,18 @@ class CudaModule(ExtensionModule):
     @staticmethod
     def _break_arch_string(s):
-        s = re.sub('[ \t,;]+', ';', s)
+        s = re.sub('[ \t\r\n,;]+', ';', s)
         s = s.strip(';').split(';')
         return s
 
+    @staticmethod
+    def _detected_cc_from_compiler(c):
+        if isinstance(c, CompilerHolder):
+            c = c.compiler
+        if isinstance(c, CudaCompiler):
+            return c.detected_cc
+        return ''
+
     @staticmethod
     def _version_from_compiler(c):
         if isinstance(c, CompilerHolder):
@@ -97,7 +105,8 @@ class CudaModule(ExtensionModule):
         if len(args) < 1:
             raise argerror
         else:
-            cuda_version = self._version_from_compiler(args[0])
+            compiler = args[0]
+            cuda_version = self._version_from_compiler(compiler)
             if cuda_version == 'unknown':
                 raise argerror
@@ -108,7 +117,8 @@ class CudaModule(ExtensionModule):
             raise InvalidArguments('''The special architectures 'All', 'Common' and 'Auto' must appear alone, as a positional argument!''')
         arch_list = arch_list[0] if len(arch_list) == 1 else arch_list
 
-        detected = flatten([kwargs.get('detected', [])])
+        detected = kwargs.get('detected', self._detected_cc_from_compiler(compiler))
+        detected = flatten([detected])
         detected = [self._break_arch_string(a) for a in detected]
         detected = flatten(detected)
         if not set(detected).isdisjoint({'All', 'Common', 'Auto'}):
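
Because the probe's raw stdout (e.g. `'6.1\n7.5\n'`) is now a legal default for `detected:`, `_break_arch_string()` also has to split on CR/LF, which is what the widened character class above provides. A standalone illustration of the normalization pipeline (re-implemented here for the example rather than imported from the module):

```python
import re
from functools import reduce

def break_arch_string(s):
    # Same normalization as CudaModule._break_arch_string() after this PR:
    # runs of spaces, tabs, CR/LF, commas or semicolons all separate entries.
    s = re.sub('[ \t\r\n,;]+', ';', s)
    return s.strip(';').split(';')

detected = '6.1\n7.5\n'                              # raw detected_cc from the sanity check
detected = [break_arch_string(a) for a in [detected]]
detected = reduce(lambda a, b: a + b, detected, [])  # stand-in for Meson's flatten()
print(detected)                                      # ['6.1', '7.5']
```
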
@@ -148,7 +158,7 @@ class CudaModule(ExtensionModule):
             cuda_limit_gpu_architecture   = '7.0'                                  # noqa: E221
 
         if version_compare(cuda_version, '>=9.0'):
-            cuda_known_gpu_architectures  += ['Volta', 'Volta+Tegra']              # noqa: E221
+            cuda_known_gpu_architectures  += ['Volta', 'Xavier']                   # noqa: E221
             cuda_common_gpu_architectures += ['7.0', '7.0+PTX']                    # noqa: E221
             cuda_all_gpu_architectures    += ['7.0', '7.0+PTX', '7.2', '7.2+PTX']  # noqa: E221
@@ -215,7 +225,7 @@ class CudaModule(ExtensionModule):
             'Pascal':       (['6.0', '6.1'], ['6.1']),
             'Pascal+Tegra': (['6.2'],        []),
             'Volta':        (['7.0'],        ['7.0']),
-            'Volta+Tegra':  (['7.2'],        []),
+            'Xavier':       (['7.2'],        []),
             'Turing':       (['7.5'],        ['7.5']),
         }.get(arch_name, (None, None))
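
Each entry in this table pairs an architecture name with (binary CCs, PTX CCs): the first list is compiled to real device code, the second additionally embeds forward-compatible PTX. Renaming `'Volta+Tegra'` to `'Xavier'` only changes the user-facing name; CC 7.2 still gets no implicit PTX entry. A sketch of the intent in NVCC's `-gencode` notation (an approximation, not the module's exact output formatting):

```python
def gencode_flags(bin_ccs, ptx_ccs):
    # Sketch: expand one (binary CCs, PTX CCs) pair from the table above into
    # NVCC -gencode flags; binary CCs become cubins, PTX CCs embed PTX.
    flags = []
    for cc in bin_ccs:
        num = cc.replace('.', '')
        flags.append('-gencode arch=compute_{0},code=sm_{0}'.format(num))
    for cc in ptx_ccs:
        num = cc.replace('.', '')
        flags.append('-gencode arch=compute_{0},code=compute_{0}'.format(num))
    return flags

print(gencode_flags(['7.0'], ['7.0']))  # Volta: real 7.0 cubin plus 7.0 PTX
print(gencode_flags(['7.2'], []))       # Xavier: real 7.2 cubin only
```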
