diff --git a/docs/markdown/Module-reference.md b/docs/markdown/Module-reference.md index 866c141c5..80e3b8f3a 100644 --- a/docs/markdown/Module-reference.md +++ b/docs/markdown/Module-reference.md @@ -1,4 +1,6 @@ -Meson has a selection of modules to make common requirements easy to use. Modules can be thought of like the standard library of a programming language. Currently Meson provides the following modules. +Meson has a selection of modules to make common requirements easy to use. +Modules can be thought of like the standard library of a programming language. +Currently Meson provides the following modules. * [Gnome](Gnome-module.md) * [i18n](i18n-module.md) @@ -8,3 +10,11 @@ Meson has a selection of modules to make common requirements easy to use. Module * [Python3](Python-3-module.md) * [RPM](RPM-module.md) * [Windows](Windows-module.md) + +In addition there are unstable modules. These are meant for testing new +functionality but note that they do *not* provide a stable API. It can +change in arbitrary ways between releases. The modules might also be removed +without warning in future releases. + + * [SIMD](Simd-module.md) + \ No newline at end of file diff --git a/docs/markdown/Release-notes-for-0.42.0.md b/docs/markdown/Release-notes-for-0.42.0.md index a19db4981..3374d3b01 100644 --- a/docs/markdown/Release-notes-for-0.42.0.md +++ b/docs/markdown/Release-notes-for-0.42.0.md @@ -58,3 +58,10 @@ Rust's [linkage reference][rust-linkage]. Both the address- and undefined behavior sanitizers can now be used simultaneously by passing `-Db_sanitize=address,undefined` to Meson. + +## Unstable SIMD module + +A new experimental module to compile code with many different SIMD +instruction sets and selecting the best one at runtime. This module +is unstable, meaning its API is subject to change in later releases. +It might also be removed altogether. diff --git a/docs/markdown/Simd-module.md b/docs/markdown/Simd-module.md new file mode 100644 index 000000000..0fd1dda70 --- /dev/null +++ b/docs/markdown/Simd-module.md @@ -0,0 +1,70 @@ +# Unstable SIMD module + +This module provides helper functionality to build code with SIMD instructions. +Available since 0.42.0. + +**Note**: this module is unstable. It is only provided as a technology preview. +Its API may change in arbitrary ways between releases or it might be removed +from Meson altogether. + +## Usage + +This module is designed for the use case where you have an algorithm with one +or more SIMD implementation and you choose which one to use at runtime. + +The module provides one method, `check`, which is used like this: + + rval = simd.check('mysimds', + mmx : 'simd_mmx.c', + sse : 'simd_sse.c', + sse2 : 'simd_sse2.c', + sse3 : 'simd_sse3.c', + ssse3 : 'simd_ssse3.c', + sse41 : 'simd_sse41.c', + sse42 : 'simd_sse42.c', + avx : 'simd_avx.c', + avx2 : 'simd_avx2.c', + neon : 'simd_neon.c', + compiler : cc) + +Here the individual files contain the accelerated versions of the functions +in question. The `compiler` keyword argument takes the compiler you are +going to use to compile them. The function returns an array with two values. +The first value is a bunch of libraries that contain the compiled code. Any +SIMD code that the compiler can't compile (for example, Neon instructions on +an x86 machine) are ignored. You should pass this value to the desired target +using `link_with`. The second value is a `configuration_data` object that +contains true for all the values that were supported. For example if the +compiler did support sse2 instructions, then the object would have `HAVE_SSE2` +set to 1. + +Generating code to detect the proper instruction set at runtime is +straightforward. First you create a header with the configuration object and +then a chooser function that looks like this: + + void (*fptr)(type_of_function_here) = NULL; + + #if HAVE_NEON + if(fptr == NULL && neon_available()) { + fptr = neon_accelerated_function; + } + #endif + #if HAVE_AVX2 + if(fptr == NULL && avx2_available()) { + fptr = avx_accelerated_function; + } + #endif + + ... + + if(fptr == NULL) { + fptr = default_function; + } + +Each source file provides two functions, the `xxx_available` function to query +whether the CPU currently in use supports the instruction set and +`xxx_accelerated_function` that is the corresponding accelerated +implementation. + +At the end of this function the function pointer points to the fastest +available implementation and can be invoked to do the computation. diff --git a/docs/sitemap.txt b/docs/sitemap.txt index 9831b93a3..c4df54bc9 100644 --- a/docs/sitemap.txt +++ b/docs/sitemap.txt @@ -27,14 +27,15 @@ index.md Build-options.md Subprojects.md Modules.md + Gnome-module.md i18n-module.md Pkgconfig-module.md Python-3-module.md Qt4-module.md Qt5-module.md RPM-module.md + Simd-module.md Windows-module.md - Gnome-module.md Java.md Vala.md IDE-integration.md diff --git a/mesonbuild/compilers/c.py b/mesonbuild/compilers/c.py index cf9d1ee8f..593366acd 100644 --- a/mesonbuild/compilers/c.py +++ b/mesonbuild/compilers/c.py @@ -25,6 +25,8 @@ from .compilers import ( msvc_buildtype_args, msvc_buildtype_linker_args, msvc_winlibs, + vs32_instruction_set_args, + vs64_instruction_set_args, ClangCompiler, Compiler, CompilerArgs, @@ -810,7 +812,7 @@ class VisualStudioCCompiler(CCompiler): std_warn_args = ['/W3'] std_opt_args = ['/O2'] - def __init__(self, exelist, version, is_cross, exe_wrap): + def __init__(self, exelist, version, is_cross, exe_wrap, is_64): CCompiler.__init__(self, exelist, version, is_cross, exe_wrap) self.id = 'msvc' # /showIncludes is needed for build dependency tracking in Ninja @@ -820,6 +822,7 @@ class VisualStudioCCompiler(CCompiler): '2': ['/W3'], '3': ['/W4']} self.base_options = ['b_pch'] # FIXME add lto, pgo and the like + self.is_64 = is_64 # Override CCompiler.get_always_args def get_always_args(self): @@ -1005,3 +1008,15 @@ class VisualStudioCCompiler(CCompiler): if not isinstance(args, list): args = [args] return ['/WHOLEARCHIVE:' + x for x in args] + + def get_instruction_set_args(self, instruction_set): + if self.is_64: + return vs64_instruction_set_args.get(instruction_set, None) + if self.version.split('.')[0] == '16' and instruction_set == 'avx': + # VS documentation says that this exists and should work, but + # it does not. The headers do not contain AVX intrinsics + # and the can not be called. + return None + return vs32_instruction_set_args.get(instruction_set, None) + + diff --git a/mesonbuild/compilers/compilers.py b/mesonbuild/compilers/compilers.py index a8ec5e362..0be390847 100644 --- a/mesonbuild/compilers/compilers.py +++ b/mesonbuild/compilers/compilers.py @@ -228,6 +228,43 @@ base_options = {'b_pch': coredata.UserBooleanOption('b_pch', 'Use precompiled he True), } +gnulike_instruction_set_args = {'mmx': ['-mmmx'], + 'sse': ['-msse'], + 'sse2': ['-msse2'], + 'sse3': ['-msse3'], + 'ssse3': ['-mssse3'], + 'sse41': ['-msse4.1'], + 'sse42': ['-msse4.2'], + 'avx': ['-mavx'], + 'avx2': ['-mavx2'], + 'neon': ['-mfpu=neon'], + } + +vs32_instruction_set_args = {'mmx': ['/arch:SSE'], # There does not seem to be a flag just for MMX + 'sse': ['/arch:SSE'], + 'sse2': ['/arch:SSE2'], + 'sse3': ['/arch:AVX'], # VS leaped from SSE2 directly to AVX. + 'sse41': ['/arch:AVX'], + 'sse42': ['/arch:AVX'], + 'avx': ['/arch:AVX'], + 'avx2': ['/arch:AVX2'], + 'neon': None, +} + +# The 64 bit compiler defaults to /arch:avx. +vs64_instruction_set_args = {'mmx': ['/arch:AVX'], + 'sse': ['/arch:AVX'], + 'sse2': ['/arch:AVX'], + 'sse3': ['/arch:AVX'], + 'ssse3': ['/arch:AVX'], + 'sse41': ['/arch:AVX'], + 'sse42': ['/arch:AVX'], + 'avx': ['/arch:AVX'], + 'avx2': ['/arch:AVX2'], + 'neon': None, + } + + def sanitizer_compile_args(value): if value == 'none': return [] @@ -755,6 +792,12 @@ class Compiler: return [] raise EnvironmentException('Language %s does not support linking whole archives.' % self.get_display_language()) + # Compiler arguments needed to enable the given instruction set. + # May be [] meaning nothing needed or None meaning the given set + # is not supported. + def get_instruction_set_args(self, instruction_set): + return None + def build_unix_rpath_args(self, build_dir, from_dir, rpath_paths, install_rpath): if not rpath_paths and not install_rpath: return [] @@ -933,6 +976,10 @@ class GnuCompiler: return ['-mwindows'] return [] + def get_instruction_set_args(self, instruction_set): + return gnulike_instruction_set_args.get(instruction_set, None) + + class ClangCompiler: def __init__(self, clang_type): self.id = 'clang' @@ -983,7 +1030,7 @@ class ClangCompiler: def has_multi_arguments(self, args, env): return super().has_multi_arguments( - ['-Werror=unknown-warning-option'] + args, + ['-Werror=unknown-warning-option', '-Werror=unused-command-line-argument'] + args, env) def has_function(self, funcname, prefix, env, extra_args=None, dependencies=None): @@ -1010,6 +1057,9 @@ class ClangCompiler: return result return ['-Wl,--whole-archive'] + args + ['-Wl,--no-whole-archive'] + def get_instruction_set_args(self, instruction_set): + return gnulike_instruction_set_args.get(instruction_set, None) + # Tested on linux for ICC 14.0.3, 15.0.6, 16.0.4, 17.0.1 class IntelCompiler: diff --git a/mesonbuild/compilers/cpp.py b/mesonbuild/compilers/cpp.py index 01525b06b..a8fc8a353 100644 --- a/mesonbuild/compilers/cpp.py +++ b/mesonbuild/compilers/cpp.py @@ -173,10 +173,10 @@ class IntelCPPCompiler(IntelCompiler, CPPCompiler): class VisualStudioCPPCompiler(VisualStudioCCompiler, CPPCompiler): - def __init__(self, exelist, version, is_cross, exe_wrap): + def __init__(self, exelist, version, is_cross, exe_wrap, is_64): self.language = 'cpp' CPPCompiler.__init__(self, exelist, version, is_cross, exe_wrap) - VisualStudioCCompiler.__init__(self, exelist, version, is_cross, exe_wrap) + VisualStudioCCompiler.__init__(self, exelist, version, is_cross, exe_wrap, is_64) self.base_options = ['b_pch'] # FIXME add lto, pgo and the like def get_options(self): diff --git a/mesonbuild/environment.py b/mesonbuild/environment.py index a6da3f917..dd9f56ed5 100644 --- a/mesonbuild/environment.py +++ b/mesonbuild/environment.py @@ -534,8 +534,12 @@ class Environment: # Visual Studio prints version number to stderr but # everything else to stdout. Why? Lord only knows. version = search_version(err) + if not err or not err.split('\n')[0]: + m = 'Failed to detect MSVC compiler arch: stderr was\n{!r}' + raise EnvironmentException(m.format(err)) + is_64 = err.split('\n')[0].endswith(' x64') cls = VisualStudioCCompiler if lang == 'c' else VisualStudioCPPCompiler - return cls(compiler, version, is_cross, exe_wrap) + return cls(compiler, version, is_cross, exe_wrap, is_64) if '(ICC)' in out: # TODO: add microsoft add check OSX inteltype = ICC_STANDARD diff --git a/mesonbuild/interpreter.py b/mesonbuild/interpreter.py index 63cdf9ea0..359dd17f5 100644 --- a/mesonbuild/interpreter.py +++ b/mesonbuild/interpreter.py @@ -161,6 +161,7 @@ class ConfigurationDataHolder(MutableInterpreterObject): 'set_quoted': self.set_quoted_method, 'has': self.has_method, 'get': self.get_method, + 'merge_from': self.merge_from_method, }) def is_used(self): @@ -221,6 +222,16 @@ class ConfigurationDataHolder(MutableInterpreterObject): def keys(self): return self.held_object.values.keys() + def merge_from_method(self, args, kwargs): + if len(args) != 1: + raise InterpreterException('Merge_from takes one positional argument.') + from_object = args[0] + if not isinstance(from_object, ConfigurationDataHolder): + raise InterpreterException('Merge_from argument must be a configuration data object.') + from_object = from_object.held_object + for k, v in from_object.values.items(): + self.held_object.values[k] = v + # Interpreter objects can not be pickled so we must have # these wrappers. @@ -1479,6 +1490,10 @@ class Interpreter(InterpreterBase): if len(args) != 1: raise InvalidCode('Import takes one argument.') modname = args[0] + if modname.startswith('unstable-'): + plainname = modname.split('-', 1)[1] + mlog.warning('Module %s has no backwards or forwards compatibility and might not exist in future releases.' % modname) + modname = 'unstable_' + plainname if modname not in self.environment.coredata.modules: try: module = importlib.import_module('mesonbuild.modules.' + modname) diff --git a/mesonbuild/modules/unstable_simd.py b/mesonbuild/modules/unstable_simd.py new file mode 100644 index 000000000..4aebc02f6 --- /dev/null +++ b/mesonbuild/modules/unstable_simd.py @@ -0,0 +1,72 @@ +# Copyright 2017 The Meson development team + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .. import mesonlib, compilers, mlog + +from . import ExtensionModule + +class SimdModule(ExtensionModule): + + def __init__(self): + super().__init__() + self.snippets.add('check') + # FIXME add Altivec and AVX512. + self.isets = ('mmx', + 'sse', + 'sse2', + 'sse3', + 'ssse3', + 'sse41', + 'sse42', + 'avx', + 'avx2', + 'neon', + ) + + def check(self, interpreter, state, args, kwargs): + result = [] + if len(args) != 1: + raise mesonlib.MesonException('Check requires one argument, a name prefix for checks.') + prefix = args[0] + if not isinstance(prefix, str): + raise mesonlib.MesonException('Argument must be a string.') + if 'compiler' not in kwargs: + raise mesonlib.MesonException('Must specify compiler keyword') + compiler = kwargs['compiler'].compiler + if not isinstance(compiler, compilers.compilers.Compiler): + raise mesonlib.MesonException('Compiler argument must be a compiler object.') + cdata = interpreter.func_configuration_data(None, [], {}) + conf = cdata.held_object + for iset in self.isets: + if iset not in kwargs: + continue + iset_fname = kwargs[iset] # Migth also be an array or Files. static_library will validate. + args = compiler.get_instruction_set_args(iset) + if args is None: + mlog.log('Compiler supports %s:' % iset, mlog.red('NO')) + continue + if len(args) > 0: + if not compiler.has_multi_arguments(args, state.environment): + mlog.log('Compiler supports %s:' % iset, mlog.red('NO')) + continue + mlog.log('Compiler supports %s:' % iset, mlog.green('YES')) + conf.values['HAVE_' + iset.upper()] = ('1', 'Compiler supports %s.' % iset) + libname = prefix + '_' + iset + lib_kwargs = {'sources': iset_fname, + compiler.get_language() + '_args': args} + result.append(interpreter.func_static_lib(None, [libname], lib_kwargs)) + return [result, cdata] + +def initialize(): + return SimdModule() diff --git a/run_unittests.py b/run_unittests.py index 6a5030206..cc034c32d 100755 --- a/run_unittests.py +++ b/run_unittests.py @@ -948,6 +948,7 @@ class AllPlatformTests(BasePlatformTests): # Detect with evar and do sanity checks on that if evar in os.environ: ecc = getattr(env, 'detect_{}_compiler'.format(lang))(False) + self.assertTrue(ecc.version) elinker = env.detect_static_linker(ecc) # Pop it so we don't use it for the next detection evalue = os.environ.pop(evar) @@ -971,6 +972,7 @@ class AllPlatformTests(BasePlatformTests): self.assertEqual(ecc.get_exelist(), shlex.split(evalue)) # Do auto-detection of compiler based on platform, PATH, etc. cc = getattr(env, 'detect_{}_compiler'.format(lang))(False) + self.assertTrue(cc.version) linker = env.detect_static_linker(cc) # Check compiler type if isinstance(cc, gnu): @@ -1004,11 +1006,18 @@ class AllPlatformTests(BasePlatformTests): self.assertTrue(is_windows()) self.assertIsInstance(linker, lib) self.assertEqual(cc.id, 'msvc') + self.assertTrue(hasattr(cc, 'is_64')) + # If we're in the appveyor CI, we know what the compiler will be + if 'arch' in os.environ: + if os.environ['arch'] == 'x64': + self.assertTrue(cc.is_64) + else: + self.assertFalse(cc.is_64) # Set evar ourselves to a wrapper script that just calls the same # exelist + some argument. This is meant to test that setting # something like `ccache gcc -pipe` or `distcc ccache gcc` works. wrapper = os.path.join(testdir, 'compiler wrapper.py') - wrappercc = [sys.executable, wrapper] + cc.get_exelist() + cc.get_always_args() + wrappercc = [sys.executable, wrapper] + cc.get_exelist() + ['-DSOME_ARG'] wrappercc_s = '' for w in wrappercc: wrappercc_s += shlex.quote(w) + ' ' @@ -1027,6 +1036,10 @@ class AllPlatformTests(BasePlatformTests): # Ensure that the exelist is correct self.assertEqual(wcc.get_exelist(), wrappercc) self.assertEqual(wlinker.get_exelist(), wrapperlinker) + # Ensure that the version detection worked correctly + self.assertEqual(cc.version, wcc.version) + if hasattr(cc, 'is_64'): + self.assertEqual(cc.is_64, wcc.is_64) def test_always_prefer_c_compiler_for_asm(self): testdir = os.path.join(self.common_test_dir, '141 c cpp and asm') diff --git a/test cases/common/155 simd/fallback.c b/test cases/common/155 simd/fallback.c new file mode 100644 index 000000000..ab435f433 --- /dev/null +++ b/test cases/common/155 simd/fallback.c @@ -0,0 +1,8 @@ +#include + +void increment_fallback(float arr[4]) { + int i; + for(i=0; i<4; i++) { + arr[i]++; + } +} diff --git a/test cases/common/155 simd/meson.build b/test cases/common/155 simd/meson.build new file mode 100644 index 000000000..9da165185 --- /dev/null +++ b/test cases/common/155 simd/meson.build @@ -0,0 +1,43 @@ +project('simd', 'c') + +simd = import('unstable-simd') + +cc = meson.get_compiler('c') + +cdata = configuration_data() + +if not meson.is_cross_build() and host_machine.cpu_family() == 'arm' and cc.get_id() == 'clang' + message('Adding -march=armv7 because assuming that this build happens on Raspbian.') + message('Its Clang seems to be misconfigured and does not support NEON by default.') + add_project_arguments('-march=armv7', language : 'c') +endif + +if cc.get_id() == 'msvc' and cc.version().version_compare('<17') + error('MESON_SKIP_TEST VS2010 produces broken binaries on x86.') +endif + +# FIXME add [a, b] = function() +rval = simd.check('mysimds', + mmx : 'simd_mmx.c', + sse : 'simd_sse.c', + sse2 : 'simd_sse2.c', + sse3 : 'simd_sse3.c', + ssse3 : 'simd_ssse3.c', + sse41 : 'simd_sse41.c', + sse42 : 'simd_sse42.c', + avx : 'simd_avx.c', + avx2 : 'simd_avx2.c', + neon : 'simd_neon.c', + compiler : cc) + +simdlibs = rval[0] +cdata.merge_from(rval[1]) + +configure_file(output : 'simdconfig.h', + configuration : cdata) + +p = executable('simdtest', 'simdchecker.c', 'fallback.c', + link_with : simdlibs) + +test('simdtest', p) + diff --git a/test cases/common/155 simd/simd_avx.c b/test cases/common/155 simd/simd_avx.c new file mode 100644 index 000000000..989620ba3 --- /dev/null +++ b/test cases/common/155 simd/simd_avx.c @@ -0,0 +1,43 @@ +#include +#include +#include + +#ifdef _MSC_VER +#include +int avx_available() { + return 1; +} +#else +#include +#include + +#ifdef __APPLE__ +/* + * Apple ships a broken __builtin_cpu_supports and + * some machines in the CI farm seem to be too + * old to have AVX so just always return 0 here. + */ +int avx_available() { return 0; } +#else + +int avx_available() { + return __builtin_cpu_supports("avx"); +} +#endif +#endif + +void increment_avx(float arr[4]) { + double darr[4]; + darr[0] = arr[0]; + darr[1] = arr[1]; + darr[2] = arr[2]; + darr[3] = arr[3]; + __m256d val = _mm256_loadu_pd(darr); + __m256d one = _mm256_set1_pd(1.0); + __m256d result = _mm256_add_pd(val, one); + _mm256_storeu_pd(darr, result); + arr[0] = (float)darr[0]; + arr[1] = (float)darr[1]; + arr[2] = (float)darr[2]; + arr[3] = (float)darr[3]; +} diff --git a/test cases/common/155 simd/simd_avx2.c b/test cases/common/155 simd/simd_avx2.c new file mode 100644 index 000000000..15297eb2b --- /dev/null +++ b/test cases/common/155 simd/simd_avx2.c @@ -0,0 +1,42 @@ +#include +#include +#include + +/* + * FIXME add proper runtime detection for VS. + */ + +#ifdef _MSC_VER +#include +int avx2_available() { + return 0; +} +#else +#include +#include + +#if defined(__APPLE__) +int avx2_available() { return 0; } +#else +int avx2_available() { + return __builtin_cpu_supports("avx2"); +} +#endif +#endif + +void increment_avx2(float arr[4]) { + double darr[4]; + darr[0] = arr[0]; + darr[1] = arr[1]; + darr[2] = arr[2]; + darr[3] = arr[3]; + __m256d val = _mm256_loadu_pd(darr); + __m256d one = _mm256_set1_pd(1.0); + __m256d result = _mm256_add_pd(val, one); + _mm256_storeu_pd(darr, result); + one = _mm256_permute4x64_pd(one, 66); /* A no-op, just here to use AVX2. */ + arr[0] = (float)darr[0]; + arr[1] = (float)darr[1]; + arr[2] = (float)darr[2]; + arr[3] = (float)darr[3]; +} diff --git a/test cases/common/155 simd/simd_mmx.c b/test cases/common/155 simd/simd_mmx.c new file mode 100644 index 000000000..731abd14a --- /dev/null +++ b/test cases/common/155 simd/simd_mmx.c @@ -0,0 +1,63 @@ +#include +#include + +#include + +#ifdef _MSC_VER +#include +int mmx_available() { + return 1; +} +/* Contrary to MSDN documentation, MMX intrinsics + * just plain don't work. + */ +void increment_mmx(float arr[4]) { + arr[0]++; + arr[1]++; + arr[2]++; + arr[3]++; +} +#elif defined(__MINGW32__) +int mmx_available() { + return 1; +} +/* MinGW does not seem to ship with MMX or it is broken. + */ +void increment_mmx(float arr[4]) { + arr[0]++; + arr[1]++; + arr[2]++; + arr[3]++; +} +#else +#include +#include + +#if defined(__APPLE__) +int mmx_available() { return 1; } +#else +int mmx_available() { + return __builtin_cpu_supports("mmx"); +} +#endif +void increment_mmx(float arr[4]) { + /* Super ugly but we know that values in arr are always small + * enough to fit in int16; + */ + int i; + __m64 packed = _mm_set_pi16(arr[3], arr[2], arr[1], arr[0]); + __m64 incr = _mm_set1_pi16(1); + __m64 result = _mm_add_pi16(packed, incr); + /* Should be + * int64_t unpacker = _m_to_int64(result); + * but it does not exist on 32 bit platforms for some reason. + */ + int64_t unpacker = (int64_t)(result); + _mm_empty(); + for(i=0; i<4; i++) { + arr[i] = (float)(unpacker & ((1<<16)-1)); + unpacker >>= 16; + } +} + +#endif diff --git a/test cases/common/155 simd/simd_neon.c b/test cases/common/155 simd/simd_neon.c new file mode 100644 index 000000000..20820992b --- /dev/null +++ b/test cases/common/155 simd/simd_neon.c @@ -0,0 +1,20 @@ +#include +#include + +#include +#include + +int neon_available() { + return 1; /* Incorrect, but I don't know how to check this properly. */ +} + +void increment_neon(float arr[4]) { + float32x2_t a1, a2, one; + a1 = vld1_f32(arr); + a2 = vld1_f32(&arr[2]); + one = vdup_n_f32(1.0); + a1 = vadd_f32(a1, one); + a2 = vadd_f32(a2, one); + vst1_f32(arr, a1); + vst1_f32(&arr[2], a2); +} diff --git a/test cases/common/155 simd/simd_sse.c b/test cases/common/155 simd/simd_sse.c new file mode 100644 index 000000000..3c9fe622e --- /dev/null +++ b/test cases/common/155 simd/simd_sse.c @@ -0,0 +1,29 @@ +#include +#include + +#ifdef _MSC_VER +#include +int sse_available() { + return 1; +} +#else + +#include +#include +#include + +#if defined(__APPLE__) +int sse_available() { return 1; } +#else +int sse_available() { + return __builtin_cpu_supports("sse"); +} +#endif +#endif + +void increment_sse(float arr[4]) { + __m128 val = _mm_load_ps(arr); + __m128 one = _mm_set_ps1(1.0); + __m128 result = _mm_add_ps(val, one); + _mm_storeu_ps(arr, result); +} diff --git a/test cases/common/155 simd/simd_sse2.c b/test cases/common/155 simd/simd_sse2.c new file mode 100644 index 000000000..02745337b --- /dev/null +++ b/test cases/common/155 simd/simd_sse2.c @@ -0,0 +1,37 @@ +#include +#include +#include + +#ifdef _MSC_VER +int sse2_available() { + return 1; +} + +#else +#include +#include + +#if defined(__APPLE__) +int sse2_available() { return 1; } +#else +int sse2_available() { + return __builtin_cpu_supports("sse2"); +} +#endif +#endif + +void increment_sse2(float arr[4]) { + double darr[4]; + __m128d val1 = _mm_set_pd(arr[0], arr[1]); + __m128d val2 = _mm_set_pd(arr[2], arr[3]); + __m128d one = _mm_set_pd(1.0, 1.0); + __m128d result = _mm_add_pd(val1, one); + _mm_store_pd(darr, result); + result = _mm_add_pd(val2, one); + _mm_store_pd(&darr[2], result); + arr[0] = (float)darr[1]; + arr[1] = (float)darr[0]; + arr[2] = (float)darr[3]; + arr[3] = (float)darr[2]; +} + diff --git a/test cases/common/155 simd/simd_sse3.c b/test cases/common/155 simd/simd_sse3.c new file mode 100644 index 000000000..e97d10285 --- /dev/null +++ b/test cases/common/155 simd/simd_sse3.c @@ -0,0 +1,38 @@ +#include +#include + +#ifdef _MSC_VER +#include +int sse3_available() { + return 1; +} +#else + +#include +#include +#include + +#if defined(__APPLE__) +int sse3_available() { return 1; } +#else +int sse3_available() { + return __builtin_cpu_supports("sse3"); +} +#endif +#endif + +void increment_sse3(float arr[4]) { + double darr[4]; + __m128d val1 = _mm_set_pd(arr[0], arr[1]); + __m128d val2 = _mm_set_pd(arr[2], arr[3]); + __m128d one = _mm_set_pd(1.0, 1.0); + __m128d result = _mm_add_pd(val1, one); + _mm_store_pd(darr, result); + result = _mm_add_pd(val2, one); + _mm_store_pd(&darr[2], result); + result = _mm_hadd_pd(val1, val2); /* This does nothing. Only here so we use an SSE3 instruction. */ + arr[0] = (float)darr[1]; + arr[1] = (float)darr[0]; + arr[2] = (float)darr[3]; + arr[3] = (float)darr[2]; +} diff --git a/test cases/common/155 simd/simd_sse41.c b/test cases/common/155 simd/simd_sse41.c new file mode 100644 index 000000000..0308c7e49 --- /dev/null +++ b/test cases/common/155 simd/simd_sse41.c @@ -0,0 +1,40 @@ +#include +#include + +#include + +#ifdef _MSC_VER +#include + +int sse41_available() { + return 1; +} + +#else +#include +#include + +#if defined(__APPLE__) +int sse41_available() { return 1; } +#else +int sse41_available() { + return __builtin_cpu_supports("sse4.1"); +} +#endif +#endif + +void increment_sse41(float arr[4]) { + double darr[4]; + __m128d val1 = _mm_set_pd(arr[0], arr[1]); + __m128d val2 = _mm_set_pd(arr[2], arr[3]); + __m128d one = _mm_set_pd(1.0, 1.0); + __m128d result = _mm_add_pd(val1, one); + result = _mm_ceil_pd(result); /* A no-op, only here to use a SSE4.1 intrinsic. */ + _mm_store_pd(darr, result); + result = _mm_add_pd(val2, one); + _mm_store_pd(&darr[2], result); + arr[0] = (float)darr[1]; + arr[1] = (float)darr[0]; + arr[2] = (float)darr[3]; + arr[3] = (float)darr[2]; +} diff --git a/test cases/common/155 simd/simd_sse42.c b/test cases/common/155 simd/simd_sse42.c new file mode 100644 index 000000000..137ffc441 --- /dev/null +++ b/test cases/common/155 simd/simd_sse42.c @@ -0,0 +1,43 @@ +#include +#include +#include + +#ifdef _MSC_VER +#include + +int sse42_available() { + return 1; +} + +#else + +#include +#include + +#ifdef __APPLE__ +int sse42_available() { + return 1; +} +#else +int sse42_available() { + return __builtin_cpu_supports("sse4.2"); +} +#endif + +#endif + +void increment_sse42(float arr[4]) { + double darr[4]; + __m128d val1 = _mm_set_pd(arr[0], arr[1]); + __m128d val2 = _mm_set_pd(arr[2], arr[3]); + __m128d one = _mm_set_pd(1.0, 1.0); + __m128d result = _mm_add_pd(val1, one); + _mm_store_pd(darr, result); + result = _mm_add_pd(val2, one); + _mm_store_pd(&darr[2], result); + _mm_crc32_u32(42, 99); /* A no-op, only here to use an SSE4.2 instruction. */ + arr[0] = (float)darr[1]; + arr[1] = (float)darr[0]; + arr[2] = (float)darr[3]; + arr[3] = (float)darr[2]; +} diff --git a/test cases/common/155 simd/simd_ssse3.c b/test cases/common/155 simd/simd_ssse3.c new file mode 100644 index 000000000..ab4dff4f8 --- /dev/null +++ b/test cases/common/155 simd/simd_ssse3.c @@ -0,0 +1,48 @@ +#include +#include + +#include +#include + +#ifdef _MSC_VER +#include + +int ssse3_available() { + return 1; +} + +#else + +#include +#include + +int ssse3_available() { +#ifdef __APPLE__ + return 1; +#elif defined(__clang__) + /* https://github.com/numpy/numpy/issues/8130 */ + return __builtin_cpu_supports("sse4.1"); +#else + return __builtin_cpu_supports("ssse3"); +#endif +} + +#endif + +void increment_ssse3(float arr[4]) { + double darr[4]; + __m128d val1 = _mm_set_pd(arr[0], arr[1]); + __m128d val2 = _mm_set_pd(arr[2], arr[3]); + __m128d one = _mm_set_pd(1.0, 1.0); + __m128d result = _mm_add_pd(val1, one); + __m128i tmp1, tmp2; + tmp1 = tmp2 = _mm_set1_epi16(0); + _mm_store_pd(darr, result); + result = _mm_add_pd(val2, one); + _mm_store_pd(&darr[2], result); + tmp1 = _mm_hadd_epi32(tmp1, tmp2); /* This does nothing. Only here so we use an SSSE3 instruction. */ + arr[0] = (float)darr[1]; + arr[1] = (float)darr[0]; + arr[2] = (float)darr[3]; + arr[3] = (float)darr[2]; +} diff --git a/test cases/common/155 simd/simdchecker.c b/test cases/common/155 simd/simdchecker.c new file mode 100644 index 000000000..222fbf3e3 --- /dev/null +++ b/test cases/common/155 simd/simdchecker.c @@ -0,0 +1,93 @@ +#include +#include + +/* + * A function that checks at runtime which simd accelerations are + * available and calls the best one. Falls + * back to plain C implementation if SIMD is not available. + */ + +int main(int argc, char **argv) { + float four[4] = {2.0, 3.0, 4.0, 5.0}; + const float expected[4] = {3.0, 4.0, 5.0, 6.0}; + void (*fptr)(float[4]) = NULL; + const char *type; + int i; + +/* Add here. The first matched one is used so put "better" instruction + * sets at the top. + */ +#if HAVE_NEON + if(fptr == NULL && neon_available()) { + fptr = increment_neon; + type = "NEON"; + } +#endif +#if HAVE_AVX2 + if(fptr == NULL && avx2_available()) { + fptr = increment_avx2; + type = "AVX2"; + } +#endif +#if HAVE_AVX + if(fptr == NULL && avx_available()) { + fptr = increment_avx; + type = "AVX"; + } +#endif +#if HAVE_SSE42 + if(fptr == NULL && sse42_available()) { + fptr = increment_sse42; + type = "SSE42"; + } +#endif +#if HAVE_SSE41 + if(fptr == NULL && sse41_available()) { + fptr = increment_sse41; + type = "SSE41"; + } +#endif +#if HAVE_SSSE3 + if(fptr == NULL && ssse3_available()) { + fptr = increment_ssse3; + type = "SSSE3"; + } +#endif +#if HAVE_SSE3 + if(fptr == NULL && sse3_available()) { + fptr = increment_sse3; + type = "SSE3"; + } +#endif +#if HAVE_SSE2 + if(fptr == NULL && sse2_available()) { + fptr = increment_sse2; + type = "SSE2"; + } +#endif +#if HAVE_SSE + if(fptr == NULL && sse_available()) { + fptr = increment_sse; + type = "SSE"; + } +#endif +#if HAVE_MMX + if(fptr == NULL && mmx_available()) { + fptr = increment_mmx; + type = "MMX"; + } +#endif + if(fptr == NULL) { + fptr = increment_fallback; + type = "fallback"; + } + printf("Using %s.\n", type); + fptr(four); + for(i=0; i<4; i++) { + if(four[i] != expected[i]) { + printf("Increment function failed, got %f expected %f.\n", four[i], expected[i]); + return 1; + } + } + return 0; +} diff --git a/test cases/common/155 simd/simdfuncs.h b/test cases/common/155 simd/simdfuncs.h new file mode 100644 index 000000000..dfb056068 --- /dev/null +++ b/test cases/common/155 simd/simdfuncs.h @@ -0,0 +1,67 @@ +#pragma once + +#include + +/* Yes, I do know that arr[4] decays into a pointer + * as a function argument. Don't do this in real code + * but for this test it is ok. + */ + +void increment_fallback(float arr[4]); + +#if HAVE_MMX +int mmx_available(); +void increment_mmx(float arr[4]); +#endif + +#if HAVE_SSE +int sse_available(); +void increment_sse(float arr[4]); +#endif + +#if HAVE_SSE2 +int sse2_available(); +void increment_sse2(float arr[4]); +#endif + +#if HAVE_SSE3 +int sse3_available(); +void increment_sse3(float arr[4]); +#endif + +#if HAVE_SSSE3 +int ssse3_available(); +void increment_ssse3(float arr[4]); +#endif + +#if HAVE_SSE41 +int sse41_available(); +void increment_sse41(float arr[4]); +#endif + +#if HAVE_SSE42 +int sse42_available(); +void increment_sse42(float arr[4]); +#endif + +#if HAVE_AVX +int avx_available(); +void increment_avx(float arr[4]); +#endif + +#if HAVE_AVX2 +int avx2_available(); +void increment_avx2(float arr[4]); +#endif + +#if HAVE_NEON +int neon_available(); +void increment_neon(float arr[4]); +#endif + +#if HAVE_ALTIVEC +int altivec_available(); +void increment_altivec(float arr[4]); +#endif + +/* And so on. */