diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7dccb9e838..8e9bd0bbd4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -298,6 +298,7 @@ OCV_OPTION(ENABLE_PROFILING "Enable profiling in the GCC compiler (Add
 OCV_OPTION(ENABLE_COVERAGE "Enable coverage collection with GCov" OFF IF CMAKE_COMPILER_IS_GNUCXX )
 OCV_OPTION(ENABLE_OMIT_FRAME_POINTER "Enable -fomit-frame-pointer for GCC" ON IF CMAKE_COMPILER_IS_GNUCXX AND NOT (APPLE AND CMAKE_COMPILER_IS_CLANGCXX) )
 OCV_OPTION(ENABLE_POWERPC "Enable PowerPC for GCC" ON IF (CMAKE_COMPILER_IS_GNUCXX AND CMAKE_SYSTEM_PROCESSOR MATCHES powerpc.*) )
+OCV_OPTION(ENABLE_VSX "Enable POWER8 and above VSX (64-bit little-endian)" ON IF (CMAKE_COMPILER_IS_GNUCXX AND PPC64LE) )
 OCV_OPTION(ENABLE_FAST_MATH "Enable -ffast-math (not recommended for GCC 4.6.x)" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_NEON "Enable NEON instructions" (NEON OR ANDROID_ARM_NEON OR AARCH64) IF CMAKE_COMPILER_IS_GNUCXX AND (ARM OR AARCH64 OR IOS) )
 OCV_OPTION(ENABLE_VFPV3 "Enable VFPv3-D32 instructions" OFF IF CMAKE_COMPILER_IS_GNUCXX AND (ARM OR AARCH64 OR IOS) )
diff --git a/cmake/OpenCVCompilerOptimizations.cmake b/cmake/OpenCVCompilerOptimizations.cmake
index 45a536bb28..fea13cefa6 100644
--- a/cmake/OpenCVCompilerOptimizations.cmake
+++ b/cmake/OpenCVCompilerOptimizations.cmake
@@ -28,6 +28,7 @@
 set(CPU_ALL_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;SSE4_2;POPCNT;AVX;FP16;AVX2;FMA3") # without AVX512
 list(APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16)
+list(APPEND CPU_ALL_OPTIMIZATIONS VSX)
 list(REMOVE_DUPLICATES CPU_ALL_OPTIMIZATIONS)
 
 ocv_update(CPU_VFPV3_FEATURE_ALIAS "")
@@ -79,6 +80,7 @@
 ocv_optimization_process_obsolete_option(ENABLE_FMA3 FMA3 ON)
 ocv_optimization_process_obsolete_option(ENABLE_VFPV3 VFPV3 OFF)
 ocv_optimization_process_obsolete_option(ENABLE_NEON NEON OFF)
+ocv_optimization_process_obsolete_option(ENABLE_VSX VSX OFF)
 
 macro(ocv_is_optimization_in_list resultvar check_opt)
   set(__checked "")
@@ -266,6 +268,15 @@ elseif(ARM OR AARCH64)
     ocv_update(CPU_FP16_IMPLIES "NEON")
     set(CPU_BASELINE "NEON;FP16" CACHE STRING "${HELP_CPU_BASELINE}")
   endif()
+elseif(PPC64LE)
+  ocv_update(CPU_KNOWN_OPTIMIZATIONS "VSX")
+  ocv_update(CPU_VSX_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_vsx.cpp")
+
+  if(CMAKE_COMPILER_IS_CLANGCXX AND (NOT ${CMAKE_CXX_COMPILER} MATCHES "xlc"))
+    ocv_update(CPU_VSX_FLAGS_ON "-mvsx -maltivec")
+  else()
+    ocv_update(CPU_VSX_FLAGS_ON "-mcpu=power8")
+  endif()
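+  # Usage sketch (assumes the standard OpenCV CPU_BASELINE/CPU_DISPATCH cache
+  # variables; the exact defaults may differ):
+  #   cmake -DCPU_BASELINE=VSX ..
+  # compiles cmake/checks/cpu_vsx.cpp with the flags chosen above and, on
+  # success, makes VSX part of the always-on baseline rather than a
+  # runtime-dispatched target.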
 endif()
 
 # Helper values for cmake-gui
diff --git a/cmake/OpenCVDetectCXXCompiler.cmake b/cmake/OpenCVDetectCXXCompiler.cmake
index 7b0d4c30bf..cfb613bc72 100644
--- a/cmake/OpenCVDetectCXXCompiler.cmake
+++ b/cmake/OpenCVDetectCXXCompiler.cmake
@@ -72,6 +72,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)")
   set(ARM 1)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)")
   set(AARCH64 1)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(ppc64le.*|PPC64LE.*)")
+  set(PPC64LE 1)
 endif()
 
 # Workaround for 32-bit operating systems on 64-bit x86_64 processor
diff --git a/cmake/OpenCVPackaging.cmake b/cmake/OpenCVPackaging.cmake
index 21a82a478e..363e46b7ee 100644
--- a/cmake/OpenCVPackaging.cmake
+++ b/cmake/OpenCVPackaging.cmake
@@ -31,6 +31,9 @@ elseif(ARM)
 elseif(AARCH64)
   set(CPACK_DEBIAN_PACKAGE_ARCHITECTURE "arm64")
   set(CPACK_RPM_PACKAGE_ARCHITECTURE "aarch64")
+elseif(PPC64LE)
+  set(CPACK_DEBIAN_PACKAGE_ARCHITECTURE "ppc64el")
+  set(CPACK_RPM_PACKAGE_ARCHITECTURE "ppc64le")
 else()
   set(CPACK_DEBIAN_PACKAGE_ARCHITECTURE ${CMAKE_SYSTEM_PROCESSOR})
   set(CPACK_RPM_PACKAGE_ARCHITECTURE ${CMAKE_SYSTEM_PROCESSOR})
@@ -164,4 +167,4 @@ endif(NOT OPENCV_CUSTOM_PACKAGE_INFO)
 
 include(CPack)
 
-ENDif(EXISTS "${CMAKE_ROOT}/Modules/CPack.cmake")
\ No newline at end of file
+ENDif(EXISTS "${CMAKE_ROOT}/Modules/CPack.cmake")
diff --git a/cmake/checks/cpu_vsx.cpp b/cmake/checks/cpu_vsx.cpp
new file mode 100644
index 0000000000..6d744825b6
--- /dev/null
+++ b/cmake/checks/cpu_vsx.cpp
@@ -0,0 +1,12 @@
+# if defined(__VSX__)
+# include <altivec.h>
+# else
+# error "VSX is not supported"
+# endif
+
+int main()
+{
+    __vector float testF = vec_splats(0.f);
+    testF = vec_madd(testF, testF, testF);
+    return 0;
+}
diff --git a/modules/core/include/opencv2/core/base.hpp b/modules/core/include/opencv2/core/base.hpp
index 62d767fcf5..113954d1e9 100644
--- a/modules/core/include/opencv2/core/base.hpp
+++ b/modules/core/include/opencv2/core/base.hpp
@@ -740,5 +740,6 @@ CV_EXPORTS_W void setUseIPP_NE(bool flag);
 } // cv
 
 #include "opencv2/core/neon_utils.hpp"
+#include "opencv2/core/vsx_utils.hpp"
 
 #endif //OPENCV_CORE_BASE_HPP
diff --git a/modules/core/include/opencv2/core/cv_cpu_dispatch.h b/modules/core/include/opencv2/core/cv_cpu_dispatch.h
index 5de35bcedf..5261a41439 100644
--- a/modules/core/include/opencv2/core/cv_cpu_dispatch.h
+++ b/modules/core/include/opencv2/core/cv_cpu_dispatch.h
@@ -99,6 +99,14 @@
 # include <arm_neon.h>
 #endif
 
+#if defined(__VSX__) && defined(__PPC64__) && defined(__LITTLE_ENDIAN__)
+# include <altivec.h>
+# undef vector
+# undef pixel
+# undef bool
+# define CV_VSX 1
+#endif
+
 #endif // CV_ENABLE_INTRINSICS && !CV_DISABLE_OPTIMIZATION && !__CUDACC__
 
 #if defined CV_CPU_COMPILE_AVX && !defined CV_CPU_BASELINE_COMPILE_AVX
@@ -135,6 +143,12 @@ struct VZeroUpperGuard {
 #elif defined(__ARM_NEON__) || (defined (__ARM_NEON) && defined(__aarch64__))
 # include <arm_neon.h>
 # define CV_NEON 1
+#elif defined(__VSX__) && defined(__PPC64__) && defined(__LITTLE_ENDIAN__)
+# include <altivec.h>
+# undef vector
+# undef pixel
+# undef bool
+# define CV_VSX 1
 #endif
 
 #endif // !__OPENCV_BUILD && !__CUDACC (Compatibility code)
@@ -208,3 +222,7 @@
 #ifndef CV_NEON
 # define CV_NEON 0
 #endif
+
+#ifndef CV_VSX
+# define CV_VSX 0
+#endif
diff --git a/modules/core/include/opencv2/core/cv_cpu_helper.h b/modules/core/include/opencv2/core/cv_cpu_helper.h
index 6eaed9e661..66a473f1e1 100644
--- a/modules/core/include/opencv2/core/cv_cpu_helper.h
+++ b/modules/core/include/opencv2/core/cv_cpu_helper.h
@@ -180,5 +180,20 @@
 #endif
 #define __CV_CPU_DISPATCH_CHAIN_NEON(fn, args, mode, ...)  CV_CPU_CALL_NEON(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
 
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_VSX
+# define CV_TRY_VSX 1
+# define CV_CPU_HAS_SUPPORT_VSX 1
+# define CV_CPU_CALL_VSX(fn, args) return (opt_VSX::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_VSX
+# define CV_TRY_VSX 1
+# define CV_CPU_HAS_SUPPORT_VSX (cv::checkHardwareSupport(CV_CPU_VSX))
+# define CV_CPU_CALL_VSX(fn, args) if (CV_CPU_HAS_SUPPORT_VSX) return (opt_VSX::fn args)
+#else
+# define CV_TRY_VSX 0
+# define CV_CPU_HAS_SUPPORT_VSX 0
+# define CV_CPU_CALL_VSX(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_VSX(fn, args, mode, ...)  CV_CPU_CALL_VSX(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
 #define CV_CPU_CALL_BASELINE(fn, args) return (cpu_baseline::fn args)
 #define __CV_CPU_DISPATCH_CHAIN_BASELINE(fn, args, mode, ...)  CV_CPU_CALL_BASELINE(fn, args) /* last in sequence */
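+// Dispatch sketch (resize_impl is a hypothetical kernel name; CV_CPU_DISPATCH
+// is the usual OpenCV entry point that consumes the chain above):
+//   CV_CPU_DISPATCH(resize_impl, (src, dst), CV_CPU_DISPATCH_MODES_ALL);
+// With VSX in CPU_DISPATCH this roughly expands to:
+//   if (cv::checkHardwareSupport(CV_CPU_VSX)) return opt_VSX::resize_impl(src, dst);
+//   return cpu_baseline::resize_impl(src, dst);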
diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h
index 70bbf93104..b513b44c15 100644
--- a/modules/core/include/opencv2/core/cvdef.h
+++ b/modules/core/include/opencv2/core/cvdef.h
@@ -153,6 +153,8 @@ namespace cv { namespace debug_build_guard { } using namespace debug_build_guard
 #define CV_CPU_NEON 100
 
+#define CV_CPU_VSX 200
+
 // when adding to this list remember to update the following enum
 #define CV_HARDWARE_MAX_FEATURE 255
 
@@ -182,7 +184,9 @@ enum CpuFeatures {
     CPU_AVX_512VBMI = 20,
     CPU_AVX_512VL = 21,
 
-    CPU_NEON = 100
+    CPU_NEON = 100,
+
+    CPU_VSX = 200
 };
diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp
index d6dedc1d1a..26fd9185b3 100644
--- a/modules/core/include/opencv2/core/hal/intrin.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin.hpp
@@ -308,6 +308,7 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
 #ifdef CV_DOXYGEN
 # undef CV_SSE2
 # undef CV_NEON
+# undef CV_VSX
 #endif
 
 #if CV_SSE2
@@ -318,6 +319,10 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
 
 #include "opencv2/core/hal/intrin_neon.hpp"
 
+#elif CV_VSX
+
+#include "opencv2/core/hal/intrin_vsx.hpp"
+
 #else
 
 #include "opencv2/core/hal/intrin_cpp.hpp"
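+// Selection sketch: with VSX in the baseline, CV_VSX is 1 and the chain above
+// picks intrin_vsx.hpp; otherwise intrin_cpp.hpp provides the same universal
+// API in scalar code. For example:
+//   v_float32x4 a = v_setall_f32(2.f), b = v_setall_f32(3.f);
+//   v_float32x4 c = a * b;   // vec_mul on VSX, a plain loop in intrin_cpp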
diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
new file mode 100644
index 0000000000..3d15945de7
--- /dev/null
+++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
@@ -0,0 +1,927 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_HAL_VSX_HPP
+#define OPENCV_HAL_VSX_HPP
+
+#include <algorithm>
+#include "opencv2/core/utility.hpp"
+
+#define CV_SIMD128 1
+#define CV_SIMD128_64F 1
+
+/**
+ * TODO: support half precision for POWER9;
+ * conversion instructions: xvcvhpsp, xvcvsphp
+**/
+
+namespace cv
+{
+
+//! @cond IGNORED
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+
+///////// Types ////////////
+
+struct v_uint8x16
+{
+    typedef uchar lane_type;
+    enum { nlanes = 16 };
+    vec_uchar16 val;
+
+    explicit v_uint8x16(const vec_uchar16& v) : val(v)
+    {}
+    v_uint8x16() : val(vec_uchar16_z)
+    {}
+    v_uint8x16(vec_bchar16 v) : val(vec_uchar16_c(v))
+    {}
+    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
+               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
+        : val(vec_uchar16_set(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15))
+    {}
+    uchar get0() const
+    { return vec_extract(val, 0); }
+};
+
+struct v_int8x16
+{
+    typedef schar lane_type;
+    enum { nlanes = 16 };
+    vec_char16 val;
+
+    explicit v_int8x16(const vec_char16& v) : val(v)
+    {}
+    v_int8x16() : val(vec_char16_z)
+    {}
+    v_int8x16(vec_bchar16 v) : val(vec_char16_c(v))
+    {}
+    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
+              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
+        : val(vec_char16_set(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15))
+    {}
+    schar get0() const
+    { return vec_extract(val, 0); }
+};
+
+struct v_uint16x8
+{
+    typedef ushort lane_type;
+    enum { nlanes = 8 };
+    vec_ushort8 val;
+
+    explicit v_uint16x8(const vec_ushort8& v) : val(v)
+    {}
+    v_uint16x8() : val(vec_ushort8_z)
+    {}
+    v_uint16x8(vec_bshort8 v) : val(vec_ushort8_c(v))
+    {}
+    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
+        : val(vec_ushort8_set(v0, v1, v2, v3, v4, v5, v6, v7))
+    {}
+    ushort get0() const
+    { return vec_extract(val, 0); }
+};
+
+struct v_int16x8
+{
+    typedef short lane_type;
+    enum { nlanes = 8 };
+    vec_short8 val;
+
+    explicit v_int16x8(const vec_short8& v) : val(v)
+    {}
+    v_int16x8() : val(vec_short8_z)
+    {}
+    v_int16x8(vec_bshort8 v) : val(vec_short8_c(v))
+    {}
+    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
+        : val(vec_short8_set(v0, v1, v2, v3, v4, v5, v6, v7))
+    {}
+    short get0() const
+    { return vec_extract(val, 0); }
+};
+
+struct v_uint32x4
+{
+    typedef unsigned lane_type;
+    enum { nlanes = 4 };
+    vec_uint4 val;
+
+    explicit v_uint32x4(const vec_uint4& v) : val(v)
+    {}
+    v_uint32x4() : val(vec_uint4_z)
+    {}
+    v_uint32x4(vec_bint4 v) : val(vec_uint4_c(v))
+    {}
+    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3) : val(vec_uint4_set(v0, v1, v2, v3))
+    {}
+    uint get0() const
+    { return vec_extract(val, 0); }
+};
+
+struct v_int32x4
+{
+    typedef int lane_type;
+    enum { nlanes = 4 };
+    vec_int4 val;
+
+    explicit v_int32x4(const vec_int4& v) : val(v)
+    {}
+    v_int32x4() : val(vec_int4_z)
+    {}
+    v_int32x4(vec_bint4 v) : val(vec_int4_c(v))
+    {}
+    v_int32x4(int v0, int v1, int v2, int v3) : val(vec_int4_set(v0, v1, v2, v3))
+    {}
+    int get0() const
+    { return vec_extract(val, 0); }
+};
+
+struct v_float32x4
+{
+    typedef float lane_type;
+    enum { nlanes = 4 };
+    vec_float4 val;
+
+    explicit v_float32x4(const vec_float4& v) : val(v)
+    {}
+    v_float32x4() : val(vec_float4_z)
+    {}
+    v_float32x4(vec_bint4 v) : val(vec_float4_c(v))
+    {}
+    v_float32x4(float v0, float v1, float v2, float v3) : val(vec_float4_set(v0, v1, v2, v3))
+    {}
+    float get0() const
+    { return vec_extract(val, 0); }
+};
+
+struct v_uint64x2
+{
+    typedef uint64 lane_type;
+    enum { nlanes = 2 };
+    vec_udword2 val;
+
+    explicit v_uint64x2(const vec_udword2& v) : val(v)
+    {}
+    v_uint64x2() : val(vec_udword2_z)
+    {}
+    v_uint64x2(vec_bdword2 v) : val(vec_udword2_c(v))
+    {}
+    v_uint64x2(uint64 v0, uint64 v1) : val(vec_udword2_set(v0, v1))
+    {}
+    uint64 get0() const
+    { return vec_extract(val, 0); }
+};
+
+struct v_int64x2
+{
+    typedef int64 lane_type;
+    enum { nlanes = 2 };
+    vec_dword2 val;
+
+    explicit v_int64x2(const vec_dword2& v) : val(v)
+    {}
+    v_int64x2() : val(vec_dword2_z)
+    {}
+    v_int64x2(vec_bdword2 v) : val(vec_dword2_c(v))
+    {}
+    v_int64x2(int64 v0, int64 v1) : val(vec_dword2_set(v0, v1))
+    {}
+    int64 get0() const
+    { return vec_extract(val, 0); }
+};
+
+struct v_float64x2
+{
+    typedef double lane_type;
+    enum { nlanes = 2 };
+    vec_double2 val;
+
+    explicit v_float64x2(const vec_double2& v) : val(v)
+    {}
+    v_float64x2() : val(vec_double2_z)
+    {}
+    v_float64x2(vec_bdword2 v) : val(vec_double2_c(v))
+    {}
+    v_float64x2(double v0, double v1) : val(vec_double2_set(v0, v1))
+    {}
+    double get0() const
+    { return vec_extract(val, 0); }
+};
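+// A quick sketch of the wrapper types above (values are arbitrary):
+//   v_int32x4 a(1, 2, 3, 4);   // wraps vec_int4_set(1, 2, 3, 4)
+//   v_int32x4 z;               // zero vector via vec_int4_z
+//   int first = a.get0();      // vec_extract(a.val, 0) == 1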
+
+//////////////// Load and store operations ///////////////
+
+/*
+ * clang-5 aborts while parsing "vec_xxx_c" only when it appears inside a
+ * function template that is defined through a preprocessor macro;
+ * if vec_xxx_c is defined as a C++ cast instead, clang-5 accepts it.
+*/
+#define OPENCV_HAL_IMPL_VSX_INITVEC(_Tpvec, _Tp, suffix, cast) \
+inline _Tpvec v_setzero_##suffix() { return _Tpvec(); } \
+inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(vec_splats((_Tp)v));} \
+template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0 &a) \
+{ return _Tpvec((cast)a.val); }
+
+OPENCV_HAL_IMPL_VSX_INITVEC(v_uint8x16, uchar, u8, vec_uchar16)
+OPENCV_HAL_IMPL_VSX_INITVEC(v_int8x16, schar, s8, vec_char16)
+OPENCV_HAL_IMPL_VSX_INITVEC(v_uint16x8, ushort, u16, vec_ushort8)
+OPENCV_HAL_IMPL_VSX_INITVEC(v_int16x8, short, s16, vec_short8)
+OPENCV_HAL_IMPL_VSX_INITVEC(v_uint32x4, uint, u32, vec_uint4)
+OPENCV_HAL_IMPL_VSX_INITVEC(v_int32x4, int, s32, vec_int4)
+OPENCV_HAL_IMPL_VSX_INITVEC(v_uint64x2, uint64, u64, vec_udword2)
+OPENCV_HAL_IMPL_VSX_INITVEC(v_int64x2, int64, s64, vec_dword2)
+OPENCV_HAL_IMPL_VSX_INITVEC(v_float32x4, float, f32, vec_float4)
+OPENCV_HAL_IMPL_VSX_INITVEC(v_float64x2, double, f64, vec_double2)
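+// Sketch of the generated initializers (lane values are illustrative):
+//   v_float32x4 ones = v_setall_f32(1.f);          // vec_splats(1.f)
+//   v_int32x4 bits = v_reinterpret_as_s32(ones);   // each lane 0x3F800000
+//   v_uint8x16 zero = v_setzero_u8();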
+
+#define OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(_Tpvec, _Tp, ld_func, st_func) \
+inline _Tpvec v_load(const _Tp* ptr) \
+{ return _Tpvec(ld_func(0, ptr)); } \
+inline _Tpvec v_load_aligned(const _Tp* ptr) \
+{ return _Tpvec(ld_func(0, ptr)); } \
+inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
+{ return _Tpvec(vec_mergesqh(vec_ld_l8(ptr0), vec_ld_l8(ptr1))); } \
+inline void v_store(_Tp* ptr, const _Tpvec& a) \
+{ st_func(a.val, 0, ptr); } \
+inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
+{ st_func(a.val, 0, ptr); } \
+inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
+{ vec_st_l8(a.val, ptr); } \
+inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
+{ vec_st_h8(a.val, ptr); }
+
+OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_uint8x16, uchar, vsx_ld, vsx_st)
+OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_int8x16, schar, vsx_ld, vsx_st)
+OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_uint16x8, ushort, vsx_ld, vsx_st)
+OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_int16x8, short, vsx_ld, vsx_st)
+OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_uint32x4, uint, vsx_ld, vsx_st)
+OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_int32x4, int, vsx_ld, vsx_st)
+OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_float32x4, float, vsx_ld, vsx_st)
+OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_float64x2, double, vsx_ld, vsx_st)
+OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_uint64x2, uint64, vsx_ld2, vsx_st2)
+OPENCV_HAL_IMPL_VSX_LOADSTORE_INT_OP(v_int64x2, int64, vsx_ld2, vsx_st2)
+
+//////////////// Value reordering ///////////////
+
+/* de&interleave */
+#define OPENCV_HAL_IMPL_VSX_INTERLEAVE(_Tp, _Tpvec) \
+inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, _Tpvec& b) \
+{ vec_ld_deinterleave(ptr, a.val, b.val);} \
+inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, \
+                                _Tpvec& b, _Tpvec& c) \
+{ vec_ld_deinterleave(ptr, a.val, b.val, c.val); } \
+inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, _Tpvec& b, \
+                                _Tpvec& c, _Tpvec& d) \
+{ vec_ld_deinterleave(ptr, a.val, b.val, c.val, d.val); } \
+inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b) \
+{ vec_st_interleave(a.val, b.val, ptr); } \
+inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, \
+                               const _Tpvec& b, const _Tpvec& c) \
+{ vec_st_interleave(a.val, b.val, c.val, ptr); } \
+inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, \
+                               const _Tpvec& c, const _Tpvec& d) \
+{ vec_st_interleave(a.val, b.val, c.val, d.val, ptr); }
+
+OPENCV_HAL_IMPL_VSX_INTERLEAVE(uchar, v_uint8x16)
+OPENCV_HAL_IMPL_VSX_INTERLEAVE(schar, v_int8x16)
+OPENCV_HAL_IMPL_VSX_INTERLEAVE(ushort, v_uint16x8)
+OPENCV_HAL_IMPL_VSX_INTERLEAVE(short, v_int16x8)
+OPENCV_HAL_IMPL_VSX_INTERLEAVE(uint, v_uint32x4)
+OPENCV_HAL_IMPL_VSX_INTERLEAVE(int, v_int32x4)
+OPENCV_HAL_IMPL_VSX_INTERLEAVE(float, v_float32x4)
+OPENCV_HAL_IMPL_VSX_INTERLEAVE(double, v_float64x2)
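+// Deinterleave sketch (hypothetical BGR-style byte layout):
+//   uchar bgr[48];                        // B0,G0,R0,B1,G1,R1,...
+//   v_uint8x16 b, g, r;
+//   v_load_deinterleave(bgr, b, g, r);    // b = B0..B15, g = G0..G15, ...
+//   v_store_interleave(bgr, b, g, r);     // round-trips the same layout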
+
+/* Expand */
+#define OPENCV_HAL_IMPL_VSX_EXPAND(_Tpvec, _Tpwvec, _Tp, fl, fh) \
+inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
+{ \
+    b0.val = fh(a.val); \
+    b1.val = fl(a.val); \
+} \
+inline _Tpwvec v_load_expand(const _Tp* ptr) \
+{ return _Tpwvec(fh(vsx_ld(0, ptr))); }
+
+OPENCV_HAL_IMPL_VSX_EXPAND(v_uint8x16, v_uint16x8, uchar, vec_unpacklu, vec_unpackhu)
+OPENCV_HAL_IMPL_VSX_EXPAND(v_int8x16, v_int16x8, schar, vec_unpackl, vec_unpackh)
+OPENCV_HAL_IMPL_VSX_EXPAND(v_uint16x8, v_uint32x4, ushort, vec_unpacklu, vec_unpackhu)
+OPENCV_HAL_IMPL_VSX_EXPAND(v_int16x8, v_int32x4, short, vec_unpackl, vec_unpackh)
+OPENCV_HAL_IMPL_VSX_EXPAND(v_uint32x4, v_uint64x2, uint, vec_unpacklu, vec_unpackhu)
+OPENCV_HAL_IMPL_VSX_EXPAND(v_int32x4, v_int64x2, int, vec_unpackl, vec_unpackh)
+
+inline v_uint32x4 v_load_expand_q(const uchar* ptr)
+{ return v_uint32x4(vec_ld_buw(ptr)); }
+
+inline v_int32x4 v_load_expand_q(const schar* ptr)
+{ return v_int32x4(vec_ld_bsw(ptr)); }
+
+/* pack */
+#define OPENCV_HAL_IMPL_VSX_PACK(_Tpvec, _Tp, _Tpwvec, _Tpvn, _Tpdel, sfnc, pkfnc, addfnc, pack) \
+inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \
+{ \
+    return _Tpvec(pkfnc(a.val, b.val)); \
+} \
+inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
+{ \
+    vec_st_l8(pkfnc(a.val, a.val), ptr); \
+} \
+template<int n> \
+inline _Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \
+{ \
+    const __vector _Tpvn vn = vec_splats((_Tpvn)n); \
+    const __vector _Tpdel delta = vec_splats((_Tpdel)((_Tpdel)1 << (n-1))); \
+    return _Tpvec(pkfnc(sfnc(addfnc(a.val, delta), vn), sfnc(addfnc(b.val, delta), vn))); \
+} \
+template<int n> \
+inline void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
+{ \
+    const __vector _Tpvn vn = vec_splats((_Tpvn)n); \
+    const __vector _Tpdel delta = vec_splats((_Tpdel)((_Tpdel)1 << (n-1))); \
+    vec_st_l8(pkfnc(sfnc(addfnc(a.val, delta), vn), delta), ptr); \
+}
+
+OPENCV_HAL_IMPL_VSX_PACK(v_uint8x16, uchar, v_uint16x8, unsigned short, unsigned short,
+                         vec_sr, vec_packs, vec_adds, pack)
+OPENCV_HAL_IMPL_VSX_PACK(v_int8x16, schar, v_int16x8, unsigned short, short,
+                         vec_sra, vec_packs, vec_adds, pack)
+
+OPENCV_HAL_IMPL_VSX_PACK(v_uint16x8, ushort, v_uint32x4, unsigned int, unsigned int,
+                         vec_sr, vec_packs, vec_add, pack)
+OPENCV_HAL_IMPL_VSX_PACK(v_int16x8, short, v_int32x4, unsigned int, int,
+                         vec_sra, vec_packs, vec_add, pack)
+
+OPENCV_HAL_IMPL_VSX_PACK(v_uint32x4, uint, v_uint64x2, unsigned long long, unsigned long long,
+                         vec_sr, vec_packs, vec_add, pack)
+OPENCV_HAL_IMPL_VSX_PACK(v_int32x4, int, v_int64x2, unsigned long long, long long,
+                         vec_sra, vec_packs, vec_add, pack)
+
+OPENCV_HAL_IMPL_VSX_PACK(v_uint8x16, uchar, v_int16x8, unsigned short, short,
+                         vec_sra, vec_packsu, vec_adds, pack_u)
+OPENCV_HAL_IMPL_VSX_PACK(v_uint16x8, ushort, v_int32x4, unsigned int, int,
+                         vec_sra, vec_packsu, vec_add, pack_u)
+OPENCV_HAL_IMPL_VSX_PACK(v_uint32x4, uint, v_int64x2, unsigned long long, long long,
+                         vec_sra, vec_packsu, vec_add, pack_u)
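+// Rounding-shift pack sketch (hypothetical values, n = 1 so delta = 1):
+//   v_int32x4 w = v_setall_s32(257);
+//   short out[4];
+//   v_rshr_pack_store<1>(out, w);   // stores four saturated lanes,
+//                                   // each (257 + 1) >> 1 = 129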
+
+/* Recombine */
+template<typename _Tpvec>
+inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1)
+{
+    b0.val = vec_mergeh(a0.val, a1.val);
+    b1.val = vec_mergel(a0.val, a1.val);
+}
+
+template<typename _Tpvec>
+inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b)
+{ return _Tpvec(vec_mergesql(a.val, b.val)); }
+
+template<typename _Tpvec>
+inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b)
+{ return _Tpvec(vec_mergesqh(a.val, b.val)); }
+
+template<typename _Tpvec>
+inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d)
+{
+    c.val = vec_mergesqh(a.val, b.val);
+    d.val = vec_mergesql(a.val, b.val);
+}
+
+/* Extract */
+template<int s, typename _Tpvec>
+inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
+{
+    const int w = sizeof(typename _Tpvec::lane_type);
+    const int n = _Tpvec::nlanes;
+    const unsigned int sf = ((w * n) - (s * w));
+    if (s == 0)
+        return _Tpvec(a.val);
+    else if (sf > 15)
+        return _Tpvec();
+    // bitwise AND it, just to make xlc happy
+    return _Tpvec(vec_sld(b.val, a.val, sf & 15));
+}
+
+#define OPENCV_HAL_IMPL_VSX_EXTRACT_2(_Tpvec) \
+template<int s> \
+inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    switch(s) { \
+    case 0: return _Tpvec(a.val); \
+    case 2: return _Tpvec(b.val); \
+    case 1: return _Tpvec(vec_sldw(b.val, a.val, 2)); \
+    default: return _Tpvec(); \
+    } \
+}
+OPENCV_HAL_IMPL_VSX_EXTRACT_2(v_uint64x2)
+OPENCV_HAL_IMPL_VSX_EXTRACT_2(v_int64x2)
+
+
+////////// Arithmetic, bitwise and comparison operations /////////
+
+/* Element-wise binary and unary operations */
+/** Arithmetics **/
+#define OPENCV_HAL_IMPL_VSX_BIN_OP(bin_op, _Tpvec, intrin) \
+inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(intrin(a.val, b.val)); } \
+inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
+{ a.val = intrin(a.val, b.val); return a; }
+
+OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint8x16, vec_adds)
+OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint8x16, vec_subs)
+OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int8x16, vec_adds)
+OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int8x16, vec_subs)
+OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint16x8, vec_adds)
+OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint16x8, vec_subs)
+OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_uint16x8, vec_mul)
+OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int16x8, vec_adds)
+OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int16x8, vec_subs)
+OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_int16x8, vec_mul)
+OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint32x4, vec_add)
+OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint32x4, vec_sub)
+OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_uint32x4, vec_mul)
+OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int32x4, vec_add)
+OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int32x4, vec_sub)
+OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_int32x4, vec_mul)
+OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_float32x4, vec_add)
+OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_float32x4, vec_sub)
+OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_float32x4, vec_mul)
+OPENCV_HAL_IMPL_VSX_BIN_OP(/, v_float32x4, vec_div)
+OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_float64x2, vec_add)
+OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_float64x2, vec_sub)
+OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_float64x2, vec_mul)
+OPENCV_HAL_IMPL_VSX_BIN_OP(/, v_float64x2, vec_div)
+OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint64x2, vec_add)
+OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint64x2, vec_sub)
+OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int64x2, vec_add)
+OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int64x2, vec_sub)
+
+inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, v_int32x4& c, v_int32x4& d)
+{
+    c.val = vec_mul(vec_unpackh(a.val), vec_unpackh(b.val));
+    d.val = vec_mul(vec_unpackl(a.val), vec_unpackl(b.val));
+}
+inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b, v_uint32x4& c, v_uint32x4& d)
+{
+    c.val = vec_mul(vec_unpackhu(a.val), vec_unpackhu(b.val));
+    d.val = vec_mul(vec_unpacklu(a.val), vec_unpacklu(b.val));
+}
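+// Widening multiply sketch (hypothetical lane values): products are formed in
+// the doubled lane width, so they cannot overflow the narrow type:
+//   v_uint16x8 a = v_setall_u16(300), b = v_setall_u16(400);
+//   v_uint32x4 c, d;
+//   v_mul_expand(a, b, c, d);   // every lane of c and d holds 120000 (> 0xFFFF)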
+inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, v_uint64x2& c, v_uint64x2& d)
+{
+    c.val = vec_mul(vec_unpackhu(a.val), vec_unpackhu(b.val));
+    d.val = vec_mul(vec_unpacklu(a.val), vec_unpacklu(b.val));
+}
+
+/** Non-saturating arithmetics **/
+#define OPENCV_HAL_IMPL_VSX_BIN_FUNC(func, intrin) \
+template<typename _Tpvec> \
+inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(intrin(a.val, b.val)); }
+
+OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_add_wrap, vec_add)
+OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_sub_wrap, vec_sub)
+
+/** Bitwise shifts **/
+#define OPENCV_HAL_IMPL_VSX_SHIFT_OP(_Tpuvec, splfunc) \
+inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
+{ return _Tpuvec(vec_sl(a.val, splfunc(imm))); } \
+inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
+{ return _Tpuvec(vec_sr(a.val, splfunc(imm))); } \
+template<int imm> inline _Tpuvec v_shl(const _Tpuvec& a) \
+{ return _Tpuvec(vec_sl(a.val, splfunc(imm))); } \
+template<int imm> inline _Tpuvec v_shr(const _Tpuvec& a) \
+{ return _Tpuvec(vec_sr(a.val, splfunc(imm))); }
+
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint8x16, vec_uchar16_sp)
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int8x16, vec_uchar16_sp)
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint16x8, vec_ushort8_sp)
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int16x8, vec_ushort8_sp)
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint32x4, vec_uint4_sp)
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int32x4, vec_uint4_sp)
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint64x2, vec_udword2_sp)
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int64x2, vec_udword2_sp)
+
+/** Bitwise logic **/
+#define OPENCV_HAL_IMPL_VSX_LOGIC_OP(_Tpvec) \
+OPENCV_HAL_IMPL_VSX_BIN_OP(&, _Tpvec, vec_and) \
+OPENCV_HAL_IMPL_VSX_BIN_OP(|, _Tpvec, vec_or) \
+OPENCV_HAL_IMPL_VSX_BIN_OP(^, _Tpvec, vec_xor) \
+inline _Tpvec operator ~ (const _Tpvec& a) \
+{ return _Tpvec(vec_not(a.val)); }
+
+OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint8x16)
+OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_int8x16)
+OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint16x8)
+OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_int16x8)
+OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint32x4)
+OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_int32x4)
+OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint64x2)
+OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_int64x2)
+OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_float32x4)
+OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_float64x2)
+
+/** Bitwise select **/
+#define OPENCV_HAL_IMPL_VSX_SELECT(_Tpvec, cast) \
+inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(vec_sel(b.val, a.val, cast(mask.val))); }
+
+OPENCV_HAL_IMPL_VSX_SELECT(v_uint8x16, vec_bchar16_c)
+OPENCV_HAL_IMPL_VSX_SELECT(v_int8x16, vec_bchar16_c)
+OPENCV_HAL_IMPL_VSX_SELECT(v_uint16x8, vec_bshort8_c)
+OPENCV_HAL_IMPL_VSX_SELECT(v_int16x8, vec_bshort8_c)
+OPENCV_HAL_IMPL_VSX_SELECT(v_uint32x4, vec_bint4_c)
+OPENCV_HAL_IMPL_VSX_SELECT(v_int32x4, vec_bint4_c)
+OPENCV_HAL_IMPL_VSX_SELECT(v_float32x4, vec_bint4_c)
+OPENCV_HAL_IMPL_VSX_SELECT(v_float64x2, vec_bdword2_c)
+
+/** Comparison **/
+#define OPENCV_HAL_IMPL_VSX_INT_CMP_OP(_Tpvec) \
+inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(vec_cmpeq(a.val, b.val)); } \
+inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(vec_cmpne(a.val, b.val)); } \
+inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(vec_cmplt(a.val, b.val)); } \
+inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(vec_cmpgt(a.val, b.val)); } \
+inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(vec_cmple(a.val, b.val)); } \
+inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(vec_cmpge(a.val, b.val)); }
+
+OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint8x16)
+OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int8x16)
+OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint16x8)
+OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int16x8)
+OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint32x4)
+OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int32x4)
+OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_float32x4)
+OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_float64x2)
+OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint64x2)
+OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int64x2)
+
+/** min/max **/
+OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_min, vec_min)
+OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_max, vec_max)
+
+////////// Reduce and mask /////////
+
+/** Reduce **/
+inline short v_reduce_sum(const v_int16x8& a)
+{
+    const vec_int4 zero = vec_int4_z;
+    return saturate_cast<short>(vec_extract(vec_sums(vec_sum4s(a.val, zero), zero), 3));
+}
+inline ushort v_reduce_sum(const v_uint16x8& a)
+{
+    const vec_int4 v4 = vec_int4_c(vec_unpackhu(vec_adds(a.val, vec_sld(a.val, a.val, 8))));
+    return saturate_cast<ushort>(vec_extract(vec_sums(v4, vec_int4_z), 3));
+}
+
+#define OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(_Tpvec, _Tpvec2, scalartype, suffix, func) \
+inline scalartype v_reduce_##suffix(const _Tpvec& a) \
+{ \
+    const _Tpvec2 rs = func(a.val, vec_sld(a.val, a.val, 8)); \
+    return vec_extract(func(rs, vec_sld(rs, rs, 4)), 0); \
+}
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_uint32x4, vec_uint4, uint, sum, vec_add)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_uint32x4, vec_uint4, uint, max, vec_max)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_uint32x4, vec_uint4, uint, min, vec_min)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_int32x4, vec_int4, int, sum, vec_add)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_int32x4, vec_int4, int, max, vec_max)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_int32x4, vec_int4, int, min, vec_min)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, sum, vec_add)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, max, vec_max)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, min, vec_min)
+
+#define OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(_Tpvec, _Tpvec2, scalartype, suffix, func) \
+inline scalartype v_reduce_##suffix(const _Tpvec& a) \
+{ \
+    _Tpvec2 rs = func(a.val, vec_sld(a.val, a.val, 8)); \
+    rs = func(rs, vec_sld(rs, rs, 4)); \
+    return vec_extract(func(rs, vec_sld(rs, rs, 2)), 0); \
+}
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_uint16x8, vec_ushort8, ushort, max, vec_max)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_uint16x8, vec_ushort8, ushort, min, vec_min)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int16x8, vec_short8, short, max, vec_max)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int16x8, vec_short8, short, min, vec_min)
+
+inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
+                                 const v_float32x4& c, const v_float32x4& d)
+{
+    vec_float4 ac = vec_add(vec_mergel(a.val, c.val), vec_mergeh(a.val, c.val));
+    ac = vec_add(ac, vec_sld(ac, ac, 8));
+
+    vec_float4 bd = vec_add(vec_mergel(b.val, d.val), vec_mergeh(b.val, d.val));
+    bd = vec_add(bd, vec_sld(bd, bd, 8));
+    return v_float32x4(vec_mergeh(ac, bd));
+}
+
+/** Popcount **/
+#define OPENCV_HAL_IMPL_VSX_POPCOUNT_8(_Tpvec) \
+inline v_uint32x4 v_popcount(const _Tpvec& a) \
+{ \
+    vec_uchar16 v16 = vec_popcntu(a.val); \
+    vec_ushort8 v8 = vec_add(vec_unpacklu(v16), vec_unpackhu(v16)); \
+    return v_uint32x4(vec_add(vec_unpacklu(v8), vec_unpackhu(v8))); \
+}
+OPENCV_HAL_IMPL_VSX_POPCOUNT_8(v_int8x16)
+OPENCV_HAL_IMPL_VSX_POPCOUNT_8(v_uint8x16)
+
+#define OPENCV_HAL_IMPL_VSX_POPCOUNT_16(_Tpvec) \
+inline v_uint32x4 v_popcount(const _Tpvec& a) \
+{ \
+    vec_ushort8 v8 =
vec_popcntu(a.val); \ + return v_uint32x4(vec_add(vec_unpacklu(v8), vec_unpackhu(v8))); \ +} +OPENCV_HAL_IMPL_VSX_POPCOUNT_16(v_int16x8) +OPENCV_HAL_IMPL_VSX_POPCOUNT_16(v_uint16x8) + +#define OPENCV_HAL_IMPL_VSX_POPCOUNT_32(_Tpvec) \ +inline v_uint32x4 v_popcount(const _Tpvec& a) \ +{ return v_uint32x4(vec_popcntu(a.val)); } + +OPENCV_HAL_IMPL_VSX_POPCOUNT_32(v_int32x4) +OPENCV_HAL_IMPL_VSX_POPCOUNT_32(v_uint32x4) + +/** Mask **/ +inline int v_signmask(const v_uint8x16& a) +{ + vec_uchar16 sv = vec_sr(a.val, vec_uchar16_sp(7)); + static const vec_uchar16 slm = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7}; + sv = vec_sl(sv, slm); + vec_uint4 sv4 = vec_sum4s(sv, vec_uint4_z); + static const vec_uint4 slm4 = {0, 0, 8, 8}; + sv4 = vec_sl(sv4, slm4); + return vec_extract(vec_sums((vec_int4) sv4, vec_int4_z), 3); +} +inline int v_signmask(const v_int8x16& a) +{ return v_signmask(v_reinterpret_as_u8(a)); } + +inline int v_signmask(const v_int16x8& a) +{ + static const vec_ushort8 slm = {0, 1, 2, 3, 4, 5, 6, 7}; + vec_short8 sv = vec_sr(a.val, vec_ushort8_sp(15)); + sv = vec_sl(sv, slm); + vec_int4 svi = vec_int4_z; + svi = vec_sums(vec_sum4s(sv, svi), svi); + return vec_extract(svi, 3); +} +inline int v_signmask(const v_uint16x8& a) +{ return v_signmask(v_reinterpret_as_s16(a)); } + +inline int v_signmask(const v_int32x4& a) +{ + static const vec_uint4 slm = {0, 1, 2, 3}; + vec_int4 sv = vec_sr(a.val, vec_uint4_sp(31)); + sv = vec_sl(sv, slm); + sv = vec_sums(sv, vec_int4_z); + return vec_extract(sv, 3); +} +inline int v_signmask(const v_uint32x4& a) +{ return v_signmask(v_reinterpret_as_s32(a)); } +inline int v_signmask(const v_float32x4& a) +{ return v_signmask(v_reinterpret_as_s32(a)); } + +inline int v_signmask(const v_int64x2& a) +{ + const vec_dword2 sv = vec_sr(a.val, vec_udword2_sp(63)); + return (int)vec_extract(sv, 0) | (int)vec_extract(sv, 1) << 1; +} +inline int v_signmask(const v_uint64x2& a) +{ return v_signmask(v_reinterpret_as_s64(a)); } +inline int v_signmask(const v_float64x2& a) +{ return v_signmask(v_reinterpret_as_s64(a)); } + + +template +inline bool v_check_all(const _Tpvec& a) +{ return vec_all_lt(a.val, _Tpvec().val);} +inline bool v_check_all(const v_uint8x16 &a) +{ return v_check_all(v_reinterpret_as_s8(a)); } +inline bool v_check_all(const v_uint16x8 &a) +{ return v_check_all(v_reinterpret_as_s16(a)); } +inline bool v_check_all(const v_uint32x4 &a) +{ return v_check_all(v_reinterpret_as_s32(a)); } + +template +inline bool v_check_any(const _Tpvec& a) +{ return vec_any_lt(a.val, _Tpvec().val);} +inline bool v_check_any(const v_uint8x16 &a) +{ return v_check_any(v_reinterpret_as_s8(a)); } +inline bool v_check_any(const v_uint16x8 &a) +{ return v_check_any(v_reinterpret_as_s16(a)); } +inline bool v_check_any(const v_uint32x4 &a) +{ return v_check_any(v_reinterpret_as_s32(a)); } + +////////// Other math ///////// + +/** Some frequent operations **/ +inline v_float32x4 v_sqrt(const v_float32x4& x) +{ return v_float32x4(vec_sqrt(x.val)); } +inline v_float64x2 v_sqrt(const v_float64x2& x) +{ return v_float64x2(vec_sqrt(x.val)); } + +inline v_float32x4 v_invsqrt(const v_float32x4& x) +{ return v_float32x4(vec_rsqrt(x.val)); } +inline v_float64x2 v_invsqrt(const v_float64x2& x) +{ return v_float64x2(vec_rsqrt(x.val)); } + +#define OPENCV_HAL_IMPL_VSX_MULADD(_Tpvec) \ +inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(vec_sqrt(vec_madd(a.val, a.val, vec_mul(b.val, b.val)))); } \ +inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \ 
+{ return _Tpvec(vec_madd(a.val, a.val, vec_mul(b.val, b.val))); } \ +inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ +{ return _Tpvec(vec_madd(a.val, b.val, c.val)); } + +OPENCV_HAL_IMPL_VSX_MULADD(v_float32x4) +OPENCV_HAL_IMPL_VSX_MULADD(v_float64x2) + +// TODO: exp, log, sin, cos + +/** Absolute values **/ +inline v_uint8x16 v_abs(const v_int8x16& x) +{ return v_uint8x16(vec_uchar16_c(vec_abs(x.val))); } + +inline v_uint16x8 v_abs(const v_int16x8& x) +{ return v_uint16x8(vec_ushort8_c(vec_abs(x.val))); } + +inline v_uint32x4 v_abs(const v_int32x4& x) +{ return v_uint32x4(vec_uint4_c(vec_abs(x.val))); } + +inline v_float32x4 v_abs(const v_float32x4& x) +{ return v_float32x4(vec_abs(x.val)); } + +inline v_float64x2 v_abs(const v_float64x2& x) +{ return v_float64x2(vec_abs(x.val)); } + +OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_absdiff, vec_absd) + +#define OPENCV_HAL_IMPL_VSX_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \ +inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec2(cast(intrin(a.val, b.val))); } + +OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int8x16, v_uint8x16, vec_uchar16_c, v_absdiff, vec_absd) +OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int16x8, v_uint16x8, vec_ushort8_c, v_absdiff, vec_absd) +OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int32x4, v_uint32x4, vec_uint4_c, v_absdiff, vec_absd) +OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int64x2, v_uint64x2, vec_udword2_c, v_absdiff, vec_absd) + +////////// Conversions ///////// + +/** Rounding **/ +inline v_int32x4 v_round(const v_float32x4& a) +{ return v_int32x4(vec_cts(vec_round(a.val), 0)); } + +inline v_int32x4 v_round(const v_float64x2& a) +{ + static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0}; + return v_int32x4(vec_perm(vec_int4_z, vec_ctsw(vec_round(a.val)), perm)); +} + +inline v_int32x4 v_floor(const v_float32x4& a) +{ return v_int32x4(vec_cts(vec_floor(a.val), 0)); } + +inline v_int32x4 v_floor(const v_float64x2& a) +{ + static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0}; + return v_int32x4(vec_perm(vec_int4_z, vec_ctsw(vec_floor(a.val)), perm)); +} + +inline v_int32x4 v_ceil(const v_float32x4& a) +{ return v_int32x4(vec_cts(vec_ceil(a.val), 0)); } + +inline v_int32x4 v_ceil(const v_float64x2& a) +{ + static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0}; + return v_int32x4(vec_perm(vec_int4_z, vec_ctsw(vec_ceil(a.val)), perm)); +} + +inline v_int32x4 v_trunc(const v_float32x4& a) +{ return v_int32x4(vec_cts(a.val, 0)); } + +inline v_int32x4 v_trunc(const v_float64x2& a) +{ + static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0}; + return v_int32x4(vec_perm(vec_int4_z, vec_ctsw(a.val), perm)); +} + +/** To float **/ +inline v_float32x4 v_cvt_f32(const v_int32x4& a) +{ return v_float32x4(vec_ctf(a.val, 0)); } + +inline v_float32x4 v_cvt_f32(const v_float64x2& a) +{ + static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0}; + return v_float32x4(vec_perm(vec_float4_z, vec_cvf(a.val), perm)); +} +inline v_float64x2 v_cvt_f64(const v_int32x4& a) +{ + return v_float64x2(vec_ctd(vec_mergeh(a.val, a.val), 0)); +} +inline v_float64x2 v_cvt_f64_high(const v_int32x4& a) +{ + return v_float64x2(vec_ctd(vec_mergel(a.val, a.val), 0)); +} +inline v_float64x2 v_cvt_f64(const v_float32x4& a) +{ + return v_float64x2(vec_cvf(vec_mergeh(a.val, a.val))); +} +inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) +{ + return v_float64x2(vec_cvf(vec_mergel(a.val, 
a.val))); +} + +/** Reinterpret **/ +/** its up there with load and store operations **/ + +////////// Matrix operations ///////// + +inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) +{ return v_int32x4(vec_msum(a.val, b.val, vec_int4_z)); } + +inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, + const v_float32x4& m1, const v_float32x4& m2, + const v_float32x4& m3) +{ + const vec_float4 v0 = vec_splat(v.val, 0); + const vec_float4 v1 = vec_splat(v.val, 1); + const vec_float4 v2 = vec_splat(v.val, 2); + const vec_float4 v3 = vec_splat(v.val, 3); + return v_float32x4(vec_madd(v0, m0.val, vec_madd(v1, m1.val, vec_madd(v2, m2.val, vec_mul(v3, m3.val))))); +} + +#define OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(_Tpvec, _Tpvec2) \ +inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \ + const _Tpvec& a2, const _Tpvec& a3, \ + _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) \ +{ \ + _Tpvec2 a02 = vec_mergeh(a0.val, a2.val); \ + _Tpvec2 a13 = vec_mergeh(a1.val, a3.val); \ + b0.val = vec_mergeh(a02, a13); \ + b1.val = vec_mergel(a02, a13); \ + a02 = vec_mergel(a0.val, a2.val); \ + a13 = vec_mergel(a1.val, a3.val); \ + b2.val = vec_mergeh(a02, a13); \ + b3.val = vec_mergel(a02, a13); \ +} +OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(v_uint32x4, vec_uint4) +OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(v_int32x4, vec_int4) +OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(v_float32x4, vec_float4) + +//! @name Check SIMD support +//! @{ +//! @brief Check CPU capability of SIMD operation +static inline bool hasSIMD128() +{ + return (CV_CPU_HAS_SUPPORT_VSX) ? true : false; +} + +//! @} + +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END + +//! @endcond + +} + +#endif // OPENCV_HAL_VSX_HPP diff --git a/modules/core/include/opencv2/core/vsx_utils.hpp b/modules/core/include/opencv2/core/vsx_utils.hpp new file mode 100644 index 0000000000..3ce190b9b6 --- /dev/null +++ b/modules/core/include/opencv2/core/vsx_utils.hpp @@ -0,0 +1,945 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2013, OpenCV Foundation, all rights reserved. +// Copyright (C) 2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. 
+// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef OPENCV_HAL_VSX_UTILS_HPP +#define OPENCV_HAL_VSX_UTILS_HPP + +#include "opencv2/core/cvdef.h" + +//! @addtogroup core_utils_vsx +//! @{ +#if CV_VSX + +#define FORCE_INLINE(tp) extern inline tp __attribute__((always_inline)) + +#define VSX_REDIRECT_1RG(rt, rg, fnm, fn2) \ +FORCE_INLINE(rt) fnm(const rg& a) { return fn2(a); } + +#define VSX_REDIRECT_2RG(rt, rg, fnm, fn2) \ +FORCE_INLINE(rt) fnm(const rg& a, const rg& b) { return fn2(a, b); } + +#define VSX_IMPL_PERM(rt, fnm, ...) \ +FORCE_INLINE(rt) fnm(const rt& a, const rt& b) \ + { static const vec_uchar16 perm = {__VA_ARGS__}; return vec_perm(a, b, perm); } + +#define __VSX_S16__(c, v) (c){v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v} +#define __VSX_S8__(c, v) (c){v, v, v, v, v, v, v, v} +#define __VSX_S4__(c, v) (c){v, v, v, v} +#define __VSX_S2__(c, v) (c){v, v} + +typedef __vector unsigned char vec_uchar16; +#define vec_uchar16_set(...) (vec_uchar16){__VA_ARGS__} +#define vec_uchar16_sp(c) (__VSX_S16__(vec_uchar16, c)) +#define vec_uchar16_c(v) ((vec_uchar16)(v)) +#define vec_uchar16_mx vec_uchar16_sp(0xFF) +#define vec_uchar16_mn vec_uchar16_sp(0) +#define vec_uchar16_z vec_uchar16_mn + +typedef __vector signed char vec_char16; +#define vec_char16_set(...) (vec_char16){__VA_ARGS__} +#define vec_char16_sp(c) (__VSX_S16__(vec_char16, c)) +#define vec_char16_c(v) ((vec_char16)(v)) +#define vec_char16_mx vec_char16_sp(0x7F) +#define vec_char16_mn vec_char16_sp(-0x7F-1) +#define vec_char16_z vec_char16_sp(0) + +typedef __vector unsigned short vec_ushort8; +#define vec_ushort8_set(...) (vec_ushort8){__VA_ARGS__} +#define vec_ushort8_sp(c) (__VSX_S8__(vec_ushort8, c)) +#define vec_ushort8_c(v) ((vec_ushort8)(v)) +#define vec_ushort8_mx vec_ushort8_sp(0xFFFF) +#define vec_ushort8_mn vec_ushort8_sp(0) +#define vec_ushort8_z vec_ushort8_mn + +typedef __vector signed short vec_short8; +#define vec_short8_set(...) (vec_short8){__VA_ARGS__} +#define vec_short8_sp(c) (__VSX_S8__(vec_short8, c)) +#define vec_short8_c(v) ((vec_short8)(v)) +#define vec_short8_mx vec_short8_sp(0x7FFF) +#define vec_short8_mn vec_short8_sp(-0x7FFF-1) +#define vec_short8_z vec_short8_sp(0) + +typedef __vector unsigned int vec_uint4; +#define vec_uint4_set(...) (vec_uint4){__VA_ARGS__} +#define vec_uint4_sp(c) (__VSX_S4__(vec_uint4, c)) +#define vec_uint4_c(v) ((vec_uint4)(v)) +#define vec_uint4_mx vec_uint4_sp(0xFFFFFFFFU) +#define vec_uint4_mn vec_uint4_sp(0) +#define vec_uint4_z vec_uint4_mn + +typedef __vector signed int vec_int4; +#define vec_int4_set(...) 
(vec_int4){__VA_ARGS__} +#define vec_int4_sp(c) (__VSX_S4__(vec_int4, c)) +#define vec_int4_c(v) ((vec_int4)(v)) +#define vec_int4_mx vec_int4_sp(0x7FFFFFFF) +#define vec_int4_mn vec_int4_sp(-0x7FFFFFFF-1) +#define vec_int4_z vec_int4_sp(0) + +typedef __vector float vec_float4; +#define vec_float4_set(...) (vec_float4){__VA_ARGS__} +#define vec_float4_sp(c) (__VSX_S4__(vec_float4, c)) +#define vec_float4_c(v) ((vec_float4)(v)) +#define vec_float4_mx vec_float4_sp(3.40282347E+38F) +#define vec_float4_mn vec_float4_sp(1.17549435E-38F) +#define vec_float4_z vec_float4_sp(0) + +typedef __vector unsigned long long vec_udword2; +#define vec_udword2_set(...) (vec_udword2){__VA_ARGS__} +#define vec_udword2_sp(c) (__VSX_S2__(vec_udword2, c)) +#define vec_udword2_c(v) ((vec_udword2)(v)) +#define vec_udword2_mx vec_udword2_sp(18446744073709551615ULL) +#define vec_udword2_mn vec_udword2_sp(0) +#define vec_udword2_z vec_udword2_mn + +typedef __vector signed long long vec_dword2; +#define vec_dword2_set(...) (vec_dword2){__VA_ARGS__} +#define vec_dword2_sp(c) (__VSX_S2__(vec_dword2, c)) +#define vec_dword2_c(v) ((vec_dword2)(v)) +#define vec_dword2_mx vec_dword2_sp(9223372036854775807LL) +#define vec_dword2_mn vec_dword2_sp(-9223372036854775807LL-1) +#define vec_dword2_z vec_dword2_sp(0) + +typedef __vector double vec_double2; +#define vec_double2_set(...) (vec_double2){__VA_ARGS__} +#define vec_double2_c(v) ((vec_double2)(v)) +#define vec_double2_sp(c) (__VSX_S2__(vec_double2, c)) +#define vec_double2_mx vec_double2_sp(1.7976931348623157E+308) +#define vec_double2_mn vec_double2_sp(2.2250738585072014E-308) +#define vec_double2_z vec_double2_sp(0) + +#define vec_bchar16 __vector __bool char +#define vec_bchar16_set(...) (vec_bchar16){__VA_ARGS__} +#define vec_bchar16_c(v) ((vec_bchar16)(v)) +#define vec_bchar16_f (__VSX_S16__(vec_bchar16, 0)) +#define vec_bchar16_t (__VSX_S16__(vec_bchar16, 1)) + +#define vec_bshort8 __vector __bool short +#define vec_bshort8_set(...) (vec_bshort8){__VA_ARGS__} +#define vec_bshort8_c(v) ((vec_bshort8)(v)) +#define vec_bshort8_f (__VSX_S8__(vec_bshort8, 0)) +#define vec_bshort8_t (__VSX_S8__(vec_bshort8, 1)) + +#define vec_bint4 __vector __bool int +#define vec_bint4_set(...) (vec_bint4){__VA_ARGS__} +#define vec_bint4_c(v) ((vec_bint4)(v)) +#define vec_bint4_f (__VSX_S4__(vec_bint4, 0)) +#define vec_bint4_t (__VSX_S4__(vec_bint4, 1)) + +#define vec_bdword2 __vector __bool long long +#define vec_bdword2_set(...) 
(vec_bdword2){__VA_ARGS__}
+#define vec_bdword2_c(v) ((vec_bdword2)(v))
+#define vec_bdword2_f (__VSX_S2__(vec_bdword2, 0))
+#define vec_bdword2_t (__VSX_S2__(vec_bdword2, 1))
+
+/*
+ * GCC VSX compatibility
+**/
+#if defined(__GNUG__) && !defined(__IBMCPP__) && !defined(__clang__)
+
+// inline asm helper
+#define VSX_IMPL_1RG(rt, rto, rg, rgo, opc, fnm) \
+FORCE_INLINE(rt) fnm(const rg& a) \
+{ rt rs; __asm__ __volatile__(#opc" %x0,%x1" : "="#rto (rs) : #rgo (a)); return rs; }
+
+#define VSX_IMPL_1VRG(rt, rg, opc, fnm) \
+FORCE_INLINE(rt) fnm(const rg& a) \
+{ rt rs; __asm__ __volatile__(#opc" %0,%1" : "=v" (rs) : "v" (a)); return rs; }
+
+#define VSX_IMPL_2VRG_F(rt, rg, fopc, fnm) \
+FORCE_INLINE(rt) fnm(const rg& a, const rg& b) \
+{ rt rs; __asm__ __volatile__(fopc : "=v" (rs) : "v" (a), "v" (b)); return rs; }
+
+#define VSX_IMPL_2VRG(rt, rg, opc, fnm) VSX_IMPL_2VRG_F(rt, rg, #opc" %0,%1,%2", fnm)
+
+#if __GNUG__ < 7
+/* up to GCC 6, vec_mul only supports single/double precision and long long */
+# ifdef vec_mul
+#  undef vec_mul
+# endif
+/*
+ * ISA 2.07 has no direct instruction for 16-bit multiplication;
+ * like XLC, implement it with "multiply even", "multiply odd" and a permute.
+ * TODO: do we need to support 8-bit as well?
+**/
+# define VSX_IMPL_MULH(Tvec, Tcast) \
+  FORCE_INLINE(Tvec) vec_mul(const Tvec& a, const Tvec& b) \
+  { \
+      static const vec_uchar16 even_perm = {0, 1, 16, 17, 4, 5, 20, 21, \
+                                            8, 9, 24, 25, 12, 13, 28, 29}; \
+      return vec_perm(Tcast(vec_mule(a, b)), Tcast(vec_mulo(a, b)), even_perm); \
+  }
+  VSX_IMPL_MULH(vec_short8, vec_short8_c)
+  VSX_IMPL_MULH(vec_ushort8, vec_ushort8_c)
+  /* vmuluwm works for unsigned and signed integers alike, per the ISA */
+  VSX_IMPL_2VRG(vec_int4, vec_int4, vmuluwm, vec_mul)
+  VSX_IMPL_2VRG(vec_uint4, vec_uint4, vmuluwm, vec_mul)
+  /* redirect to the GCC builtin vec_mul, which already supports single/double precision and long long */
+  VSX_REDIRECT_2RG(vec_float4, vec_float4, vec_mul, __builtin_vec_mul)
+  VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mul, __builtin_vec_mul)
+  VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mul, __builtin_vec_mul)
+  VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mul, __builtin_vec_mul)
+#endif // __GNUG__ < 7
+
+#if __GNUG__ < 6
+/*
+ * Instruction "compare greater than or equal" in ISA 2.07 only supports single
+ * and double precision;
+ * like XLC and newer GCC, implement the integer variants with "compare greater than" plus NOR.
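+ *
+ * Equivalence sketch for the integer emulation below:
+ *   vec_cmpge(a, b) == vec_nor(vec_cmpgt(b, a), vec_cmpgt(b, a)),
+ * i.e. a >= b <=> !(b > a), which is what the vcmpgt* + xxlnor pair computes.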
+**/ +# ifdef vec_cmpge +# undef vec_cmpge +# endif +# ifdef vec_cmple +# undef vec_cmple +# endif +# define vec_cmple(a, b) vec_cmpge(b, a) +# define VSX_IMPL_CMPGE(rt, rg, opc, fnm) \ + VSX_IMPL_2VRG_F(rt, rg, #opc" %0,%2,%1\n\t xxlnor %x0,%x0,%x0", fnm) + + VSX_IMPL_CMPGE(vec_bchar16, vec_char16, vcmpgtsb, vec_cmpge) + VSX_IMPL_CMPGE(vec_bchar16, vec_uchar16, vcmpgtub, vec_cmpge) + VSX_IMPL_CMPGE(vec_bshort8, vec_short8, vcmpgtsh, vec_cmpge) + VSX_IMPL_CMPGE(vec_bshort8, vec_ushort8, vcmpgtuh, vec_cmpge) + VSX_IMPL_CMPGE(vec_bint4, vec_int4, vcmpgtsw, vec_cmpge) + VSX_IMPL_CMPGE(vec_bint4, vec_uint4, vcmpgtuw, vec_cmpge) + VSX_IMPL_CMPGE(vec_bdword2, vec_dword2, vcmpgtsd, vec_cmpge) + VSX_IMPL_CMPGE(vec_bdword2, vec_udword2, vcmpgtud, vec_cmpge) + /* redirect to GCC builtin cmpge, since it already supports precisions */ + VSX_REDIRECT_2RG(vec_bint4, vec_float4, vec_cmpge, __builtin_vec_cmpge) + VSX_REDIRECT_2RG(vec_bdword2, vec_double2, vec_cmpge, __builtin_vec_cmpge) + +// up to gcc5 vec_nor doesn't support bool long long +# undef vec_nor +template +VSX_REDIRECT_2RG(T, T, vec_nor, __builtin_vec_nor) + +FORCE_INLINE(vec_bdword2) vec_nor(const vec_bdword2& a, const vec_bdword2& b) +{ return vec_bdword2_c(__builtin_vec_nor(vec_dword2_c(a), vec_dword2_c(b))); } +#endif // __GNUG__ < 6 + +// vector population count +#ifndef vec_popcnt + VSX_IMPL_1VRG(vec_uchar16, vec_uchar16, vpopcntb, vec_popcnt) + VSX_IMPL_1VRG(vec_uchar16, vec_char16, vpopcntb, vec_popcnt) + VSX_IMPL_1VRG(vec_ushort8, vec_ushort8, vpopcnth, vec_popcnt) + VSX_IMPL_1VRG(vec_ushort8, vec_short8, vpopcnth, vec_popcnt) + VSX_IMPL_1VRG(vec_uint4, vec_uint4, vpopcntw, vec_popcnt) + VSX_IMPL_1VRG(vec_uint4, vec_int4, vpopcntw, vec_popcnt) + VSX_IMPL_1VRG(vec_udword2, vec_udword2, vpopcntd, vec_popcnt) + VSX_IMPL_1VRG(vec_udword2, vec_dword2, vpopcntd, vec_popcnt) +#endif // vec_popcnt + +#if __GNUG__ < 5 +// vec_xxpermdi in gcc4 missing little-endian supports just like clang +# define vec_permi(a, b, c) vec_xxpermdi(b, a, (3 ^ ((c & 1) << 1 | c >> 1))) +// vec_packs doesn't support double words in gcc4 +# undef vec_packs +VSX_REDIRECT_2RG(vec_char16, vec_short8, vec_packs, __builtin_vec_packs) +VSX_REDIRECT_2RG(vec_uchar16, vec_ushort8, vec_packs, __builtin_vec_packs) +VSX_REDIRECT_2RG(vec_short8, vec_int4, vec_packs, __builtin_vec_packs) +VSX_REDIRECT_2RG(vec_ushort8, vec_uint4, vec_packs, __builtin_vec_packs) +VSX_IMPL_2VRG_F(vec_int4, vec_dword2, "vpksdss %0,%2,%1", vec_packs) +VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs) +#else +# define vec_permi vec_xxpermdi +#endif + +// converts between single and double-precision +#ifndef vec_cvf + VSX_REDIRECT_1RG(vec_float4, vec_double2, vec_cvf, __builtin_vsx_xvcvdpsp) + FORCE_INLINE(vec_double2) vec_cvf(const vec_float4& a) + { return __builtin_vsx_xvcvspdp(vec_sld(a, a, 4)); } +#endif + +// converts 32 and 64 bit integers to double-precision +#ifndef vec_ctd +# define vec_ctd(a, b) __vec_ctd(a) + VSX_IMPL_1RG(vec_double2, wd, vec_int4, wa, xvcvsxwdp, __vec_ctd) + VSX_IMPL_1RG(vec_double2, wd, vec_uint4, wa, xvcvuxwdp, __vec_ctd) + VSX_IMPL_1RG(vec_double2, wd, vec_dword2, wi, xvcvsxddp, __vec_ctd) + VSX_IMPL_1RG(vec_double2, wd, vec_udword2, wi, xvcvuxddp, __vec_ctd) +#endif + +// shift left double by word immediate +#ifndef vec_sldw +# define vec_sldw __builtin_vsx_xxsldwi +#endif + +// just in case if GCC doesn't define it +#ifndef vec_xl +# define vec_xl vec_vsx_ld +# define vec_xst vec_vsx_st +#endif + +#endif // GCC VSX compatibility + +/* + * CLANG 
VSX compatibility +**/ +#if defined(__clang__) && !defined(__IBMCPP__) + +/* + * CLANG doesn't support %x in the inline asm template which fixes register number + * when using any of the register constraints wa, wd, wf + * + * For more explanation checkout PowerPC and IBM RS6000 in https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html + * Also there's already an open bug https://bugs.llvm.org/show_bug.cgi?id=31837 + * + * So we're not able to use inline asm and only use built-in functions that CLANG supports +*/ + +#if __clang_major__ < 5 +// implement vec_permi in a dirty way +# define VSX_IMPL_CLANG_4_PERMI(Tvec) \ + FORCE_INLINE(Tvec) vec_permi(const Tvec& a, const Tvec& b, unsigned const char c) \ + { \ + switch (c) \ + { \ + case 0: \ + return vec_mergeh(a, b); \ + case 1: \ + return vec_mergel(vec_mergeh(a, a), b); \ + case 2: \ + return vec_mergeh(vec_mergel(a, a), b); \ + default: \ + return vec_mergel(a, b); \ + } \ + } + VSX_IMPL_CLANG_4_PERMI(vec_udword2) + VSX_IMPL_CLANG_4_PERMI(vec_dword2) + VSX_IMPL_CLANG_4_PERMI(vec_double2) + +// vec_xxsldwi is missing in clang 4 +# define vec_xxsldwi(a, b, c) vec_sld(a, b, (c) * 4) +#else +// vec_xxpermdi is missing little-endian supports in clang 4 just like gcc4 +# define vec_permi(a, b, c) vec_xxpermdi(b, a, (3 ^ ((c & 1) << 1 | c >> 1))) +#endif // __clang_major__ < 5 + +// shift left double by word immediate +#ifndef vec_sldw +# define vec_sldw vec_xxsldwi +#endif + +/* converts between single and double precision */ +#ifndef vec_cvf + VSX_REDIRECT_1RG(vec_float4, vec_double2, vec_cvf, __builtin_vsx_xvcvdpsp) + FORCE_INLINE(vec_double2) vec_cvf(const vec_float4& a) + { return __builtin_vsx_xvcvspdp(vec_sld(a, a, 4)); } +#endif + +/* converts 32 and 64 bit integers to double-precision */ +#ifndef vec_ctd +# define vec_ctd(a, b) __vec_ctd(a) + VSX_REDIRECT_1RG(vec_double2, vec_int4, __vec_ctd, __builtin_vsx_xvcvsxwdp) + VSX_REDIRECT_1RG(vec_double2, vec_uint4, __vec_ctd, __builtin_vsx_xvcvuxwdp) + // implement vec_ctd for double word in a dirty way since we are missing builtin xvcvsxddp, xvcvuxddp + // please try to avoid using it for double words + FORCE_INLINE(vec_double2) __vec_ctd(const vec_dword2& a) + { return vec_double2_set((double)vec_extract(a, 0), (double)vec_extract(a, 1)); } + FORCE_INLINE(vec_double2) __vec_ctd(const vec_udword2& a) + { return vec_double2_set((double)vec_extract(a, 0), (double)vec_extract(a, 1)); } +#endif + +// Implement vec_rsqrt since clang only supports vec_rsqrte +#ifndef vec_rsqrt + FORCE_INLINE(vec_float4) vec_rsqrt(const vec_float4& a) + { return vec_div(vec_float4_sp(1), vec_sqrt(a)); } + + FORCE_INLINE(vec_double2) vec_rsqrt(const vec_double2& a) + { return vec_div(vec_double2_sp(1), vec_sqrt(a)); } +#endif + + +/* + * __builtin_altivec_vctsxs in clang 5 and 6 causes ambiguous which used by vec_cts + * so we just redefine it and cast it +*/ +#if __clang_major__ > 4 +# undef vec_cts +# define vec_cts(__a, __b) \ + _Generic((__a), vector float \ + : (vector signed int)__builtin_altivec_vctsxs((__a), (__b)), vector double \ + : __extension__({ \ + vector double __ret = \ + (__a) * \ + (vector double)(vector unsigned long long)((0x3ffULL + (__b)) \ + << 52); \ + __builtin_convertvector(__ret, vector signed long long); \ + })) +#endif // __clang_major__ > 4 + +#endif // CLANG VSX compatibility + +/* + * implement vsx_ld(offset, pointer), vsx_st(vector, offset, pointer) + * load and set using offset depend on the pointer type + * + * implement vsx_ldf(offset, pointer), vsx_stf(vector, offset, 
+/*
+ * implement vsx_ld(offset, pointer), vsx_st(vector, offset, pointer)
+ * load and store using an offset that depends on the size of the pointed-to type
+ *
+ * implement vsx_ldf(offset, pointer), vsx_stf(vector, offset, pointer)
+ * load and store using an offset in fixed bytes
+ *
+ * Note: in clang, vec_xl and vec_xst fail to load unaligned addresses,
+ * so we are using vec_vsx_ld and vec_vsx_st instead
+*/
+
+#if defined(__clang__) && !defined(__IBMCPP__)
+# define vsx_ldf vec_vsx_ld
+# define vsx_stf vec_vsx_st
+#else // GCC, XLC
+# define vsx_ldf vec_xl
+# define vsx_stf vec_xst
+#endif
+
+#define VSX_OFFSET(o, p) ((o) * sizeof(*(p)))
+#define vsx_ld(o, p) vsx_ldf(VSX_OFFSET(o, p), p)
+#define vsx_st(v, o, p) vsx_stf(v, VSX_OFFSET(o, p), p)
+
+/*
+ * implement vsx_ld2(offset, pointer), vsx_st2(vector, offset, pointer) to load and store double words
+ * in GCC, vec_xl and vec_xst map to vec_vsx_ld and vec_vsx_st, which don't support long long,
+ * and in CLANG we are using vec_vsx_ld and vec_vsx_st because vec_xl and vec_xst fail to load unaligned addresses
+ *
+ * In XLC, vec_xl and vec_xst fail to cast int64 (long int) to long long
+*/
+#if (defined(__GNUG__) || defined(__clang__)) && !defined(__IBMCPP__)
+  FORCE_INLINE(vec_udword2) vsx_ld2(long o, const uint64* p)
+  { return vec_udword2_c(vsx_ldf(VSX_OFFSET(o, p), (unsigned int*)p)); }
+
+  FORCE_INLINE(vec_dword2) vsx_ld2(long o, const int64* p)
+  { return vec_dword2_c(vsx_ldf(VSX_OFFSET(o, p), (int*)p)); }
+
+  FORCE_INLINE(void) vsx_st2(const vec_udword2& vec, long o, uint64* p)
+  { vsx_stf(vec_uint4_c(vec), VSX_OFFSET(o, p), (unsigned int*)p); }
+
+  FORCE_INLINE(void) vsx_st2(const vec_dword2& vec, long o, int64* p)
+  { vsx_stf(vec_int4_c(vec), VSX_OFFSET(o, p), (int*)p); }
+#else // XLC
+  FORCE_INLINE(vec_udword2) vsx_ld2(long o, const uint64* p)
+  { return vsx_ldf(VSX_OFFSET(o, p), (unsigned long long*)p); }
+
+  FORCE_INLINE(vec_dword2) vsx_ld2(long o, const int64* p)
+  { return vsx_ldf(VSX_OFFSET(o, p), (long long*)p); }
+
+  FORCE_INLINE(void) vsx_st2(const vec_udword2& vec, long o, uint64* p)
+  { vsx_stf(vec, VSX_OFFSET(o, p), (unsigned long long*)p); }
+
+  FORCE_INLINE(void) vsx_st2(const vec_dword2& vec, long o, int64* p)
+  { vsx_stf(vec, VSX_OFFSET(o, p), (long long*)p); }
+#endif
+
+#if defined(__clang__) || defined(__IBMCPP__)
+  // GCC can work out the cast from long int on its own, while XLC and CLANG report an ambiguity
+  FORCE_INLINE(vec_udword2) vec_splats(uint64 v)
+  { return vec_splats((unsigned long long) v); }
+
+  FORCE_INLINE(vec_dword2) vec_splats(int64 v)
+  { return vec_splats((long long) v); }
+#endif
+
+// implement store of vector bool char for XLC
+#if defined(__IBMCPP__) && defined(__clang__)
+  FORCE_INLINE(void) vec_xst(const vec_bchar16 &vec, long o, uchar* p)
+  { vec_xst(vec_uchar16_c(vec), VSX_OFFSET(o, p), p); }
+#endif
+
+// Working around vec_popcnt compatibility
+/*
+ * vec_popcnt should return an unsigned type, but clang disagrees, just like gcc does with vec_vpopcnt
+ *
+ * use vec_popcntu instead to deal with it
+*/
+#if defined(__clang__) && !defined(__IBMCPP__)
+# define VSX_IMPL_CLANG_POPCNTU(Tvec, Tvec2, ucast) \
+  FORCE_INLINE(Tvec) vec_popcntu(const Tvec2& a)    \
+  { return ucast(vec_popcnt(a)); }
+
+  VSX_IMPL_CLANG_POPCNTU(vec_uchar16, vec_char16, vec_uchar16_c);
+  VSX_IMPL_CLANG_POPCNTU(vec_ushort8, vec_short8, vec_ushort8_c);
+  VSX_IMPL_CLANG_POPCNTU(vec_uint4, vec_int4, vec_uint4_c);
+  // redirect the unsigned types
+  VSX_REDIRECT_1RG(vec_uchar16, vec_uchar16, vec_popcntu, vec_popcnt)
+  VSX_REDIRECT_1RG(vec_ushort8, vec_ushort8, vec_popcntu, vec_popcnt)
+  VSX_REDIRECT_1RG(vec_uint4, vec_uint4, vec_popcntu, vec_popcnt)
+#else
+# define vec_popcntu vec_popcnt
+#endif
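+
+/*
+ * Usage sketch for vec_popcntu (the input value is hypothetical): it counts
+ * the set bits per lane and is intended to always yield the unsigned vector
+ * type, regardless of the compiler's vec_popcnt signature.
+ *
+ *   vec_char16  v = vec_splats((signed char)0x0F); // 0b00001111 in every lane
+ *   vec_uchar16 n = vec_popcntu(v);                // every lane holds 4
+ */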
+// Working around vec_cts compatibility
+/*
+ * vec_cts in gcc and clang converts single precision to signed fixed-point words
+ * and double precision to signed doublewords; there's also no implementation of vec_ctsl
+ *
+ * vec_cts in xlc converts single and double precision to signed fixed-point words,
+ * and xlc has vec_ctsl, which converts single and double precision to signed doublewords
+ *
+ * so to deal with this situation, use vec_cts only to convert single precision to signed fixed-point words,
+ * and use vec_ctsl to convert double precision to signed doublewords
+ *
+ * we also implemented vec_ctsw(a) to convert double precision to signed fixed-point words
+*/
+
+// converts double precision to signed doublewords for GCC and CLANG
+#if !defined(vec_ctsl) && !defined(__IBMCPP__) && (defined(__GNUG__) || defined(__clang__))
+// GCC4 produces incorrect results when converting to signed doublewords
+# if !defined(__clang__) && __GNUG__ < 5
+# define vec_ctsl(a, b) __vec_ctsl(a)
+  VSX_IMPL_1RG(vec_dword2, wi, vec_double2, wd, xvcvdpsxds, __vec_ctsl)
+# else // GCC > 4, CLANG
+# define vec_ctsl vec_cts
+# endif
+#endif
+
+// converts double precision to signed fixed-point words
+#if defined(__IBMCPP__)
+# define vec_ctsw(a) vec_cts(a, 0)
+#else // GCC, CLANG
+# define vec_ctsw(a) vec_int4_c(__builtin_vsx_xvcvdpsxws(a))
+#endif
+
+// load 4 unsigned bytes into a uint4 vector
+#define vec_ld_buw(p) vec_uint4_set((p)[0], (p)[1], (p)[2], (p)[3])
+
+// load 4 signed bytes into an int4 vector
+#define vec_ld_bsw(p) vec_int4_set((p)[0], (p)[1], (p)[2], (p)[3])
+
+// load 4 unsigned bytes into a float vector
+#define vec_ld_bps(p) vec_ctf(vec_ld_buw(p), 0)
+
+// store the lower 8 bytes
+#define vec_st_l8(v, p) *((uint64*)(p)) = vec_extract(vec_udword2_c(v), 0)
+
+// store the higher 8 bytes
+#define vec_st_h8(v, p) *((uint64*)(p)) = vec_extract(vec_udword2_c(v), 1)
+
+/*
+ * vec_ld_l8(ptr)  -> load 64 bits of integer data into the lower part
+ * vec_ldz_l8(ptr) -> load 64 bits of integer data into the lower part and zero the upper part
+**/
+#if defined(__clang__) && !defined(__IBMCPP__)
+# define __VSX_LOAD_L8(Tvec, p) (Tvec)((vec_udword2)*((uint64*)(p)))
+#else
+# define __VSX_LOAD_L8(Tvec, p) *((Tvec*)(p))
+#endif
+
+#define VSX_IMPL_LOAD_L8(Tvec, Tp)                                             \
+FORCE_INLINE(Tvec) vec_ld_l8(const Tp *p)                                      \
+{ return __VSX_LOAD_L8(Tvec, p); }                                             \
+FORCE_INLINE(Tvec) vec_ldz_l8(const Tp *p)                                     \
+{                                                                              \
+    static const vec_bdword2 mask = {0xFFFFFFFFFFFFFFFF, 0x0000000000000000};  \
+    return vec_and(vec_ld_l8(p), (Tvec)mask);                                  \
+}
+VSX_IMPL_LOAD_L8(vec_uchar16, uchar)
+VSX_IMPL_LOAD_L8(vec_char16, schar)
+VSX_IMPL_LOAD_L8(vec_ushort8, ushort)
+VSX_IMPL_LOAD_L8(vec_short8, short)
+VSX_IMPL_LOAD_L8(vec_uint4, uint)
+VSX_IMPL_LOAD_L8(vec_int4, int)
+VSX_IMPL_LOAD_L8(vec_float4, float)
+VSX_IMPL_LOAD_L8(vec_udword2, uint64)
+VSX_IMPL_LOAD_L8(vec_dword2, int64)
+VSX_IMPL_LOAD_L8(vec_double2, double)
+
+// logical not
+#define vec_not(a) vec_nor(a, a)
+
+// emulate instructions that are new in Power9
+// not equal
+#ifndef vec_cmpne
+# define vec_cmpne(a, b) vec_not(vec_cmpeq(a, b))
+#endif
+
+// absolute difference
+#ifndef vec_absd
+# define vec_absd(a, b) vec_sub(vec_max(a, b), vec_min(a, b))
+#endif
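+
+/*
+ * Usage sketch for vec_absd (lane values are hypothetical): it computes the
+ * absolute difference per lane without overflowing on unsigned types, since
+ * it subtracts the minimum from the maximum.
+ *
+ *   vec_uint4 a = vec_uint4_set(10, 2, 3, 4);
+ *   vec_uint4 b = vec_uint4_set(1, 20, 3, 5);
+ *   vec_uint4 d = vec_absd(a, b); // {9, 18, 0, 1}
+ */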
+/*
+ * Implement vec_unpacklu and vec_unpackhu,
+ * since vec_unpackl and vec_unpackh only support signed integers
+**/
+#define VSX_IMPL_UNPACKU(rt, rg, zero)                 \
+FORCE_INLINE(rt) vec_unpacklu(const rg& a)             \
+{ return reinterpret_cast<rt>(vec_mergel(a, zero)); }  \
+FORCE_INLINE(rt) vec_unpackhu(const rg& a)             \
+{ return reinterpret_cast<rt>(vec_mergeh(a, zero)); }
+
+VSX_IMPL_UNPACKU(vec_ushort8, vec_uchar16, vec_uchar16_z)
+VSX_IMPL_UNPACKU(vec_uint4, vec_ushort8, vec_ushort8_z)
+VSX_IMPL_UNPACKU(vec_udword2, vec_uint4, vec_uint4_z)
+
+/*
+ * Implement vec_mergesqe and vec_mergesqo,
+ * which merge the even-indexed and the odd-indexed elements of two vectors, in sequence
+*/
+// 16 lanes
+#define perm16_mergesqe 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+#define perm16_mergesqo 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+VSX_IMPL_PERM(vec_uchar16, vec_mergesqe, perm16_mergesqe)
+VSX_IMPL_PERM(vec_uchar16, vec_mergesqo, perm16_mergesqo)
+VSX_IMPL_PERM(vec_char16, vec_mergesqe, perm16_mergesqe)
+VSX_IMPL_PERM(vec_char16, vec_mergesqo, perm16_mergesqo)
+// 8 lanes
+#define perm8_mergesqe 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
+#define perm8_mergesqo 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
+VSX_IMPL_PERM(vec_ushort8, vec_mergesqe, perm8_mergesqe)
+VSX_IMPL_PERM(vec_ushort8, vec_mergesqo, perm8_mergesqo)
+VSX_IMPL_PERM(vec_short8, vec_mergesqe, perm8_mergesqe)
+VSX_IMPL_PERM(vec_short8, vec_mergesqo, perm8_mergesqo)
+// 4 lanes
+#define perm4_mergesqe 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+#define perm4_mergesqo 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+VSX_IMPL_PERM(vec_uint4, vec_mergesqe, perm4_mergesqe)
+VSX_IMPL_PERM(vec_uint4, vec_mergesqo, perm4_mergesqo)
+VSX_IMPL_PERM(vec_int4, vec_mergesqe, perm4_mergesqe)
+VSX_IMPL_PERM(vec_int4, vec_mergesqo, perm4_mergesqo)
+VSX_IMPL_PERM(vec_float4, vec_mergesqe, perm4_mergesqe)
+VSX_IMPL_PERM(vec_float4, vec_mergesqo, perm4_mergesqo)
+// 2 lanes
+VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqe, vec_mergeh)
+VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqo, vec_mergel)
+VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mergesqe, vec_mergeh)
+VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mergesqo, vec_mergel)
+VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqe, vec_mergeh)
+VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqo, vec_mergel)
+
+/*
+ * Implement vec_mergesqh and vec_mergesql,
+ * which merge the most significant and the least significant halves of two vectors, in sequence
+*/
+#define VSX_IMPL_MERGESQHL(Tvec)                                    \
+FORCE_INLINE(Tvec) vec_mergesqh(const Tvec& a, const Tvec& b)       \
+{ return (Tvec)vec_mergeh(vec_udword2_c(a), vec_udword2_c(b)); }    \
+FORCE_INLINE(Tvec) vec_mergesql(const Tvec& a, const Tvec& b)       \
+{ return (Tvec)vec_mergel(vec_udword2_c(a), vec_udword2_c(b)); }
+VSX_IMPL_MERGESQHL(vec_uchar16)
+VSX_IMPL_MERGESQHL(vec_char16)
+VSX_IMPL_MERGESQHL(vec_ushort8)
+VSX_IMPL_MERGESQHL(vec_short8)
+VSX_IMPL_MERGESQHL(vec_uint4)
+VSX_IMPL_MERGESQHL(vec_int4)
+VSX_IMPL_MERGESQHL(vec_float4)
+VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqh, vec_mergeh)
+VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesql, vec_mergel)
+VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mergesqh, vec_mergeh)
+VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mergesql, vec_mergel)
+VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqh, vec_mergeh)
+VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesql, vec_mergel)
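+
+/*
+ * Behavior sketch, derived from the 8-lane permute tables above (lane names
+ * are illustrative): for a = {a0..a7} and b = {b0..b7},
+ *
+ *   vec_mergesqe(a, b); // {a0, a2, a4, a6, b0, b2, b4, b6}
+ *   vec_mergesqo(a, b); // {a1, a3, a5, a7, b1, b3, b5, b7}
+ */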
+// 2 and 4 channels interleave for all types, except the 2-lane ones
+#define VSX_IMPL_ST_INTERLEAVE(Tp, Tvec)                                     \
+FORCE_INLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, Tp* ptr)  \
+{                                                                            \
+    vsx_stf(vec_mergeh(a, b), 0, ptr);                                       \
+    vsx_stf(vec_mergel(a, b), 16, ptr);                                      \
+}                                                                            \
+FORCE_INLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b,           \
+                                     const Tvec& c, const Tvec& d, Tp* ptr)  \
+{                                                                            \
+    Tvec ac = vec_mergeh(a, c);                                              \
+    Tvec bd = vec_mergeh(b, d);                                              \
+    vsx_stf(vec_mergeh(ac, bd), 0, ptr);                                     \
+    vsx_stf(vec_mergel(ac, bd), 16, ptr);                                    \
+    ac = vec_mergel(a, c);                                                   \
+    bd = vec_mergel(b, d);                                                   \
+    vsx_stf(vec_mergeh(ac, bd), 32, ptr);                                    \
+    vsx_stf(vec_mergel(ac, bd), 48, ptr);                                    \
+}
+VSX_IMPL_ST_INTERLEAVE(uchar, vec_uchar16)
+VSX_IMPL_ST_INTERLEAVE(schar, vec_char16)
+VSX_IMPL_ST_INTERLEAVE(ushort, vec_ushort8)
+VSX_IMPL_ST_INTERLEAVE(short, vec_short8)
+VSX_IMPL_ST_INTERLEAVE(uint, vec_uint4)
+VSX_IMPL_ST_INTERLEAVE(int, vec_int4)
+VSX_IMPL_ST_INTERLEAVE(float, vec_float4)
+
+// 2 and 4 channels deinterleave for 16 lanes
+#define VSX_IMPL_ST_DINTERLEAVE_8(Tp, Tvec)                                  \
+FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b)      \
+{                                                                            \
+    Tvec v0 = vsx_ld(0, ptr);                                                \
+    Tvec v1 = vsx_ld(16, ptr);                                               \
+    a = vec_mergesqe(v0, v1);                                                \
+    b = vec_mergesqo(v0, v1);                                                \
+}                                                                            \
+FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b,      \
+                                       Tvec& c, Tvec& d)                     \
+{                                                                            \
+    Tvec v0 = vsx_ld(0, ptr);                                                \
+    Tvec v1 = vsx_ld(16, ptr);                                               \
+    Tvec v2 = vsx_ld(32, ptr);                                               \
+    Tvec v3 = vsx_ld(48, ptr);                                               \
+    Tvec m0 = vec_mergesqe(v0, v1);                                          \
+    Tvec m1 = vec_mergesqe(v2, v3);                                          \
+    a = vec_mergesqe(m0, m1);                                                \
+    c = vec_mergesqo(m0, m1);                                                \
+    m0 = vec_mergesqo(v0, v1);                                               \
+    m1 = vec_mergesqo(v2, v3);                                               \
+    b = vec_mergesqe(m0, m1);                                                \
+    d = vec_mergesqo(m0, m1);                                                \
+}
+VSX_IMPL_ST_DINTERLEAVE_8(uchar, vec_uchar16)
+VSX_IMPL_ST_DINTERLEAVE_8(schar, vec_char16)
+
+// 2 and 4 channels deinterleave for 8 lanes
+#define VSX_IMPL_ST_DINTERLEAVE_16(Tp, Tvec)                                 \
+FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b)      \
+{                                                                            \
+    Tvec v0 = vsx_ld(0, ptr);                                                \
+    Tvec v1 = vsx_ld(8, ptr);                                                \
+    a = vec_mergesqe(v0, v1);                                                \
+    b = vec_mergesqo(v0, v1);                                                \
+}                                                                            \
+FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b,      \
+                                       Tvec& c, Tvec& d)                     \
+{                                                                            \
+    Tvec v0 = vsx_ld(0, ptr);                                                \
+    Tvec v1 = vsx_ld(8, ptr);                                                \
+    Tvec m0 = vec_mergeh(v0, v1);                                            \
+    Tvec m1 = vec_mergel(v0, v1);                                            \
+    Tvec ab0 = vec_mergeh(m0, m1);                                           \
+    Tvec cd0 = vec_mergel(m0, m1);                                           \
+    v0 = vsx_ld(16, ptr);                                                    \
+    v1 = vsx_ld(24, ptr);                                                    \
+    m0 = vec_mergeh(v0, v1);                                                 \
+    m1 = vec_mergel(v0, v1);                                                 \
+    Tvec ab1 = vec_mergeh(m0, m1);                                           \
+    Tvec cd1 = vec_mergel(m0, m1);                                           \
+    a = vec_mergesqh(ab0, ab1);                                              \
+    b = vec_mergesql(ab0, ab1);                                              \
+    c = vec_mergesqh(cd0, cd1);                                              \
+    d = vec_mergesql(cd0, cd1);                                              \
+}
+VSX_IMPL_ST_DINTERLEAVE_16(ushort, vec_ushort8)
+VSX_IMPL_ST_DINTERLEAVE_16(short, vec_short8)
+
+// 2 and 4 channels deinterleave for 4 lanes
+#define VSX_IMPL_ST_DINTERLEAVE_32(Tp, Tvec)                                 \
+FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b)      \
+{                                                                            \
+    a = vsx_ld(0, ptr);                                                      \
+    b = vsx_ld(4, ptr);                                                      \
+    Tvec m0 = vec_mergeh(a, b);                                              \
+    Tvec m1 = vec_mergel(a, b);                                              \
+    a = vec_mergeh(m0, m1);                                                  \
+    b = vec_mergel(m0, m1);                                                  \
+}                                                                            \
+FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b,      \
+                                       Tvec& c, Tvec& d)                     \
+{                                                                            \
+    Tvec v0 = vsx_ld(0, ptr);                                                \
+    Tvec v1 = vsx_ld(4, ptr);                                                \
+    Tvec v2 = vsx_ld(8, ptr);                                                \
+    Tvec v3 = vsx_ld(12, ptr);                                               \
+    Tvec m0 = vec_mergeh(v0, v2);                                            \
+    Tvec m1 = vec_mergeh(v1, v3);                                            \
+    a = vec_mergeh(m0, m1);                                                  \
+    b = vec_mergel(m0, m1);                                                  \
+    m0 = vec_mergel(v0, v2);                                                 \
+    m1 = vec_mergel(v1, v3);                                                 \
+    c = vec_mergeh(m0, m1);                                                  \
+    d = vec_mergel(m0, m1);                                                  \
+}
+VSX_IMPL_ST_DINTERLEAVE_32(uint, vec_uint4)
+VSX_IMPL_ST_DINTERLEAVE_32(int, vec_int4)
+VSX_IMPL_ST_DINTERLEAVE_32(float, vec_float4)
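+
+/*
+ * Usage sketch for the 4-lane deinterleave (the buffer and names are
+ * hypothetical): splitting 4 interleaved RGBA float pixels into one vector
+ * per channel.
+ *
+ *   float rgba[16]; // R0,G0,B0,A0, R1,G1,B1,A1, ...
+ *   vec_float4 r, g, b, a;
+ *   vec_ld_deinterleave(rgba, r, g, b, a); // r = {R0..R3}, g = {G0..G3}, ...
+ */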
+// 2 and 4 channels interleave and deinterleave for 2 lanes
+#define VSX_IMPL_ST_D_INTERLEAVE_64(Tp, Tvec, ld_func, st_func)              \
+FORCE_INLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, Tp* ptr)  \
+{                                                                            \
+    st_func(vec_mergeh(a, b), 0, ptr);                                       \
+    st_func(vec_mergel(a, b), 2, ptr);                                       \
+}                                                                            \
+FORCE_INLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b,           \
+                                     const Tvec& c, const Tvec& d, Tp* ptr)  \
+{                                                                            \
+    st_func(vec_mergeh(a, b), 0, ptr);                                       \
+    st_func(vec_mergel(a, b), 2, ptr);                                       \
+    st_func(vec_mergeh(c, d), 4, ptr);                                       \
+    st_func(vec_mergel(c, d), 6, ptr);                                       \
+}                                                                            \
+FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b)      \
+{                                                                            \
+    Tvec m0 = ld_func(0, ptr);                                               \
+    Tvec m1 = ld_func(2, ptr);                                               \
+    a = vec_mergeh(m0, m1);                                                  \
+    b = vec_mergel(m0, m1);                                                  \
+}                                                                            \
+FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b,      \
+                                       Tvec& c, Tvec& d)                     \
+{                                                                            \
+    Tvec v0 = ld_func(0, ptr);                                               \
+    Tvec v1 = ld_func(2, ptr);                                               \
+    a = vec_mergeh(v0, v1);                                                  \
+    b = vec_mergel(v0, v1);                                                  \
+    v0 = ld_func(4, ptr);                                                    \
+    v1 = ld_func(6, ptr);                                                    \
+    c = vec_mergeh(v0, v1);                                                  \
+    d = vec_mergel(v0, v1);                                                  \
+}
+VSX_IMPL_ST_D_INTERLEAVE_64(int64, vec_dword2, vsx_ld2, vsx_st2)
+VSX_IMPL_ST_D_INTERLEAVE_64(uint64, vec_udword2, vsx_ld2, vsx_st2)
+VSX_IMPL_ST_D_INTERLEAVE_64(double, vec_double2, vsx_ld, vsx_st)
+
+/* 3 channels */
+#define VSX_IMPL_ST_INTERLEAVE_3CH_16(Tp, Tvec)                                                   \
+FORCE_INLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b,                                \
+                                     const Tvec& c, Tp* ptr)                                      \
+{                                                                                                 \
+    static const vec_uchar16 a12 = {0, 16, 0, 1, 17, 0, 2, 18, 0, 3, 19, 0, 4, 20, 0, 5};         \
+    static const vec_uchar16 a123 = {0, 1, 16, 3, 4, 17, 6, 7, 18, 9, 10, 19, 12, 13, 20, 15};    \
+    vsx_st(vec_perm(vec_perm(a, b, a12), c, a123), 0, ptr);                                       \
+    static const vec_uchar16 b12 = {21, 0, 6, 22, 0, 7, 23, 0, 8, 24, 0, 9, 25, 0, 10, 26};       \
+    static const vec_uchar16 b123 = {0, 21, 2, 3, 22, 5, 6, 23, 8, 9, 24, 11, 12, 25, 14, 15};    \
+    vsx_st(vec_perm(vec_perm(a, b, b12), c, b123), 16, ptr);                                      \
+    static const vec_uchar16 c12 = {0, 11, 27, 0, 12, 28, 0, 13, 29, 0, 14, 30, 0, 15, 31, 0};    \
+    static const vec_uchar16 c123 = {26, 1, 2, 27, 4, 5, 28, 7, 8, 29, 10, 11, 30, 13, 14, 31};   \
+    vsx_st(vec_perm(vec_perm(a, b, c12), c, c123), 32, ptr);                                      \
+}                                                                                                 \
+FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, Tvec& c)                  \
+{                                                                                                 \
+    Tvec v1 = vsx_ld(0, ptr);                                                                     \
+    Tvec v2 = vsx_ld(16, ptr);                                                                    \
+    Tvec v3 = vsx_ld(32, ptr);                                                                    \
+    static const vec_uchar16 a12_perm = {0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 0, 0, 0, 0, 0};  \
+    static const vec_uchar16 a123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 17, 20, 23, 26, 29};  \
+    a = vec_perm(vec_perm(v1, v2, a12_perm), v3, a123_perm);                                      \
+    static const vec_uchar16 b12_perm = {1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 0, 0, 0, 0, 0}; \
+    static const vec_uchar16 b123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 18, 21, 24, 27, 30};  \
+    b = vec_perm(vec_perm(v1, v2, b12_perm), v3, b123_perm);                                      \
+    static const vec_uchar16 c12_perm = {2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 0, 0, 0, 0, 0};  \
+    static const vec_uchar16 c123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 19, 22, 25, 28, 31};  \
+    c = vec_perm(vec_perm(v1, v2, c12_perm), v3, c123_perm);                                      \
+}
+VSX_IMPL_ST_INTERLEAVE_3CH_16(uchar, vec_uchar16)
+VSX_IMPL_ST_INTERLEAVE_3CH_16(schar, vec_char16)
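+
+/*
+ * Usage sketch for the 3-channel 16-lane deinterleave (the buffer is
+ * hypothetical): splitting 16 interleaved 3-channel pixels, e.g. BGR,
+ * into one vector per channel.
+ *
+ *   uchar bgr[48]; // B0,G0,R0, B1,G1,R1, ... B15,G15,R15
+ *   vec_uchar16 b, g, r;
+ *   vec_ld_deinterleave(bgr, b, g, r); // b = {B0..B15}, etc.
+ */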
+#define VSX_IMPL_ST_INTERLEAVE_3CH_8(Tp, Tvec)                                                    \
+FORCE_INLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b,                                \
+                                     const Tvec& c, Tp* ptr)                                      \
+{                                                                                                 \
+    static const vec_uchar16 a12 = {0, 1, 16, 17, 0, 0, 2, 3, 18, 19, 0, 0, 4, 5, 20, 21};        \
+    static const vec_uchar16 a123 = {0, 1, 2, 3, 16, 17, 6, 7, 8, 9, 18, 19, 12, 13, 14, 15};     \
+    vsx_st(vec_perm(vec_perm(a, b, a12), c, a123), 0, ptr);                                       \
+    static const vec_uchar16 b12 = {0, 0, 6, 7, 22, 23, 0, 0, 8, 9, 24, 25, 0, 0, 10, 11};        \
+    static const vec_uchar16 b123 = {20, 21, 2, 3, 4, 5, 22, 23, 8, 9, 10, 11, 24, 25, 14, 15};   \
+    vsx_st(vec_perm(vec_perm(a, b, b12), c, b123), 8, ptr);                                       \
+    static const vec_uchar16 c12 = {26, 27, 0, 0, 12, 13, 28, 29, 0, 0, 14, 15, 30, 31, 0, 0};    \
+    static const vec_uchar16 c123 = {0, 1, 26, 27, 4, 5, 6, 7, 28, 29, 10, 11, 12, 13, 30, 31};   \
+    vsx_st(vec_perm(vec_perm(a, b, c12), c, c123), 16, ptr);                                      \
+}                                                                                                 \
+FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, Tvec& c)                  \
+{                                                                                                 \
+    Tvec v1 = vsx_ld(0, ptr);                                                                     \
+    Tvec v2 = vsx_ld(8, ptr);                                                                     \
+    Tvec v3 = vsx_ld(16, ptr);                                                                    \
+    static const vec_uchar16 a12_perm = {0, 1, 6, 7, 12, 13, 18, 19, 24, 25, 30, 31, 0, 0, 0, 0}; \
+    static const vec_uchar16 a123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 20, 21, 26, 27};  \
+    a = vec_perm(vec_perm(v1, v2, a12_perm), v3, a123_perm);                                      \
+    static const vec_uchar16 b12_perm = {2, 3, 8, 9, 14, 15, 20, 21, 26, 27, 0, 0, 0, 0, 0, 0};   \
+    static const vec_uchar16 b123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 17, 22, 23, 28, 29};  \
+    b = vec_perm(vec_perm(v1, v2, b12_perm), v3, b123_perm);                                      \
+    static const vec_uchar16 c12_perm = {4, 5, 10, 11, 16, 17, 22, 23, 28, 29, 0, 0, 0, 0, 0, 0}; \
+    static const vec_uchar16 c123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 18, 19, 24, 25, 30, 31};  \
+    c = vec_perm(vec_perm(v1, v2, c12_perm), v3, c123_perm);                                      \
+}
+VSX_IMPL_ST_INTERLEAVE_3CH_8(ushort, vec_ushort8)
+VSX_IMPL_ST_INTERLEAVE_3CH_8(short, vec_short8)
+
+#define VSX_IMPL_ST_INTERLEAVE_3CH_4(Tp, Tvec)                                                    \
+FORCE_INLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b,                                \
+                                     const Tvec& c, Tp* ptr)                                      \
+{                                                                                                 \
+    Tvec hbc = vec_mergeh(b, c);                                                                  \
+    static const vec_uchar16 ahbc = {0, 1, 2, 3, 16, 17, 18, 19, 20, 21, 22, 23, 4, 5, 6, 7};     \
+    vsx_st(vec_perm(a, hbc, ahbc), 0, ptr);                                                       \
+    Tvec lab = vec_mergel(a, b);                                                                  \
+    vsx_st(vec_sld(lab, hbc, 8), 4, ptr);                                                         \
+    static const vec_uchar16 clab = {8, 9, 10, 11, 24, 25, 26, 27, 28, 29, 30, 31, 12, 13, 14, 15};\
+    vsx_st(vec_perm(c, lab, clab), 8, ptr);                                                       \
+}                                                                                                 \
+FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, Tvec& c)                  \
+{                                                                                                 \
+    Tvec v1 = vsx_ld(0, ptr);                                                                     \
+    Tvec v2 = vsx_ld(4, ptr);                                                                     \
+    Tvec v3 = vsx_ld(8, ptr);                                                                     \
+    static const vec_uchar16 flp = {0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31};  \
+    a = vec_perm(v1, vec_sld(v3, v2, 8), flp);                                                    \
+    static const vec_uchar16 flp2 = {28, 29, 30, 31, 0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19}; \
+    b = vec_perm(v2, vec_sld(v1, v3, 8), flp2);                                                   \
+    c = vec_perm(vec_sld(v2, v1, 8), v3, flp);                                                    \
+}
+VSX_IMPL_ST_INTERLEAVE_3CH_4(uint, vec_uint4)
+VSX_IMPL_ST_INTERLEAVE_3CH_4(int, vec_int4)
+VSX_IMPL_ST_INTERLEAVE_3CH_4(float, vec_float4)
+
+#define VSX_IMPL_ST_INTERLEAVE_3CH_2(Tp, Tvec, ld_func, st_func)     \
+FORCE_INLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b,   \
+                                     const Tvec& c, Tp* ptr)         \
+{                                                                    \
+    st_func(vec_mergeh(a, b), 0, ptr);                               \
+    st_func(vec_permi(c, a, 1), 2, ptr);                             \
+    st_func(vec_mergel(b, c), 4, ptr);                               \
+}                                                                    \
+FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a,       \
+                                       Tvec& b, Tvec& c)             \
+{                                                                    \
+    Tvec v1 = ld_func(0, ptr);                                       \
+    Tvec v2 = ld_func(2, ptr);                                       \
+    Tvec v3 = ld_func(4, ptr);                                       \
+    a = vec_permi(v1, v2, 1);                                        \
+    b = vec_permi(v1, v3, 2);                                        \
+    c = vec_permi(v2, v3, 1);                                        \
+}
+VSX_IMPL_ST_INTERLEAVE_3CH_2(int64, vec_dword2, vsx_ld2, vsx_st2)
+VSX_IMPL_ST_INTERLEAVE_3CH_2(uint64, vec_udword2, vsx_ld2, vsx_st2)
+VSX_IMPL_ST_INTERLEAVE_3CH_2(double, vec_double2, vsx_ld, vsx_st)
+
+#endif // CV_VSX
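+
+/*
+ * Availability sketch: these helpers only exist when the translation unit is
+ * built with VSX enabled, so guard their use both at compile time and at
+ * runtime, the same pattern the test module uses:
+ *
+ *   #if CV_VSX
+ *   if (cv::checkHardwareSupport(CV_CPU_VSX))
+ *   {
+ *       // the vsx_* and vec_* helpers are safe to use here
+ *   }
+ *   #endif
+ */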
+
+//! @}
+
+#endif // OPENCV_HAL_VSX_UTILS_HPP
diff --git a/modules/core/src/precomp.hpp b/modules/core/src/precomp.hpp
index bb4ae6cf18..9fa87f1799 100644
--- a/modules/core/src/precomp.hpp
+++ b/modules/core/src/precomp.hpp
@@ -85,7 +85,7 @@
 #include "opencv2/core/hal/intrin.hpp"
 #include "opencv2/core/sse_utils.hpp"
 #include "opencv2/core/neon_utils.hpp"
-
+#include "opencv2/core/vsx_utils.hpp"
 #include "arithm_core.hpp"
 #include "hal_replacement.hpp"
diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp
index 0a4110ec12..ca811a0277 100644
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -80,6 +80,18 @@ Mutex* __initialization_mutex_initializer = &getInitializationMutex();
 # include
 #endif
 
+#ifndef __VSX__
+# if defined __PPC64__ && defined __linux__
+# include "sys/auxv.h"
+# ifndef AT_HWCAP2
+#   define AT_HWCAP2 26
+# endif
+# ifndef PPC_FEATURE2_ARCH_2_07
+#   define PPC_FEATURE2_ARCH_2_07 0x80000000
+# endif
+# endif
+#endif
+
 #if defined _WIN32 || defined WINCE
 #ifndef _WIN32_WINNT // This is needed for the declaration of TryEnterCriticalSection in winbase.h with Visual Studio 2005 (and older?)
 #define _WIN32_WINNT 0x0400 // http://msdn.microsoft.com/en-us/library/ms686857(VS.85).aspx
@@ -295,6 +307,8 @@ struct HWFeatures
         g_hwFeatureNames[CPU_AVX_512VL] = "AVX512VL";
 
         g_hwFeatureNames[CPU_NEON] = "NEON";
+
+        g_hwFeatureNames[CPU_VSX] = "VSX";
     }
 
     void initialize(void)
@@ -504,6 +518,16 @@ struct HWFeatures
     #endif
     #endif
 
+    #ifdef __VSX__
+        have[CV_CPU_VSX] = true;
+    #elif (defined __PPC64__ && defined __linux__)
+        uint64 hwcaps = getauxval(AT_HWCAP);
+        uint64 hwcap2 = getauxval(AT_HWCAP2);
+        have[CV_CPU_VSX] = (hwcaps & PPC_FEATURE_PPC_LE && hwcaps & PPC_FEATURE_HAS_VSX && hwcap2 & PPC_FEATURE2_ARCH_2_07);
+    #else
+        have[CV_CPU_VSX] = false;
+    #endif
+
         int baseline_features[] = { CV_CPU_BASELINE_FEATURES };
         if (!checkFeatures(baseline_features, sizeof(baseline_features) / sizeof(baseline_features[0])))
         {
diff --git a/modules/ts/src/ts_func.cpp b/modules/ts/src/ts_func.cpp
index be9bcf5549..0ba6235e35 100644
--- a/modules/ts/src/ts_func.cpp
+++ b/modules/ts/src/ts_func.cpp
@@ -3091,6 +3091,9 @@ void printVersionInfo(bool useStdOut)
 #if CV_FP16
     if (checkHardwareSupport(CV_CPU_FP16)) cpu_features += " fp16";
 #endif
+#if CV_VSX
+    if (checkHardwareSupport(CV_CPU_VSX)) cpu_features += " VSX";
+#endif
 
     cpu_features.erase(0, 1); // erase initial space