Added support for VSX

8 years ago · d077778074
parent 2a2537725a
commit d077778074
13 changed files with 1042 additions and 3 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -298,6 +298,7 @@ OCV_OPTION(ENABLE_PROFILING           "Enable profiling in the GCC compiler (Add
 OCV_OPTION(ENABLE_COVERAGE            "Enable coverage collection with  GCov"                    OFF  IF CMAKE_COMPILER_IS_GNUCXX )
 OCV_OPTION(ENABLE_OMIT_FRAME_POINTER  "Enable -fomit-frame-pointer for GCC"                      ON   IF CMAKE_COMPILER_IS_GNUCXX AND NOT (APPLE AND CMAKE_COMPILER_IS_CLANGCXX) )
 OCV_OPTION(ENABLE_POWERPC             "Enable PowerPC for GCC"                                   ON   IF (CMAKE_COMPILER_IS_GNUCXX AND CMAKE_SYSTEM_PROCESSOR MATCHES powerpc.*) )
+OCV_OPTION(ENABLE_VSX                 "Enable POWER8 and above VSX (64-bit little-endian)"       ON   IF (CMAKE_COMPILER_IS_GNUCXX AND PPC64LE) )
 OCV_OPTION(ENABLE_FAST_MATH           "Enable -ffast-math (not recommended for GCC 4.6.x)"       OFF  IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_NEON                "Enable NEON instructions"                                 (NEON OR ANDROID_ARM_NEON OR AARCH64) IF CMAKE_COMPILER_IS_GNUCXX AND (ARM OR AARCH64 OR IOS) )
 OCV_OPTION(ENABLE_VFPV3               "Enable VFPv3-D32 instructions"                            OFF  IF CMAKE_COMPILER_IS_GNUCXX AND (ARM OR AARCH64 OR IOS) )
--- a/cmake/OpenCVCompilerOptimizations.cmake
+++ b/cmake/OpenCVCompilerOptimizations.cmake
@ -28,6 +28,7 @@

 set(CPU_ALL_OPTIMIZATIONS "SSE;SSE2;SSE3;SSSE3;SSE4_1;SSE4_2;POPCNT;AVX;FP16;AVX2;FMA3") # without AVX512
 list(APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16)
+list(APPEND CPU_ALL_OPTIMIZATIONS VSX)
 list(REMOVE_DUPLICATES CPU_ALL_OPTIMIZATIONS)

 ocv_update(CPU_VFPV3_FEATURE_ALIAS "")
@ -79,6 +80,7 @@ ocv_optimization_process_obsolete_option(ENABLE_FMA3 FMA3 ON)
 ocv_optimization_process_obsolete_option(ENABLE_VFPV3 VFPV3 OFF)
 ocv_optimization_process_obsolete_option(ENABLE_NEON NEON OFF)

+ocv_optimization_process_obsolete_option(ENABLE_VSX VSX OFF)

 macro(ocv_is_optimization_in_list resultvar check_opt)
  set(__checked "")
@ -266,6 +268,15 @@ elseif(ARM OR AARCH64)
    ocv_update(CPU_FP16_IMPLIES "NEON")
    set(CPU_BASELINE "NEON;FP16" CACHE STRING "${HELP_CPU_BASELINE}")
  endif()
+elseif(PPC64LE)
+  ocv_update(CPU_KNOWN_OPTIMIZATIONS "VSX")
+  ocv_update(CPU_VSX_TEST_FILE "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_vsx.cpp")
+
+  if(CMAKE_COMPILER_IS_CLANGCXX AND (NOT ${CMAKE_CXX_COMPILER} MATCHES "xlc"))
+    ocv_update(CPU_VSX_FLAGS_ON "-mvsx -maltivec")
+  else()
+    ocv_update(CPU_VSX_FLAGS_ON "-mcpu=power8")
+  endif()
 endif()

 # Helper values for cmake-gui
--- a/cmake/OpenCVDetectCXXCompiler.cmake
+++ b/cmake/OpenCVDetectCXXCompiler.cmake
@ -72,6 +72,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)")
  set(ARM 1)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)")
  set(AARCH64 1)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^ppc64le.*|PPC64LE.*")
+  set(PPC64LE 1)
 endif()

 # Workaround for 32-bit operating systems on 64-bit x86_64 processor
--- a/cmake/OpenCVPackaging.cmake
+++ b/cmake/OpenCVPackaging.cmake
@ -31,6 +31,9 @@ elseif(ARM)
 elseif(AARCH64)
  set(CPACK_DEBIAN_PACKAGE_ARCHITECTURE "arm64")
  set(CPACK_RPM_PACKAGE_ARCHITECTURE "aarch64")
+elseif(PPC64LE)
+  set(CPACK_DEBIAN_PACKAGE_ARCHITECTURE "ppc64el")
+  set(CPACK_RPM_PACKAGE_ARCHITECTURE "ppc64le")
 else()
  set(CPACK_DEBIAN_PACKAGE_ARCHITECTURE ${CMAKE_SYSTEM_PROCESSOR})
  set(CPACK_RPM_PACKAGE_ARCHITECTURE ${CMAKE_SYSTEM_PROCESSOR})
@ -164,4 +167,4 @@ endif(NOT OPENCV_CUSTOM_PACKAGE_INFO)

 include(CPack)

-ENDif(EXISTS "${CMAKE_ROOT}/Modules/CPack.cmake")
+ENDif(EXISTS "${CMAKE_ROOT}/Modules/CPack.cmake")
--- a/cmake/checks/cpu_vsx.cpp
+++ b/cmake/checks/cpu_vsx.cpp
@ -0,0 +1,12 @@
+# if defined(__VSX__)
+#   include <altivec.h>
+# else
+#   error "VSX is not supported"
+# endif
+
+int main()
+{
+    __vector float testF = vec_splats(0.f);
+    testF = vec_madd(testF, testF, testF);
+    return 0;
+}
--- a/modules/core/include/opencv2/core/base.hpp
+++ b/modules/core/include/opencv2/core/base.hpp
@ -740,5 +740,6 @@ CV_EXPORTS_W void setUseIPP_NE(bool flag);
 } // cv

 #include "opencv2/core/neon_utils.hpp"
+#include "opencv2/core/vsx_utils.hpp"

 #endif //OPENCV_CORE_BASE_HPP
--- a/modules/core/include/opencv2/core/cv_cpu_dispatch.h
+++ b/modules/core/include/opencv2/core/cv_cpu_dispatch.h
@ -99,6 +99,14 @@
 #  include <arm_neon.h>
 #endif

+#if defined(__VSX__) && defined(__PPC64__) && defined(__LITTLE_ENDIAN__)
+#  include <altivec.h>
+#  undef vector
+#  undef pixel
+#  undef bool
+#  define CV_VSX 1
+#endif
+
 #endif // CV_ENABLE_INTRINSICS && !CV_DISABLE_OPTIMIZATION && !__CUDACC__

 #if defined CV_CPU_COMPILE_AVX && !defined CV_CPU_BASELINE_COMPILE_AVX
@ -135,6 +143,12 @@ struct VZeroUpperGuard {
 #elif defined(__ARM_NEON__) || (defined (__ARM_NEON) && defined(__aarch64__))
 #  include <arm_neon.h>
 #  define CV_NEON 1
+#elif defined(__VSX__) && defined(__PPC64__) && defined(__LITTLE_ENDIAN__)
+#  include <altivec.h>
+#  undef vector
+#  undef pixel
+#  undef bool
+#  define CV_VSX 1
 #endif

 #endif // !__OPENCV_BUILD && !__CUDACC (Compatibility code)
@ -208,3 +222,7 @@ struct VZeroUpperGuard {
 #ifndef CV_NEON
 #  define CV_NEON 0
 #endif
+
+#ifndef CV_VSX
+#  define CV_VSX 0
+#endif
--- a/modules/core/include/opencv2/core/cv_cpu_helper.h
+++ b/modules/core/include/opencv2/core/cv_cpu_helper.h
@ -180,5 +180,20 @@
 #endif
 #define __CV_CPU_DISPATCH_CHAIN_NEON(fn, args, mode, ...)  CV_CPU_CALL_NEON(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))

+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_VSX
+#  define CV_TRY_VSX 1
+#  define CV_CPU_HAS_SUPPORT_VSX 1
+#  define CV_CPU_CALL_VSX(fn, args) return (opt_VSX::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_VSX
+#  define CV_TRY_VSX 1
+#  define CV_CPU_HAS_SUPPORT_VSX (cv::checkHardwareSupport(CV_CPU_VSX))
+#  define CV_CPU_CALL_VSX(fn, args) if (CV_CPU_HAS_SUPPORT_VSX) return (opt_VSX::fn args)
+#else
+#  define CV_TRY_VSX 0
+#  define CV_CPU_HAS_SUPPORT_VSX 0
+#  define CV_CPU_CALL_VSX(fn, args)
+#endif
+#define __CV_CPU_DISPATCH_CHAIN_VSX(fn, args, mode, ...)  CV_CPU_CALL_VSX(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
 #define CV_CPU_CALL_BASELINE(fn, args) return (cpu_baseline::fn args)
 #define __CV_CPU_DISPATCH_CHAIN_BASELINE(fn, args, mode, ...)  CV_CPU_CALL_BASELINE(fn, args) /* last in sequence */
--- a/modules/core/include/opencv2/core/cvdef.h
+++ b/modules/core/include/opencv2/core/cvdef.h
@ -153,6 +153,8 @@ namespace cv { namespace debug_build_guard { } using namespace debug_build_guard

 #define CV_CPU_NEON   100

+#define CV_CPU_VSX 200
+
 // when adding to this list remember to update the following enum
 #define CV_HARDWARE_MAX_FEATURE 255

@ -182,7 +184,9 @@ enum CpuFeatures {
    CPU_AVX_512VBMI     = 20,
    CPU_AVX_512VL       = 21,

-    CPU_NEON            = 100
+    CPU_NEON            = 100,
+
+    CPU_VSX             = 200
 };


--- a/modules/core/include/opencv2/core/vsx_utils.hpp
+++ b/modules/core/include/opencv2/core/vsx_utils.hpp
@ -0,0 +1,945 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_HAL_VSX_UTILS_HPP
+#define OPENCV_HAL_VSX_UTILS_HPP
+
+#include "opencv2/core/cvdef.h"
+
+//! @addtogroup core_utils_vsx
+//! @{
+#if CV_VSX
+
+#define FORCE_INLINE(tp) extern inline tp __attribute__((always_inline))
+
+#define VSX_REDIRECT_1RG(rt, rg, fnm, fn2)   \
+FORCE_INLINE(rt) fnm(const rg& a) { return fn2(a); }
+
+#define VSX_REDIRECT_2RG(rt, rg, fnm, fn2)   \
+FORCE_INLINE(rt) fnm(const rg& a, const rg& b) { return fn2(a, b); }
+
+#define VSX_IMPL_PERM(rt, fnm, ...)            \
+FORCE_INLINE(rt) fnm(const rt& a, const rt& b) \
+    { static const vec_uchar16 perm = {__VA_ARGS__}; return vec_perm(a, b, perm); }
+
+#define __VSX_S16__(c, v) (c){v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v}
+#define __VSX_S8__(c, v)  (c){v, v, v, v, v, v, v, v}
+#define __VSX_S4__(c, v)  (c){v, v, v, v}
+#define __VSX_S2__(c, v)  (c){v, v}
+
+typedef __vector unsigned char vec_uchar16;
+#define vec_uchar16_set(...) (vec_uchar16){__VA_ARGS__}
+#define vec_uchar16_sp(c)    (__VSX_S16__(vec_uchar16, c))
+#define vec_uchar16_c(v)     ((vec_uchar16)(v))
+#define vec_uchar16_mx       vec_uchar16_sp(0xFF)
+#define vec_uchar16_mn       vec_uchar16_sp(0)
+#define vec_uchar16_z        vec_uchar16_mn
+
+typedef __vector signed char vec_char16;
+#define vec_char16_set(...) (vec_char16){__VA_ARGS__}
+#define vec_char16_sp(c)    (__VSX_S16__(vec_char16, c))
+#define vec_char16_c(v)     ((vec_char16)(v))
+#define vec_char16_mx       vec_char16_sp(0x7F)
+#define vec_char16_mn       vec_char16_sp(-0x7F-1)
+#define vec_char16_z        vec_char16_sp(0)
+
+typedef __vector unsigned short vec_ushort8;
+#define vec_ushort8_set(...) (vec_ushort8){__VA_ARGS__}
+#define vec_ushort8_sp(c)    (__VSX_S8__(vec_ushort8, c))
+#define vec_ushort8_c(v)     ((vec_ushort8)(v))
+#define vec_ushort8_mx       vec_ushort8_sp(0xFFFF)
+#define vec_ushort8_mn       vec_ushort8_sp(0)
+#define vec_ushort8_z        vec_ushort8_mn
+
+typedef __vector signed short vec_short8;
+#define vec_short8_set(...) (vec_short8){__VA_ARGS__}
+#define vec_short8_sp(c)    (__VSX_S8__(vec_short8, c))
+#define vec_short8_c(v)     ((vec_short8)(v))
+#define vec_short8_mx       vec_short8_sp(0x7FFF)
+#define vec_short8_mn       vec_short8_sp(-0x7FFF-1)
+#define vec_short8_z        vec_short8_sp(0)
+
+typedef __vector unsigned int vec_uint4;
+#define vec_uint4_set(...) (vec_uint4){__VA_ARGS__}
+#define vec_uint4_sp(c)    (__VSX_S4__(vec_uint4, c))
+#define vec_uint4_c(v)     ((vec_uint4)(v))
+#define vec_uint4_mx       vec_uint4_sp(0xFFFFFFFFU)
+#define vec_uint4_mn       vec_uint4_sp(0)
+#define vec_uint4_z        vec_uint4_mn
+
+typedef __vector signed int vec_int4;
+#define vec_int4_set(...)  (vec_int4){__VA_ARGS__}
+#define vec_int4_sp(c)     (__VSX_S4__(vec_int4, c))
+#define vec_int4_c(v)      ((vec_int4)(v))
+#define vec_int4_mx        vec_int4_sp(0x7FFFFFFF)
+#define vec_int4_mn        vec_int4_sp(-0x7FFFFFFF-1)
+#define vec_int4_z         vec_int4_sp(0)
+
+typedef __vector float vec_float4;
+#define vec_float4_set(...)  (vec_float4){__VA_ARGS__}
+#define vec_float4_sp(c)     (__VSX_S4__(vec_float4, c))
+#define vec_float4_c(v)      ((vec_float4)(v))
+#define vec_float4_mx        vec_float4_sp(3.40282347E+38F)
+#define vec_float4_mn        vec_float4_sp(1.17549435E-38F)
+#define vec_float4_z         vec_float4_sp(0)
+
+typedef __vector unsigned long long vec_udword2;
+#define vec_udword2_set(...) (vec_udword2){__VA_ARGS__}
+#define vec_udword2_sp(c)    (__VSX_S2__(vec_udword2, c))
+#define vec_udword2_c(v)     ((vec_udword2)(v))
+#define vec_udword2_mx       vec_udword2_sp(18446744073709551615ULL)
+#define vec_udword2_mn       vec_udword2_sp(0)
+#define vec_udword2_z        vec_udword2_mn
+
+typedef __vector signed long long vec_dword2;
+#define vec_dword2_set(...) (vec_dword2){__VA_ARGS__}
+#define vec_dword2_sp(c)    (__VSX_S2__(vec_dword2, c))
+#define vec_dword2_c(v)     ((vec_dword2)(v))
+#define vec_dword2_mx       vec_dword2_sp(9223372036854775807LL)
+#define vec_dword2_mn       vec_dword2_sp(-9223372036854775807LL-1)
+#define vec_dword2_z        vec_dword2_sp(0)
+
+typedef  __vector double vec_double2;
+#define vec_double2_set(...) (vec_double2){__VA_ARGS__}
+#define vec_double2_c(v)     ((vec_double2)(v))
+#define vec_double2_sp(c)    (__VSX_S2__(vec_double2, c))
+#define vec_double2_mx       vec_double2_sp(1.7976931348623157E+308)
+#define vec_double2_mn       vec_double2_sp(2.2250738585072014E-308)
+#define vec_double2_z        vec_double2_sp(0)
+
+#define vec_bchar16	          __vector __bool char
+#define vec_bchar16_set(...) (vec_bchar16){__VA_ARGS__}
+#define vec_bchar16_c(v)     ((vec_bchar16)(v))
+#define vec_bchar16_f        (__VSX_S16__(vec_bchar16, 0))
+#define vec_bchar16_t        (__VSX_S16__(vec_bchar16, 1))
+
+#define vec_bshort8	          __vector __bool short
+#define vec_bshort8_set(...) (vec_bshort8){__VA_ARGS__}
+#define vec_bshort8_c(v)     ((vec_bshort8)(v))
+#define vec_bshort8_f        (__VSX_S8__(vec_bshort8, 0))
+#define vec_bshort8_t        (__VSX_S8__(vec_bshort8, 1))
+
+#define vec_bint4             __vector __bool int
+#define vec_bint4_set(...)   (vec_bint4){__VA_ARGS__}
+#define vec_bint4_c(v)       ((vec_bint4)(v))
+#define vec_bint4_f          (__VSX_S4__(vec_bint4, 0))
+#define vec_bint4_t          (__VSX_S4__(vec_bint4, 1))
+
+#define vec_bdword2	           __vector __bool long long
+#define vec_bdword2_set(...)  (vec_bdword2){__VA_ARGS__}
+#define vec_bdword2_c(v)      ((vec_bdword2)(v))
+#define vec_bdword2_f         (__VSX_S2__(vec_bdword2, 0))
+#define vec_bdword2_t         (__VSX_S2__(vec_bdword2, 1))
+
+/*
+ * GCC VSX compatibility
+**/
+#if defined(__GNUG__) && !defined(__IBMCPP__) && !defined(__clang__)
+
+// inline asm helper
+#define VSX_IMPL_1RG(rt, rto, rg, rgo, opc, fnm) \
+FORCE_INLINE(rt) fnm(const rg& a)                \
+    { rt rs; __asm__ __volatile__(#opc" %x0,%x1" : "="#rto (rs) : #rgo (a)); return rs; }
+
+#define VSX_IMPL_1VRG(rt, rg, opc, fnm) \
+FORCE_INLINE(rt) fnm(const rg& a)       \
+    { rt rs; __asm__ __volatile__(#opc" %0,%1" : "=v" (rs) : "v" (a)); return rs; }
+
+#define VSX_IMPL_2VRG_F(rt, rg, fopc, fnm)     \
+FORCE_INLINE(rt) fnm(const rg& a, const rg& b) \
+    { rt rs; __asm__ __volatile__(fopc : "=v" (rs) : "v" (a), "v" (b)); return rs; }
+
+#define VSX_IMPL_2VRG(rt, rg, opc, fnm) VSX_IMPL_2VRG_F(rt, rg, #opc" %0,%1,%2", fnm)
+
+#if __GNUG__ < 7
+/* up to GCC 6 vec_mul only supports precisions and llong */
+#   ifdef vec_mul
+#       undef vec_mul
+#   endif
+/*
+ * there's no a direct instruction for supporting 16-bit multiplication in ISA 2.07,
+ * XLC Implement it by using instruction "multiply even", "multiply oden" and "permute"
+ * todo: Do I need to support 8-bit ?
+**/
+#   define VSX_IMPL_MULH(Tvec, Tcast)                                               \
+    FORCE_INLINE(Tvec) vec_mul(const Tvec& a, const Tvec& b)                        \
+    {                                                                               \
+        static const vec_uchar16 even_perm = {0, 1, 16, 17, 4, 5, 20, 21,           \
+                                              8, 9, 24, 25, 12, 13, 28, 29};        \
+        return vec_perm(Tcast(vec_mule(a, b)), Tcast(vec_mulo(a, b)), even_perm);   \
+    }
+    VSX_IMPL_MULH(vec_short8, vec_short8_c)
+    VSX_IMPL_MULH(vec_ushort8, vec_ushort8_c)
+    /* vmuluwm can be used for unsigned or signed integers, that's what they said */
+    VSX_IMPL_2VRG(vec_int4, vec_int4, vmuluwm, vec_mul)
+    VSX_IMPL_2VRG(vec_uint4, vec_uint4, vmuluwm, vec_mul)
+    /* redirect to GCC builtin vec_mul, since it already supports precisions and llong */
+    VSX_REDIRECT_2RG(vec_float4, vec_float4, vec_mul, __builtin_vec_mul)
+    VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mul, __builtin_vec_mul)
+    VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mul, __builtin_vec_mul)
+    VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mul, __builtin_vec_mul)
+#endif // __GNUG__ < 7
+
+#if __GNUG__ < 6
+/*
+ * Instruction "compare greater than or equal" in ISA 2.07 only supports single
+ * and double precision.
+ * In XLC and new versions of GCC implement integers by using instruction "greater than" and NOR.
+**/
+#   ifdef vec_cmpge
+#       undef vec_cmpge
+#   endif
+#   ifdef vec_cmple
+#       undef vec_cmple
+#   endif
+#   define vec_cmple(a, b) vec_cmpge(b, a)
+#   define VSX_IMPL_CMPGE(rt, rg, opc, fnm) \
+    VSX_IMPL_2VRG_F(rt, rg, #opc" %0,%2,%1\n\t xxlnor %x0,%x0,%x0", fnm)
+
+    VSX_IMPL_CMPGE(vec_bchar16, vec_char16, vcmpgtsb, vec_cmpge)
+    VSX_IMPL_CMPGE(vec_bchar16, vec_uchar16, vcmpgtub, vec_cmpge)
+    VSX_IMPL_CMPGE(vec_bshort8, vec_short8, vcmpgtsh, vec_cmpge)
+    VSX_IMPL_CMPGE(vec_bshort8, vec_ushort8, vcmpgtuh, vec_cmpge)
+    VSX_IMPL_CMPGE(vec_bint4, vec_int4, vcmpgtsw, vec_cmpge)
+    VSX_IMPL_CMPGE(vec_bint4, vec_uint4, vcmpgtuw, vec_cmpge)
+    VSX_IMPL_CMPGE(vec_bdword2, vec_dword2, vcmpgtsd, vec_cmpge)
+    VSX_IMPL_CMPGE(vec_bdword2, vec_udword2, vcmpgtud, vec_cmpge)
+    /* redirect to GCC builtin cmpge, since it already supports precisions */
+    VSX_REDIRECT_2RG(vec_bint4, vec_float4, vec_cmpge, __builtin_vec_cmpge)
+    VSX_REDIRECT_2RG(vec_bdword2, vec_double2, vec_cmpge, __builtin_vec_cmpge)
+
+// up to gcc5 vec_nor doesn't support bool long long
+#   undef vec_nor
+template<typename T>
+VSX_REDIRECT_2RG(T, T, vec_nor, __builtin_vec_nor)
+
+FORCE_INLINE(vec_bdword2) vec_nor(const vec_bdword2& a, const vec_bdword2& b)
+{ return vec_bdword2_c(__builtin_vec_nor(vec_dword2_c(a), vec_dword2_c(b))); }
+#endif // __GNUG__ < 6
+
+// vector population count
+#ifndef vec_popcnt
+    VSX_IMPL_1VRG(vec_uchar16, vec_uchar16, vpopcntb, vec_popcnt)
+    VSX_IMPL_1VRG(vec_uchar16, vec_char16, vpopcntb, vec_popcnt)
+    VSX_IMPL_1VRG(vec_ushort8, vec_ushort8, vpopcnth, vec_popcnt)
+    VSX_IMPL_1VRG(vec_ushort8, vec_short8, vpopcnth, vec_popcnt)
+    VSX_IMPL_1VRG(vec_uint4, vec_uint4, vpopcntw, vec_popcnt)
+    VSX_IMPL_1VRG(vec_uint4, vec_int4, vpopcntw, vec_popcnt)
+    VSX_IMPL_1VRG(vec_udword2, vec_udword2, vpopcntd, vec_popcnt)
+    VSX_IMPL_1VRG(vec_udword2, vec_dword2, vpopcntd, vec_popcnt)
+#endif // vec_popcnt
+
+#if __GNUG__ < 5
+// vec_xxpermdi in gcc4 missing little-endian supports just like clang
+#   define vec_permi(a, b, c) vec_xxpermdi(b, a, (3 ^ ((c & 1) << 1 | c >> 1)))
+// vec_packs doesn't support double words in gcc4
+# undef vec_packs
+VSX_REDIRECT_2RG(vec_char16, vec_short8, vec_packs, __builtin_vec_packs)
+VSX_REDIRECT_2RG(vec_uchar16, vec_ushort8, vec_packs, __builtin_vec_packs)
+VSX_REDIRECT_2RG(vec_short8, vec_int4, vec_packs, __builtin_vec_packs)
+VSX_REDIRECT_2RG(vec_ushort8, vec_uint4, vec_packs, __builtin_vec_packs)
+VSX_IMPL_2VRG_F(vec_int4, vec_dword2, "vpksdss %0,%2,%1", vec_packs)
+VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs)
+#else
+#   define vec_permi vec_xxpermdi
+#endif
+
+// converts between single and double-precision
+#ifndef vec_cvf
+    VSX_REDIRECT_1RG(vec_float4, vec_double2, vec_cvf, __builtin_vsx_xvcvdpsp)
+    FORCE_INLINE(vec_double2) vec_cvf(const vec_float4& a)
+    { return __builtin_vsx_xvcvspdp(vec_sld(a, a, 4)); }
+#endif
+
+// converts 32 and 64 bit integers to double-precision
+#ifndef vec_ctd
+#   define vec_ctd(a, b) __vec_ctd(a)
+    VSX_IMPL_1RG(vec_double2, wd, vec_int4, wa, xvcvsxwdp, __vec_ctd)
+    VSX_IMPL_1RG(vec_double2, wd, vec_uint4, wa, xvcvuxwdp, __vec_ctd)
+    VSX_IMPL_1RG(vec_double2, wd, vec_dword2, wi, xvcvsxddp, __vec_ctd)
+    VSX_IMPL_1RG(vec_double2, wd, vec_udword2, wi, xvcvuxddp, __vec_ctd)
+#endif
+
+// shift left double by word immediate
+#ifndef vec_sldw
+#   define vec_sldw __builtin_vsx_xxsldwi
+#endif
+
+// just in case if GCC doesn't define it
+#ifndef vec_xl
+#   define vec_xl vec_vsx_ld
+#   define vec_xst vec_vsx_st
+#endif
+
+#endif // GCC VSX compatibility
+
+/*
+ * CLANG VSX compatibility
+**/
+#if defined(__clang__) && !defined(__IBMCPP__)
+
+/*
+ * CLANG doesn't support %x<n> in the inline asm template which fixes register number
+ * when using any of the register constraints wa, wd, wf
+ *
+ * For more explanation checkout PowerPC and IBM RS6000 in https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html
+ * Also there's already an open bug https://bugs.llvm.org/show_bug.cgi?id=31837
+ *
+ * So we're not able to use inline asm and only use built-in functions that CLANG supports
+*/
+
+#if __clang_major__ < 5
+// implement vec_permi in a dirty way
+#   define VSX_IMPL_CLANG_4_PERMI(Tvec)                                                 \
+    FORCE_INLINE(Tvec) vec_permi(const Tvec& a, const Tvec& b, unsigned const char c)   \
+    {                                                                                   \
+        switch (c)                                                                      \
+        {                                                                               \
+        case 0:                                                                         \
+            return vec_mergeh(a, b);                                                    \
+        case 1:                                                                         \
+            return vec_mergel(vec_mergeh(a, a), b);                                     \
+        case 2:                                                                         \
+            return vec_mergeh(vec_mergel(a, a), b);                                     \
+        default:                                                                        \
+            return vec_mergel(a, b);                                                    \
+        }                                                                               \
+    }
+    VSX_IMPL_CLANG_4_PERMI(vec_udword2)
+    VSX_IMPL_CLANG_4_PERMI(vec_dword2)
+    VSX_IMPL_CLANG_4_PERMI(vec_double2)
+
+// vec_xxsldwi is missing in clang 4
+#   define vec_xxsldwi(a, b, c) vec_sld(a, b, (c) * 4)
+#else
+// vec_xxpermdi is missing little-endian supports in clang 4 just like gcc4
+#   define vec_permi(a, b, c) vec_xxpermdi(b, a, (3 ^ ((c & 1) << 1 | c >> 1)))
+#endif // __clang_major__ < 5
+
+// shift left double by word immediate
+#ifndef vec_sldw
+#   define vec_sldw vec_xxsldwi
+#endif
+
+/* converts between single and double precision */
+#ifndef vec_cvf
+    VSX_REDIRECT_1RG(vec_float4, vec_double2, vec_cvf, __builtin_vsx_xvcvdpsp)
+    FORCE_INLINE(vec_double2) vec_cvf(const vec_float4& a)
+    { return __builtin_vsx_xvcvspdp(vec_sld(a, a, 4)); }
+#endif
+
+/* converts 32 and 64 bit integers to double-precision */
+#ifndef vec_ctd
+#   define vec_ctd(a, b) __vec_ctd(a)
+    VSX_REDIRECT_1RG(vec_double2, vec_int4, __vec_ctd, __builtin_vsx_xvcvsxwdp)
+    VSX_REDIRECT_1RG(vec_double2, vec_uint4, __vec_ctd, __builtin_vsx_xvcvuxwdp)
+    // implement vec_ctd for double word in a dirty way since we are missing builtin xvcvsxddp, xvcvuxddp
+    // please try to avoid using it for double words
+    FORCE_INLINE(vec_double2) __vec_ctd(const vec_dword2& a)
+    { return vec_double2_set((double)vec_extract(a, 0), (double)vec_extract(a, 1)); }
+    FORCE_INLINE(vec_double2) __vec_ctd(const vec_udword2& a)
+    { return vec_double2_set((double)vec_extract(a, 0), (double)vec_extract(a, 1)); }
+#endif
+
+// Implement vec_rsqrt since clang only supports vec_rsqrte
+#ifndef vec_rsqrt
+    FORCE_INLINE(vec_float4) vec_rsqrt(const vec_float4& a)
+    { return vec_div(vec_float4_sp(1), vec_sqrt(a)); }
+
+    FORCE_INLINE(vec_double2) vec_rsqrt(const vec_double2& a)
+    { return vec_div(vec_double2_sp(1), vec_sqrt(a)); }
+#endif
+
+
+/*
+ * __builtin_altivec_vctsxs in clang 5 and 6 causes ambiguous which used by vec_cts
+ * so we just redefine it and cast it
+*/
+#if __clang_major__ > 4
+#   undef vec_cts
+#   define vec_cts(__a, __b)                                                            \
+        _Generic((__a), vector float                                                    \
+           : (vector signed int)__builtin_altivec_vctsxs((__a), (__b)), vector double   \
+           : __extension__({                                                            \
+             vector double __ret =                                                      \
+                 (__a) *                                                                \
+                 (vector double)(vector unsigned long long)((0x3ffULL + (__b))          \
+                                                            << 52);                     \
+             __builtin_convertvector(__ret, vector signed long long);                   \
+           }))
+#endif // __clang_major__ > 4
+
+#endif // CLANG VSX compatibility
+
+/*
+ * implement vsx_ld(offset, pointer), vsx_st(vector, offset, pointer)
+ * load and set using offset depend on the pointer type
+ *
+ * implement vsx_ldf(offset, pointer), vsx_stf(vector, offset, pointer)
+ * load and set using offset depend on fixed bytes size
+ *
+ * Note: In clang vec_xl and vec_xst fails to load unaligned addresses
+ * so we are using vec_vsx_ld, vec_vsx_st instead
+*/
+
+#if defined(__clang__) && !defined(__IBMCPP__)
+#   define vsx_ldf  vec_vsx_ld
+#   define vsx_stf  vec_vsx_st
+#else // GCC , XLC
+#   define vsx_ldf  vec_xl
+#   define vsx_stf  vec_xst
+#endif
+
+#define VSX_OFFSET(o, p) ((o) * sizeof(*(p)))
+#define vsx_ld(o, p) vsx_ldf(VSX_OFFSET(o, p), p)
+#define vsx_st(v, o, p) vsx_stf(v, VSX_OFFSET(o, p), p)
+
+/*
+ * implement vsx_ld2(offset, pointer), vsx_st2(vector, offset, pointer) to load and store double words
+ * In GCC vec_xl and vec_xst it maps to vec_vsx_ld, vec_vsx_st which doesn't support long long
+ * and in CLANG we are using vec_vsx_ld, vec_vsx_st because vec_xl, vec_xst fails to load unaligned addresses
+ *
+ * In XLC vec_xl and vec_xst fail to cast int64(long int) to long long
+*/
+#if (defined(__GNUG__) || defined(__clang__)) && !defined(__IBMCPP__)
+    FORCE_INLINE(vec_udword2) vsx_ld2(long o, const uint64* p)
+    { return vec_udword2_c(vsx_ldf(VSX_OFFSET(o, p), (unsigned int*)p)); }
+
+    FORCE_INLINE(vec_dword2) vsx_ld2(long o, const int64* p)
+    { return vec_dword2_c(vsx_ldf(VSX_OFFSET(o, p), (int*)p)); }
+
+    FORCE_INLINE(void) vsx_st2(const vec_udword2& vec, long o, uint64* p)
+    { vsx_stf(vec_uint4_c(vec), VSX_OFFSET(o, p), (unsigned int*)p); }
+
+    FORCE_INLINE(void) vsx_st2(const vec_dword2& vec, long o, int64* p)
+    { vsx_stf(vec_int4_c(vec), VSX_OFFSET(o, p), (int*)p); }
+#else // XLC
+    FORCE_INLINE(vec_udword2) vsx_ld2(long o, const uint64* p)
+    { return vsx_ldf(VSX_OFFSET(o, p), (unsigned long long*)p); }
+
+    FORCE_INLINE(vec_dword2) vsx_ld2(long o, const int64* p)
+    { return vsx_ldf(VSX_OFFSET(o, p), (long long*)p); }
+
+    FORCE_INLINE(void) vsx_st2(const vec_udword2& vec, long o, uint64* p)
+    { vsx_stf(vec, VSX_OFFSET(o, p), (unsigned long long*)p); }
+
+    FORCE_INLINE(void) vsx_st2(const vec_dword2& vec, long o, int64* p)
+    { vsx_stf(vec, VSX_OFFSET(o, p), (long long*)p); }
+#endif
+
+#if defined(__clang__) || defined(__IBMCPP__)
+    // gcc can find his way in casting log int and XLC, CLANG ambiguous
+    FORCE_INLINE(vec_udword2) vec_splats(uint64 v)
+    { return vec_splats((unsigned long long) v); }
+
+    FORCE_INLINE(vec_dword2) vec_splats(int64 v)
+    { return vec_splats((long long) v); }
+#endif
+
+// Implement store vector bool char for XLC
+#if defined(__IBMCPP__) && defined(__clang__)
+    FORCE_INLINE(void) vec_xst(const vec_bchar16 &vec, long o, uchar* p)
+    { vec_xst(vec_uchar16_c(vec), VSX_OFFSET(o, p), p); }
+#endif
+
+// Working around vec_popcnt compatibility
+/*
+ * vec_popcnt should return unsigned but clang has different thought just like gcc in vec_vpopcnt
+ *
+ * use vec_popcntu instead to deal with it
+*/
+#if defined(__clang__) && !defined(__IBMCPP__)
+#   define VSX_IMPL_CLANG_POPCNTU(Tvec, Tvec2, ucast)   \
+    FORCE_INLINE(Tvec) vec_popcntu(const Tvec2& a)      \
+    { return ucast(vec_popcnt(a)); }
+
+    VSX_IMPL_CLANG_POPCNTU(vec_uchar16, vec_char16, vec_uchar16_c);
+    VSX_IMPL_CLANG_POPCNTU(vec_ushort8, vec_short8, vec_ushort8_c);
+    VSX_IMPL_CLANG_POPCNTU(vec_uint4, vec_int4, vec_uint4_c);
+    // redirect unsigned types
+    VSX_REDIRECT_1RG(vec_uchar16, vec_uchar16, vec_popcntu, vec_popcnt)
+    VSX_REDIRECT_1RG(vec_ushort8, vec_ushort8, vec_popcntu, vec_popcnt)
+    VSX_REDIRECT_1RG(vec_uint4, vec_uint4, vec_popcntu, vec_popcnt)
+#else
+#   define vec_popcntu vec_popcnt
+#endif
+
+// Working around vec_cts compatibility
+/*
+ * vec_cts in gcc and clang converts single-precision to signed fixed-point word
+ * and from double-precision to signed doubleword, also there's no implement for vec_ctsl
+ *
+ * vec_cts in xlc converts single and double precision to signed fixed-point word
+ * and xlc has vec_ctsl which converts single and double precision to signed doubleword
+ *
+ * so to deal with this situation, use vec_cts only if you want to convert single-precision to signed fixed-point word
+ * and use vec_ctsl when you want to convert double-precision to signed doubleword
+ *
+ * Also we implemented vec_ctsw(a) to convert double-precision to signed fixed-point word
+*/
+
+// converts double-precision to signed doubleword for GCC and CLANG
+#if !defined(vec_ctsl) && !defined(__IBMCPP__) && (defined(__GNUG__) || defined(__clang__))
+// GCC4 has incorrect results in convert to signed doubleword
+#   if !defined(__clang__) && __GNUG__ < 5
+#       define vec_ctsl(a, b) __vec_ctsl(a)
+        VSX_IMPL_1RG(vec_dword2, wi, vec_double2, wd, xvcvdpsxds, __vec_ctsl)
+#   else // GCC > 4 , CLANG
+#       define vec_ctsl vec_cts
+#   endif
+#endif
+
+// converts double-precision to signed fixed-point word
+#if defined(__IBMCPP__)
+#   define vec_ctsw(a) vec_cts(a, 0)
+#else // GCC, CLANG
+#   define vec_ctsw(a) vec_int4_c(__builtin_vsx_xvcvdpsxws(a))
+#endif
+
+// load 4 unsigned bytes into uint4 vector
+#define vec_ld_buw(p) vec_uint4_set((p)[0], (p)[1], (p)[2], (p)[3])
+
+// load 4 signed bytes into int4 vector
+#define vec_ld_bsw(p) vec_int4_set((p)[0], (p)[1], (p)[2], (p)[3])
+
+// load 4 unsigned bytes into float vector
+#define vec_ld_bps(p) vec_ctf(vec_ld_buw(p), 0)
+
+// Store lower 8 byte
+#define vec_st_l8(v, p) *((uint64*)(p)) = vec_extract(vec_udword2_c(v), 0)
+
+// Store higher 8 byte
+#define vec_st_h8(v, p) *((uint64*)(p)) = vec_extract(vec_udword2_c(v), 1)
+
+/*
+ * vec_ld_l8(ptr) -> Load 64-bits of integer data to lower part
+ * vec_ldz_l8(ptr) -> Load 64-bits of integer data to lower part and zero upper part
+**/
+#if defined(__clang__) && !defined(__IBMCPP__)
+#   define __VSX_LOAD_L8(Tvec, p) (Tvec)((vec_udword2)*((uint64*)(p)))
+#else
+#   define __VSX_LOAD_L8(Tvec, p) *((Tvec*)(p))
+#endif
+
+#define VSX_IMPL_LOAD_L8(Tvec, Tp)                                              \
+FORCE_INLINE(Tvec) vec_ld_l8(const Tp *p)                                       \
+{ return __VSX_LOAD_L8(Tvec, p); }                                              \
+FORCE_INLINE(Tvec) vec_ldz_l8(const Tp *p)                                      \
+{                                                                               \
+    static const vec_bdword2 mask = {0xFFFFFFFFFFFFFFFF, 0x0000000000000000};   \
+    return vec_and(vec_ld_l8(p), (Tvec)mask);                                   \
+}
+VSX_IMPL_LOAD_L8(vec_uchar16, uchar)
+VSX_IMPL_LOAD_L8(vec_char16, schar)
+VSX_IMPL_LOAD_L8(vec_ushort8, ushort)
+VSX_IMPL_LOAD_L8(vec_short8, short)
+VSX_IMPL_LOAD_L8(vec_uint4, uint)
+VSX_IMPL_LOAD_L8(vec_int4, int)
+VSX_IMPL_LOAD_L8(vec_float4, float)
+VSX_IMPL_LOAD_L8(vec_udword2, uint64)
+VSX_IMPL_LOAD_L8(vec_dword2, int64)
+VSX_IMPL_LOAD_L8(vec_double2, double)
+
+// logical not
+#define vec_not(a) vec_nor(a, a)
+
+// power9 yaya
+// not equal
+#ifndef vec_cmpne
+#   define vec_cmpne(a, b) vec_not(vec_cmpeq(a, b))
+#endif
+
+// absoulte difference
+#ifndef vec_absd
+#   define vec_absd(a, b) vec_sub(vec_max(a, b), vec_min(a, b))
+#endif
+
+/*
+ * Implement vec_unpacklu and vec_unpackhu
+ * since vec_unpackl, vec_unpackh only support signed integers
+**/
+#define VSX_IMPL_UNPACKU(rt, rg, zero)                  \
+FORCE_INLINE(rt) vec_unpacklu(const rg& a)              \
+{ return reinterpret_cast<rt>(vec_mergel(a, zero)); }   \
+FORCE_INLINE(rt) vec_unpackhu(const rg& a)              \
+{ return reinterpret_cast<rt>(vec_mergeh(a, zero));  }
+
+VSX_IMPL_UNPACKU(vec_ushort8, vec_uchar16, vec_uchar16_z)
+VSX_IMPL_UNPACKU(vec_uint4, vec_ushort8, vec_ushort8_z)
+VSX_IMPL_UNPACKU(vec_udword2, vec_uint4, vec_uint4_z)
+
+/*
+ * Implement vec_mergesqe and vec_mergesqo
+ * Merges the sequence values of even and odd elements of two vectors
+*/
+// 16
+#define perm16_mergesqe 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+#define perm16_mergesqo 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+VSX_IMPL_PERM(vec_uchar16, vec_mergesqe, perm16_mergesqe)
+VSX_IMPL_PERM(vec_uchar16, vec_mergesqo, perm16_mergesqo)
+VSX_IMPL_PERM(vec_char16, vec_mergesqe, perm16_mergesqe)
+VSX_IMPL_PERM(vec_char16, vec_mergesqo, perm16_mergesqo)
+// 8
+#define perm8_mergesqe 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
+#define perm8_mergesqo 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
+VSX_IMPL_PERM(vec_ushort8, vec_mergesqe, perm8_mergesqe)
+VSX_IMPL_PERM(vec_ushort8, vec_mergesqo, perm8_mergesqo)
+VSX_IMPL_PERM(vec_short8, vec_mergesqe, perm8_mergesqe)
+VSX_IMPL_PERM(vec_short8, vec_mergesqo, perm8_mergesqo)
+// 4
+#define perm4_mergesqe 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+#define perm4_mergesqo 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+VSX_IMPL_PERM(vec_uint4, vec_mergesqe, perm4_mergesqe)
+VSX_IMPL_PERM(vec_uint4, vec_mergesqo, perm4_mergesqo)
+VSX_IMPL_PERM(vec_int4, vec_mergesqe, perm4_mergesqe)
+VSX_IMPL_PERM(vec_int4, vec_mergesqo, perm4_mergesqo)
+VSX_IMPL_PERM(vec_float4, vec_mergesqe, perm4_mergesqe)
+VSX_IMPL_PERM(vec_float4, vec_mergesqo, perm4_mergesqo)
+// 2
+VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqe, vec_mergeh)
+VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqo, vec_mergel)
+VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mergesqe, vec_mergeh)
+VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mergesqo, vec_mergel)
+VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqe, vec_mergeh)
+VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqo, vec_mergel)
+
+/*
+ * Implement vec_mergesqh and vec_mergesql
+ * Merges the sequence most and least significant halves of two vectors
+*/
+#define VSX_IMPL_MERGESQHL(Tvec)                                    \
+FORCE_INLINE(Tvec) vec_mergesqh(const Tvec& a, const Tvec& b)       \
+{ return (Tvec)vec_mergeh(vec_udword2_c(a), vec_udword2_c(b)); }    \
+FORCE_INLINE(Tvec) vec_mergesql(const Tvec& a, const Tvec& b)       \
+{ return (Tvec)vec_mergel(vec_udword2_c(a), vec_udword2_c(b)); }
+VSX_IMPL_MERGESQHL(vec_uchar16)
+VSX_IMPL_MERGESQHL(vec_char16)
+VSX_IMPL_MERGESQHL(vec_ushort8)
+VSX_IMPL_MERGESQHL(vec_short8)
+VSX_IMPL_MERGESQHL(vec_uint4)
+VSX_IMPL_MERGESQHL(vec_int4)
+VSX_IMPL_MERGESQHL(vec_float4)
+VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqh, vec_mergeh)
+VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesql, vec_mergel)
+VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mergesqh, vec_mergeh)
+VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mergesql, vec_mergel)
+VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqh, vec_mergeh)
+VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesql, vec_mergel)
+
+
+// 2 and 4 channels interleave for all types except 2 lanes
+#define VSX_IMPL_ST_INTERLEAVE(Tp, Tvec)                                    \
+FORCE_INLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, Tp* ptr) \
+{                                                                           \
+    vsx_stf(vec_mergeh(a, b), 0, ptr);                                      \
+    vsx_stf(vec_mergel(a, b), 16, ptr);                                     \
+}                                                                           \
+FORCE_INLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b,          \
+                                     const Tvec& c, const Tvec& d, Tp* ptr) \
+{                                                                           \
+    Tvec ac = vec_mergeh(a, c);                                             \
+    Tvec bd = vec_mergeh(b, d);                                             \
+    vsx_stf(vec_mergeh(ac, bd), 0, ptr);                                    \
+    vsx_stf(vec_mergel(ac, bd), 16, ptr);                                   \
+    ac = vec_mergel(a, c);                                                  \
+    bd = vec_mergel(b, d);                                                  \
+    vsx_stf(vec_mergeh(ac, bd), 32, ptr);                                   \
+    vsx_stf(vec_mergel(ac, bd), 48, ptr);                                   \
+}
+VSX_IMPL_ST_INTERLEAVE(uchar, vec_uchar16)
+VSX_IMPL_ST_INTERLEAVE(schar, vec_char16)
+VSX_IMPL_ST_INTERLEAVE(ushort, vec_ushort8)
+VSX_IMPL_ST_INTERLEAVE(short, vec_short8)
+VSX_IMPL_ST_INTERLEAVE(uint, vec_uint4)
+VSX_IMPL_ST_INTERLEAVE(int, vec_int4)
+VSX_IMPL_ST_INTERLEAVE(float, vec_float4)
+
+// 2 and 4 channels deinterleave for 16 lanes
+#define VSX_IMPL_ST_DINTERLEAVE_8(Tp, Tvec)                                 \
+FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b)     \
+{                                                                           \
+    Tvec v0 = vsx_ld(0, ptr);                                               \
+    Tvec v1 = vsx_ld(16, ptr);                                              \
+    a = vec_mergesqe(v0, v1);                                               \
+    b = vec_mergesqo(v0, v1);                                               \
+}                                                                           \
+FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b,     \
+                                       Tvec& c, Tvec& d)                    \
+{                                                                           \
+    Tvec v0 = vsx_ld(0, ptr);                                               \
+    Tvec v1 = vsx_ld(16, ptr);                                              \
+    Tvec v2 = vsx_ld(32, ptr);                                              \
+    Tvec v3 = vsx_ld(48, ptr);                                              \
+    Tvec m0 = vec_mergesqe(v0, v1);                                         \
+    Tvec m1 = vec_mergesqe(v2, v3);                                         \
+    a = vec_mergesqe(m0, m1);                                               \
+    c = vec_mergesqo(m0, m1);                                               \
+    m0 = vec_mergesqo(v0, v1);                                              \
+    m1 = vec_mergesqo(v2, v3);                                              \
+    b = vec_mergesqe(m0, m1);                                               \
+    d = vec_mergesqo(m0, m1);                                               \
+}
+VSX_IMPL_ST_DINTERLEAVE_8(uchar, vec_uchar16)
+VSX_IMPL_ST_DINTERLEAVE_8(schar, vec_char16)
+
+// 2 and 4 channels deinterleave for 8 lanes
+#define VSX_IMPL_ST_DINTERLEAVE_16(Tp, Tvec)                                \
+FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b)     \
+{                                                                           \
+    Tvec v0 = vsx_ld(0, ptr);                                               \
+    Tvec v1 = vsx_ld(8, ptr);                                               \
+    a = vec_mergesqe(v0, v1);                                               \
+    b = vec_mergesqo(v0, v1);                                               \
+}                                                                           \
+FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b,     \
+                                       Tvec& c, Tvec& d)                    \
+{                                                                           \
+    Tvec v0 = vsx_ld(0, ptr);                                               \
+    Tvec v1 = vsx_ld(8, ptr);                                               \
+    Tvec m0 = vec_mergeh(v0, v1);                                           \
+    Tvec m1 = vec_mergel(v0, v1);                                           \
+    Tvec ab0 = vec_mergeh(m0, m1);                                          \
+    Tvec cd0 = vec_mergel(m0, m1);                                          \
+    v0 = vsx_ld(16, ptr);                                                   \
+    v1 = vsx_ld(24, ptr);                                                   \
+    m0 = vec_mergeh(v0, v1);                                                \
+    m1 = vec_mergel(v0, v1);                                                \
+    Tvec ab1 = vec_mergeh(m0, m1);                                          \
+    Tvec cd1 = vec_mergel(m0, m1);                                          \
+    a = vec_mergesqh(ab0, ab1);                                             \
+    b = vec_mergesql(ab0, ab1);                                             \
+    c = vec_mergesqh(cd0, cd1);                                             \
+    d = vec_mergesql(cd0, cd1);                                             \
+}
+VSX_IMPL_ST_DINTERLEAVE_16(ushort, vec_ushort8)
+VSX_IMPL_ST_DINTERLEAVE_16(short, vec_short8)
+
+// 2 and 4 channels deinterleave for 4 lanes
+#define VSX_IMPL_ST_DINTERLEAVE_32(Tp, Tvec)                                \
+FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b)     \
+{                                                                           \
+    a = vsx_ld(0, ptr);                                                     \
+    b = vsx_ld(4, ptr);                                                     \
+    Tvec m0 = vec_mergeh(a, b);                                             \
+    Tvec m1 = vec_mergel(a, b);                                             \
+    a = vec_mergeh(m0, m1);                                                 \
+    b = vec_mergel(m0, m1);                                                 \
+}                                                                           \
+FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b,     \
+                                       Tvec& c, Tvec& d)                    \
+{                                                                           \
+    Tvec v0 = vsx_ld(0, ptr);                                               \
+    Tvec v1 = vsx_ld(4, ptr);                                               \
+    Tvec v2 = vsx_ld(8, ptr);                                               \
+    Tvec v3 = vsx_ld(12, ptr);                                              \
+    Tvec m0 = vec_mergeh(v0, v2);                                           \
+    Tvec m1 = vec_mergeh(v1, v3);                                           \
+    a = vec_mergeh(m0, m1);                                                 \
+    b = vec_mergel(m0, m1);                                                 \
+    m0 = vec_mergel(v0, v2);                                                \
+    m1 = vec_mergel(v1, v3);                                                \
+    c = vec_mergeh(m0, m1);                                                 \
+    d = vec_mergel(m0, m1);                                                 \
+}
+VSX_IMPL_ST_DINTERLEAVE_32(uint, vec_uint4)
+VSX_IMPL_ST_DINTERLEAVE_32(int, vec_int4)
+VSX_IMPL_ST_DINTERLEAVE_32(float, vec_float4)
+
+// 2 and 4 channels interleave and deinterleave for 2 lanes
+#define VSX_IMPL_ST_D_INTERLEAVE_64(Tp, Tvec, ld_func, st_func)             \
+FORCE_INLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, Tp* ptr) \
+{                                                                           \
+    st_func(vec_mergeh(a, b), 0, ptr);                                      \
+    st_func(vec_mergel(a, b), 2, ptr);                                      \
+}                                                                           \
+FORCE_INLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b,          \
+                                     const Tvec& c, const Tvec& d, Tp* ptr) \
+{                                                                           \
+    st_func(vec_mergeh(a, b), 0, ptr);                                      \
+    st_func(vec_mergel(a, b), 2, ptr);                                      \
+    st_func(vec_mergeh(c, d), 4, ptr);                                      \
+    st_func(vec_mergel(c, d), 6, ptr);                                      \
+}                                                                           \
+FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b)     \
+{                                                                           \
+    Tvec m0 = ld_func(0, ptr);                                              \
+    Tvec m1 = ld_func(2, ptr);                                              \
+    a = vec_mergeh(m0, m1);                                                 \
+    b = vec_mergel(m0, m1);                                                 \
+}                                                                           \
+FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b,     \
+                                       Tvec& c, Tvec& d)                    \
+{                                                                           \
+    Tvec v0 = ld_func(0, ptr);                                              \
+    Tvec v1 = ld_func(2, ptr);                                              \
+    a = vec_mergeh(v0, v1);                                                 \
+    b = vec_mergel(v0, v1);                                                 \
+    v0 = ld_func(4, ptr);                                                   \
+    v1 = ld_func(6, ptr);                                                   \
+    c = vec_mergeh(v0, v1);                                                 \
+    d = vec_mergel(v0, v1);                                                 \
+}
+VSX_IMPL_ST_D_INTERLEAVE_64(int64, vec_dword2, vsx_ld2, vsx_st2)
+VSX_IMPL_ST_D_INTERLEAVE_64(uint64, vec_udword2, vsx_ld2, vsx_st2)
+VSX_IMPL_ST_D_INTERLEAVE_64(double, vec_double2, vsx_ld, vsx_st)
+
+/* 3 channels */
+#define VSX_IMPL_ST_INTERLEAVE_3CH_16(Tp, Tvec)                                                   \
+FORCE_INLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b,                                \
+                                     const Tvec& c, Tp* ptr)                                      \
+{                                                                                                 \
+    static const vec_uchar16 a12 = {0, 16, 0, 1, 17, 0, 2, 18, 0, 3, 19, 0, 4, 20, 0, 5};         \
+    static const vec_uchar16 a123 = {0, 1, 16, 3, 4, 17, 6, 7, 18, 9, 10, 19, 12, 13, 20, 15};    \
+    vsx_st(vec_perm(vec_perm(a, b, a12), c, a123), 0, ptr);                                       \
+    static const vec_uchar16 b12 = {21, 0, 6, 22, 0, 7, 23, 0, 8, 24, 0, 9, 25, 0, 10, 26};       \
+    static const vec_uchar16 b123 = {0, 21, 2, 3, 22, 5, 6, 23, 8, 9, 24, 11, 12, 25, 14, 15};    \
+    vsx_st(vec_perm(vec_perm(a, b, b12), c, b123), 16, ptr);                                      \
+    static const vec_uchar16 c12 = {0, 11, 27, 0, 12, 28, 0, 13, 29, 0, 14, 30, 0, 15, 31, 0};    \
+    static const vec_uchar16 c123 = {26, 1, 2, 27, 4, 5, 28, 7, 8, 29, 10, 11, 30, 13, 14, 31};   \
+    vsx_st(vec_perm(vec_perm(a, b, c12), c, c123), 32, ptr);                                      \
+}                                                                                                 \
+FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, Tvec& c)                  \
+{                                                                                                 \
+    Tvec v1 = vsx_ld(0, ptr);                                                                     \
+    Tvec v2 = vsx_ld(16, ptr);                                                                    \
+    Tvec v3 = vsx_ld(32, ptr);                                                                    \
+    static const vec_uchar16 a12_perm = {0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 0, 0, 0, 0, 0};  \
+    static const vec_uchar16 a123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 17, 20, 23, 26, 29};  \
+    a = vec_perm(vec_perm(v1, v2, a12_perm), v3, a123_perm);                                      \
+    static const vec_uchar16 b12_perm = {1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 0, 0, 0, 0, 0}; \
+    static const vec_uchar16 b123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 18, 21, 24, 27, 30};  \
+    b = vec_perm(vec_perm(v1, v2, b12_perm), v3, b123_perm);                                      \
+    static const vec_uchar16 c12_perm = {2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 0, 0, 0, 0, 0};  \
+    static const vec_uchar16 c123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 19, 22, 25, 28, 31};  \
+    c = vec_perm(vec_perm(v1, v2, c12_perm), v3, c123_perm);                                      \
+}
+VSX_IMPL_ST_INTERLEAVE_3CH_16(uchar, vec_uchar16)
+VSX_IMPL_ST_INTERLEAVE_3CH_16(schar, vec_char16)
+
+#define VSX_IMPL_ST_INTERLEAVE_3CH_8(Tp, Tvec)                                                    \
+FORCE_INLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b,                                \
+                                     const Tvec& c, Tp* ptr)                                      \
+{                                                                                                 \
+    static const vec_uchar16 a12 = {0, 1, 16, 17, 0, 0, 2, 3, 18, 19, 0, 0, 4, 5, 20, 21};        \
+    static const vec_uchar16 a123 = {0, 1, 2, 3, 16, 17, 6, 7, 8, 9, 18, 19, 12, 13, 14, 15};     \
+    vsx_st(vec_perm(vec_perm(a, b, a12), c, a123), 0, ptr);                                       \
+    static const vec_uchar16 b12 = {0, 0, 6, 7, 22, 23, 0, 0, 8, 9, 24, 25, 0, 0, 10, 11};        \
+    static const vec_uchar16 b123 = {20, 21, 2, 3, 4, 5, 22, 23, 8, 9, 10, 11, 24, 25, 14, 15};   \
+    vsx_st(vec_perm(vec_perm(a, b, b12), c, b123), 8, ptr);                                       \
+    static const vec_uchar16 c12 = {26, 27, 0, 0, 12, 13, 28, 29, 0, 0, 14, 15, 30, 31, 0, 0};    \
+    static const vec_uchar16 c123 = {0, 1, 26, 27, 4, 5, 6, 7, 28, 29, 10, 11, 12, 13, 30, 31};   \
+    vsx_st(vec_perm(vec_perm(a, b, c12), c, c123), 16, ptr);                                      \
+}                                                                                                 \
+FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, Tvec& c)                  \
+{                                                                                                 \
+    Tvec v1 = vsx_ld(0, ptr);                                                                     \
+    Tvec v2 = vsx_ld(8, ptr);                                                                     \
+    Tvec v3 = vsx_ld(16, ptr);                                                                    \
+    static const vec_uchar16 a12_perm = {0, 1, 6, 7, 12, 13, 18, 19, 24, 25, 30, 31, 0, 0, 0, 0}; \
+    static const vec_uchar16 a123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 20, 21, 26, 27};  \
+    a = vec_perm(vec_perm(v1, v2, a12_perm), v3, a123_perm);                                      \
+    static const vec_uchar16 b12_perm = {2, 3, 8, 9, 14, 15, 20, 21, 26, 27, 0, 0, 0, 0, 0, 0};   \
+    static const vec_uchar16 b123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 17, 22, 23, 28, 29};  \
+    b = vec_perm(vec_perm(v1, v2, b12_perm), v3, b123_perm);                                      \
+    static const vec_uchar16 c12_perm = {4, 5, 10, 11, 16, 17, 22, 23, 28, 29, 0, 0, 0, 0, 0, 0}; \
+    static const vec_uchar16 c123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 18, 19, 24, 25, 30, 31};  \
+    c = vec_perm(vec_perm(v1, v2, c12_perm), v3, c123_perm);                                      \
+}
+VSX_IMPL_ST_INTERLEAVE_3CH_8(ushort, vec_ushort8)
+VSX_IMPL_ST_INTERLEAVE_3CH_8(short, vec_short8)
+
+#define VSX_IMPL_ST_INTERLEAVE_3CH_4(Tp, Tvec)                                                     \
+FORCE_INLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b,                                 \
+                                     const Tvec& c, Tp* ptr)                                       \
+{                                                                                                  \
+    Tvec hbc = vec_mergeh(b, c);                                                                   \
+    static const vec_uchar16 ahbc = {0, 1, 2, 3, 16, 17, 18, 19, 20, 21, 22, 23, 4, 5, 6, 7};      \
+    vsx_st(vec_perm(a, hbc, ahbc), 0, ptr);                                                        \
+    Tvec lab = vec_mergel(a, b);                                                                   \
+    vsx_st(vec_sld(lab, hbc, 8), 4, ptr);                                                          \
+    static const vec_uchar16 clab = {8, 9, 10, 11, 24, 25, 26, 27, 28, 29, 30, 31, 12, 13, 14, 15};\
+    vsx_st(vec_perm(c, lab, clab), 8, ptr);                                                        \
+}                                                                                                  \
+FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, Tvec& c)                   \
+{                                                                                                  \
+    Tvec v1 = vsx_ld(0, ptr);                                                                      \
+    Tvec v2 = vsx_ld(4, ptr);                                                                      \
+    Tvec v3 = vsx_ld(8, ptr);                                                                      \
+    static const vec_uchar16 flp = {0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31};   \
+    a = vec_perm(v1, vec_sld(v3, v2, 8), flp);                                                     \
+    static const vec_uchar16 flp2 = {28, 29, 30, 31, 0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19};  \
+    b = vec_perm(v2, vec_sld(v1, v3, 8), flp2);                                                    \
+    c = vec_perm(vec_sld(v2, v1, 8), v3, flp);                                                     \
+}
+VSX_IMPL_ST_INTERLEAVE_3CH_4(uint, vec_uint4)
+VSX_IMPL_ST_INTERLEAVE_3CH_4(int, vec_int4)
+VSX_IMPL_ST_INTERLEAVE_3CH_4(float, vec_float4)
+
+#define VSX_IMPL_ST_INTERLEAVE_3CH_2(Tp, Tvec, ld_func, st_func)     \
+FORCE_INLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b,   \
+                                     const Tvec& c, Tp* ptr)         \
+{                                                                    \
+    st_func(vec_mergeh(a, b), 0, ptr);                               \
+    st_func(vec_permi(c, a, 1), 2, ptr);                             \
+    st_func(vec_mergel(b, c), 4, ptr);                               \
+}                                                                    \
+FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a,       \
+                                       Tvec& b, Tvec& c)             \
+{                                                                    \
+    Tvec v1 = ld_func(0, ptr);                                       \
+    Tvec v2 = ld_func(2, ptr);                                       \
+    Tvec v3 = ld_func(4, ptr);                                       \
+    a = vec_permi(v1, v2, 1);                                        \
+    b = vec_permi(v1, v3, 2);                                        \
+    c = vec_permi(v2, v3, 1);                                        \
+}
+VSX_IMPL_ST_INTERLEAVE_3CH_2(int64, vec_dword2, vsx_ld2, vsx_st2)
+VSX_IMPL_ST_INTERLEAVE_3CH_2(uint64, vec_udword2, vsx_ld2, vsx_st2)
+VSX_IMPL_ST_INTERLEAVE_3CH_2(double, vec_double2, vsx_ld, vsx_st)
+
+#endif // CV_VSX
+
+//! @}
+
+#endif // OPENCV_HAL_VSX_UTILS_HPP
--- a/modules/core/src/precomp.hpp
+++ b/modules/core/src/precomp.hpp
@ -85,7 +85,7 @@
 #include "opencv2/core/hal/intrin.hpp"
 #include "opencv2/core/sse_utils.hpp"
 #include "opencv2/core/neon_utils.hpp"
-
+#include "opencv2/core/vsx_utils.hpp"
 #include "arithm_core.hpp"
 #include "hal_replacement.hpp"

--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@ -80,6 +80,18 @@ Mutex* __initialization_mutex_initializer = &getInitializationMutex();
 #  include <cpu-features.h>
 #endif

+#ifndef __VSX__
+# if defined __PPC64__ && defined __linux__
+#   include "sys/auxv.h"
+#   ifndef AT_HWCAP2
+#     define AT_HWCAP2 26
+#   endif
+#   ifndef PPC_FEATURE2_ARCH_2_07
+#     define PPC_FEATURE2_ARCH_2_07 0x80000000
+#   endif
+# endif
+#endif
+
 #if defined _WIN32 || defined WINCE
 #ifndef _WIN32_WINNT           // This is needed for the declaration of TryEnterCriticalSection in winbase.h with Visual Studio 2005 (and older?)
  #define _WIN32_WINNT 0x0400  // http://msdn.microsoft.com/en-us/library/ms686857(VS.85).aspx
@ -295,6 +307,8 @@ struct HWFeatures
        g_hwFeatureNames[CPU_AVX_512VL] = "AVX512VL";

        g_hwFeatureNames[CPU_NEON] = "NEON";
+
+        g_hwFeatureNames[CPU_VSX] = "VSX";
    }

    void initialize(void)
@ -504,6 +518,16 @@ struct HWFeatures
    #endif
    #endif

+    #ifdef __VSX__
+        have[CV_CPU_VSX] = true;
+    #elif (defined __PPC64__ && defined __linux__)
+        uint64 hwcaps = getauxval(AT_HWCAP);
+        uint64 hwcap2 = getauxval(AT_HWCAP2);
+        have[CV_CPU_VSX] = (hwcaps & PPC_FEATURE_PPC_LE && hwcaps & PPC_FEATURE_HAS_VSX && hwcap2 & PPC_FEATURE2_ARCH_2_07);
+    #else
+        have[CV_CPU_VSX] = false;
+    #endif
+
        int baseline_features[] = { CV_CPU_BASELINE_FEATURES };
        if (!checkFeatures(baseline_features, sizeof(baseline_features) / sizeof(baseline_features[0])))
        {
--- a/modules/ts/src/ts_func.cpp
+++ b/modules/ts/src/ts_func.cpp
@ -3091,6 +3091,9 @@ void printVersionInfo(bool useStdOut)
 #if CV_FP16
    if (checkHardwareSupport(CV_CPU_FP16)) cpu_features += " fp16";
 #endif
+#if CV_VSX
+    if (checkHardwareSupport(CV_CPU_VSX)) cpu_features += " VSX";
+#endif

    cpu_features.erase(0, 1); // erase initial space