@@ -51,18 +51,6 @@
//! @{
# if CV_VSX
# define FORCE_INLINE(tp) extern inline tp __attribute__((always_inline))
# define VSX_REDIRECT_1RG(rt, rg, fnm, fn2) \
FORCE_INLINE(rt) fnm(const rg& a) { return fn2(a); }
# define VSX_REDIRECT_2RG(rt, rg, fnm, fn2) \
FORCE_INLINE(rt) fnm(const rg& a, const rg& b) { return fn2(a, b); }
# define VSX_IMPL_PERM(rt, fnm, ...) \
FORCE_INLINE(rt) fnm(const rt& a, const rt& b) \
{ static const vec_uchar16 perm = {__VA_ARGS__}; return vec_perm(a, b, perm); }
# define __VSX_S16__(c, v) (c){v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v}
# define __VSX_S8__(c, v) (c){v, v, v, v, v, v, v, v}
# define __VSX_S4__(c, v) (c){v, v, v, v}
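// Illustrative expansion (editor's sketch, not part of the header): the splat
// helpers above just repeat one scalar across a vector literal, e.g.
//   __VSX_S4__(vec_int4, 1)  ->  (vec_int4){1, 1, 1, 1}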
@@ -172,10 +160,19 @@ typedef __vector double vec_double2;
# define vec_bdword2_f (__VSX_S2__(vec_bdword2, 0))
# define vec_bdword2_t (__VSX_S2__(vec_bdword2, 1))
# define FORCE_INLINE(tp) extern inline tp __attribute__((always_inline))
# define VSX_REDIRECT_1RG(rt, rg, fnm, fn2) \
FORCE_INLINE(rt) fnm(const rg& a) { return fn2(a); }
# define VSX_REDIRECT_2RG(rt, rg, fnm, fn2) \
FORCE_INLINE(rt) fnm(const rg& a, const rg& b) { return fn2(a, b); }
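// Hand-expanded for illustration (editor's note): with FORCE_INLINE defined
// as above, VSX_REDIRECT_2RG(vec_float4, vec_float4, vec_mul, __builtin_vec_mul)
// produces
//   extern inline vec_float4 __attribute__((always_inline))
//   vec_mul(const vec_float4& a, const vec_float4& b)
//   { return __builtin_vec_mul(a, b); }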
/*
 * GCC VSX compatibility
**/
# if defined(__GNUG__) && !defined(__clang__)
// inline asm helper
# define VSX_IMPL_1RG(rt, rto, rg, rgo, opc, fnm) \
@@ -193,7 +190,7 @@ FORCE_INLINE(rt) fnm(const rg& a, const rg& b) \
# define VSX_IMPL_2VRG(rt, rg, opc, fnm) VSX_IMPL_2VRG_F(rt, rg, #opc" %0,%1,%2", fnm)
# if __GNUG__ < 7
// up to GCC 6 vec_mul only supports precisions and llong
# ifdef vec_mul
# undef vec_mul
# endif
@@ -209,15 +206,15 @@ FORCE_INLINE(rt) fnm(const rg& a, const rg& b) \
                                          8, 9, 24, 25, 12, 13, 28, 29}; \
    return vec_perm(Tcast(vec_mule(a, b)), Tcast(vec_mulo(a, b)), even_perm); \
}
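// Editor's note (hand-worked, assuming little-endian lane order): vec_mule
// and vec_mulo return the full 32-bit products of the even and odd halfword
// lanes; even_perm selects the low halfword of each product and interleaves
// the two results back into source lane order, i.e. a truncating
// element-wise multiply for all eight 16-bit lanes.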
VSX_IMPL_MULH(vec_short8,  vec_short8_c)
VSX_IMPL_MULH(vec_ushort8, vec_ushort8_c)
// vmuluwm can be used for unsigned or signed integers, that's what they said
VSX_IMPL_2VRG(vec_int4,  vec_int4,  vmuluwm, vec_mul)
VSX_IMPL_2VRG(vec_uint4, vec_uint4, vmuluwm, vec_mul)
// redirect to GCC builtin vec_mul, since it already supports precisions and llong
VSX_REDIRECT_2RG(vec_float4,  vec_float4,  vec_mul, __builtin_vec_mul)
VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mul, __builtin_vec_mul)
VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mul, __builtin_vec_mul)
VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mul, __builtin_vec_mul)
# endif // __GNUG__ < 7
@@ -237,74 +234,120 @@ FORCE_INLINE(rt) fnm(const rg& a, const rg& b) \
# define VSX_IMPL_CMPGE(rt, rg, opc, fnm) \
VSX_IMPL_2VRG_F(rt, rg, #opc" %0,%2,%1\n\t xxlnor %x0,%x0,%x0", fnm)
VSX_IMPL_CMPGE(vec_bchar16, vec_char16,  vcmpgtsb, vec_cmpge)
VSX_IMPL_CMPGE(vec_bchar16, vec_uchar16, vcmpgtub, vec_cmpge)
VSX_IMPL_CMPGE(vec_bshort8, vec_short8,  vcmpgtsh, vec_cmpge)
VSX_IMPL_CMPGE(vec_bshort8, vec_ushort8, vcmpgtuh, vec_cmpge)
VSX_IMPL_CMPGE(vec_bint4,   vec_int4,    vcmpgtsw, vec_cmpge)
VSX_IMPL_CMPGE(vec_bint4,   vec_uint4,   vcmpgtuw, vec_cmpge)
VSX_IMPL_CMPGE(vec_bdword2, vec_dword2,  vcmpgtsd, vec_cmpge)
VSX_IMPL_CMPGE(vec_bdword2, vec_udword2, vcmpgtud, vec_cmpge)
// redirect to GCC builtin cmpge, since it already supports precisions
VSX_REDIRECT_2RG(vec_bint4,   vec_float4,  vec_cmpge, __builtin_vec_cmpge)
VSX_REDIRECT_2RG(vec_bdword2, vec_double2, vec_cmpge, __builtin_vec_cmpge)
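// Editor's note (illustrative): VSX only provides greater-than compares for
// the integer types, so VSX_IMPL_CMPGE computes a >= b as ~(b > a) -- the
// vcmpgt* instruction runs with swapped operands (%2,%1) and xxlnor then
// complements the resulting mask in place.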
// up to gcc5 vec_nor doesn't support bool long long
# undef vec_nor
template<typename T>
VSX_REDIRECT_2RG(T, T, vec_nor, __builtin_vec_nor)
FORCE_INLINE(vec_bdword2) vec_nor(const vec_bdword2& a, const vec_bdword2& b)
{ return vec_bdword2_c(__builtin_vec_nor(vec_dword2_c(a), vec_dword2_c(b))); }
# endif // __GNUG__ < 6
# if __GNUG__ < 5
// vec_xxpermdi in gcc4 is missing little-endian support, just like clang
# define vec_permi(a, b, c) vec_xxpermdi(b, a, (3 ^ ((c & 1) << 1 | c >> 1)))
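// Editor's note (hand-checked arithmetic, not part of the patch): swapping
// the operands and remapping the immediate with (3 ^ ((c & 1) << 1 | c >> 1))
// translates a little-endian doubleword selector into xxpermdi's big-endian
// one; the two selector bits are swapped, then both are inverted:
//   c = 0 -> 3,  c = 1 -> 1,  c = 2 -> 2,  c = 3 -> 0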
// vec_packs doesn't support double words in gcc4
# undef vec_packs
VSX_REDIRECT_2RG(vec_char16,  vec_short8,  vec_packs, __builtin_vec_packs)
VSX_REDIRECT_2RG(vec_uchar16, vec_ushort8, vec_packs, __builtin_vec_packs)
VSX_REDIRECT_2RG(vec_short8,  vec_int4,    vec_packs, __builtin_vec_packs)
VSX_REDIRECT_2RG(vec_ushort8, vec_uint4,   vec_packs, __builtin_vec_packs)
VSX_IMPL_2VRG_F(vec_int4,  vec_dword2,  "vpksdss %0,%2,%1", vec_packs)
VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs)
# else
# define vec_permi vec_xxpermdi
# endif // __GNUG__ < 5
// shift left double by word immediate
# ifndef vec_sldw
# define vec_sldw __builtin_vsx_xxsldwi
# endif
// vector population count
VSX_IMPL_1VRG(vec_uchar16, vec_uchar16, vpopcntb, vec_popcntu)
VSX_IMPL_1VRG(vec_uchar16, vec_char16,  vpopcntb, vec_popcntu)
VSX_IMPL_1VRG(vec_ushort8, vec_ushort8, vpopcnth, vec_popcntu)
VSX_IMPL_1VRG(vec_ushort8, vec_short8,  vpopcnth, vec_popcntu)
VSX_IMPL_1VRG(vec_uint4,   vec_uint4,   vpopcntw, vec_popcntu)
VSX_IMPL_1VRG(vec_uint4,   vec_int4,    vpopcntw, vec_popcntu)
VSX_IMPL_1VRG(vec_udword2, vec_udword2, vpopcntd, vec_popcntu)
VSX_IMPL_1VRG(vec_udword2, vec_dword2,  vpopcntd, vec_popcntu)
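// usage sketch (editor's example; assumes the vec_int4_sp splat helper built
// on __VSX_S4__ above, and a POWER8 target):
//   vec_int4  v    = vec_int4_sp(-1);   // four lanes of 0xFFFFFFFF
//   vec_uint4 bits = vec_popcntu(v);    // four lanes of 32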
// converts between single and double-precision
# ifdef vec_cvf
# undef vec_cvf
# endif
VSX_REDIRECT_1RG(vec_float4,  vec_double2, vec_cvf,  __builtin_vsx_xvcvdpsp)
VSX_REDIRECT_1RG(vec_double2, vec_float4,  vec_cvfo, __builtin_vsx_xvcvspdp)

FORCE_INLINE(vec_double2) vec_cvf(const vec_float4& a)
{ return vec_cvfo(vec_sldw(a, a, 1)); }
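// Editor's note on the naming (lane numbers are an assumption for
// little-endian): the trailing 'o' means the conversion reads the odd lanes;
// vec_sldw(a, a, 1) rotates the vector by one 32-bit word so the even lanes
// land on odd positions, letting vec_cvf reuse vec_cvfo:
//   vec_float4  f = {f0, f1, f2, f3};
//   vec_double2 e = vec_cvf(f);    // {(double)f0, (double)f2}
//   vec_double2 o = vec_cvfo(f);   // {(double)f1, (double)f3}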
// converts word and doubleword to double-precision
# ifdef vec_ctd
# undef vec_ctd
# endif
VSX_IMPL_1RG(vec_double2, wd, vec_int4,    wa, xvcvsxwdp, vec_ctdo)
VSX_IMPL_1RG(vec_double2, wd, vec_uint4,   wa, xvcvuxwdp, vec_ctdo)
VSX_IMPL_1RG(vec_double2, wd, vec_dword2,  wi, xvcvsxddp, vec_ctd)
VSX_IMPL_1RG(vec_double2, wd, vec_udword2, wi, xvcvuxddp, vec_ctd)

FORCE_INLINE(vec_double2) vec_ctd(const vec_int4& a)
{ return vec_ctdo(vec_sldw(a, a, 1)); }

FORCE_INLINE(vec_double2) vec_ctd(const vec_uint4& a)
{ return vec_ctdo(vec_sldw(a, a, 1)); }
// converts word and doubleword to single-precision
# undef vec_ctf
VSX_IMPL_1RG(vec_float4, wf, vec_int4,    wa, xvcvsxwsp, vec_ctf)
VSX_IMPL_1RG(vec_float4, wf, vec_uint4,   wa, xvcvuxwsp, vec_ctf)
VSX_IMPL_1RG(vec_float4, wf, vec_dword2,  wi, xvcvsxdsp, vec_ctf)
VSX_IMPL_1RG(vec_float4, wf, vec_udword2, wi, xvcvuxdsp, vec_ctf)
// converts single and double precision to signed word
# undef vec_cts
VSX_IMPL_1RG(vec_int4, wa, vec_double2, wd, xvcvdpsxws, vec_cts)
VSX_IMPL_1RG(vec_int4, wa, vec_float4,  wf, xvcvspsxws, vec_cts)
// converts single and double precision to unsigned word
# undef vec_ctu
VSX_IMPL_1RG(vec_uint4, wa, vec_double2, wd, xvcvdpuxws, vec_ctu)
VSX_IMPL_1RG(vec_uint4, wa, vec_float4,  wf, xvcvspuxws, vec_ctu)
// converts single and double precision to signed doubleword
# ifdef vec_ctsl
# undef vec_ctsl
# endif
VSX_IMPL_1RG(vec_dword2, wi, vec_double2, wd, xvcvdpsxds, vec_ctsl)
VSX_IMPL_1RG(vec_dword2, wi, vec_float4,  wf, xvcvspsxds, vec_ctslo)

FORCE_INLINE(vec_dword2) vec_ctsl(const vec_float4& a)
{ return vec_ctslo(vec_sldw(a, a, 1)); }
// converts single and double precision to unsigned doubleword
# ifdef vec_ctul
# undef vec_ctul
# endif
VSX_IMPL_1RG(vec_udword2, wi, vec_double2, wd, xvcvdpuxds, vec_ctul)
VSX_IMPL_1RG(vec_udword2, wi, vec_float4,  wf, xvcvspuxds, vec_ctulo)

FORCE_INLINE(vec_udword2) vec_ctul(const vec_float4& a)
{ return vec_ctulo(vec_sldw(a, a, 1)); }
// just in case GCC doesn't define it
# ifndef vec_xl
@@ -327,8 +370,13 @@ VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs)
 * Also there's already an open bug https://bugs.llvm.org/show_bug.cgi?id=31837
 *
 * So we're not able to use inline asm and only use built-in functions that CLANG supports,
 * and use __builtin_convertvector if clang is missing any of the vector conversion built-in functions
*/
// convert vector helper
# define VSX_IMPL_CONVERT(rt, rg, fnm) \
FORCE_INLINE(rt) fnm(const rg& a) { return __builtin_convertvector(a, rt); }
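// Hand-expanded for illustration (editor's note):
// VSX_IMPL_CONVERT(vec_double2, vec_dword2, vec_ctd) produces
//   extern inline vec_double2 __attribute__((always_inline))
//   vec_ctd(const vec_dword2& a)
//   { return __builtin_convertvector(a, vec_double2); }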
# if __clang_major__ < 5
// implement vec_permi in a dirty way
# define VSX_IMPL_CLANG_4_PERMI(Tvec) \
@@ -362,26 +410,6 @@ VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs)
# define vec_sldw vec_xxsldwi
# endif
// Implement vec_rsqrt since clang only supports vec_rsqrte
# ifndef vec_rsqrt
FORCE_INLINE(vec_float4) vec_rsqrt(const vec_float4& a)
@@ -391,27 +419,157 @@ VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs)
{ return vec_div(vec_double2_sp(1), vec_sqrt(a)); }
# endif
// vec_popcnt should return unsigned, but clang disagrees, just like gcc does with vec_vpopcnt
# define VSX_IMPL_POPCNTU(Tvec, Tvec2, ucast) \
FORCE_INLINE(Tvec) vec_popcntu(const Tvec2& a) \
{ return ucast(vec_popcnt(a)); }
VSX_IMPL_POPCNTU(vec_uchar16, vec_char16, vec_uchar16_c);
VSX_IMPL_POPCNTU(vec_ushort8, vec_short8, vec_ushort8_c);
VSX_IMPL_POPCNTU(vec_uint4,   vec_int4,   vec_uint4_c);
// redirect unsigned types
VSX_REDIRECT_1RG(vec_uchar16, vec_uchar16, vec_popcntu, vec_popcnt)
VSX_REDIRECT_1RG(vec_ushort8, vec_ushort8, vec_popcntu, vec_popcnt)
VSX_REDIRECT_1RG(vec_uint4,   vec_uint4,   vec_popcntu, vec_popcnt)
// converts between single and double precision
# ifdef vec_cvf
# undef vec_cvf
# endif
VSX_REDIRECT_1RG(vec_float4,  vec_double2, vec_cvf,  __builtin_vsx_xvcvdpsp)
VSX_REDIRECT_1RG(vec_double2, vec_float4,  vec_cvfo, __builtin_vsx_xvcvspdp)

FORCE_INLINE(vec_double2) vec_cvf(const vec_float4& a)
{ return vec_cvfo(vec_sldw(a, a, 1)); }
// converts word and doubleword to double-precision
# ifdef vec_ctd
# undef vec_ctd
# endif
VSX_REDIRECT_1RG(vec_double2, vec_int4,  vec_ctdo, __builtin_vsx_xvcvsxwdp)
VSX_REDIRECT_1RG(vec_double2, vec_uint4, vec_ctdo, __builtin_vsx_xvcvuxwdp)
VSX_IMPL_CONVERT(vec_double2, vec_dword2,  vec_ctd)
VSX_IMPL_CONVERT(vec_double2, vec_udword2, vec_ctd)

FORCE_INLINE(vec_double2) vec_ctd(const vec_int4& a)
{ return vec_ctdo(vec_sldw(a, a, 1)); }

FORCE_INLINE(vec_double2) vec_ctd(const vec_uint4& a)
{ return vec_ctdo(vec_sldw(a, a, 1)); }
// converts word and doubleword to single-precision
# if __clang_major__ > 4
# undef vec_ctf
# endif
VSX_IMPL_CONVERT(vec_float4, vec_int4,  vec_ctf)
VSX_IMPL_CONVERT(vec_float4, vec_uint4, vec_ctf)
VSX_REDIRECT_1RG(vec_float4, vec_dword2,  vec_ctf, __builtin_vsx_xvcvsxdsp)
VSX_REDIRECT_1RG(vec_float4, vec_udword2, vec_ctf, __builtin_vsx_xvcvuxdsp)
// converts single and double precision to signed word
# if __clang_major__ > 4
# undef vec_cts
# endif // __clang_major__ > 4
VSX_REDIRECT_1RG(vec_int4, vec_double2, vec_cts, __builtin_vsx_xvcvdpsxws)
VSX_IMPL_CONVERT(vec_int4, vec_float4,  vec_cts)
// converts single and double precision to unsigned word
# if __clang_major__ > 4
# undef vec_ctu
# endif
VSX_REDIRECT_1RG(vec_uint4, vec_double2, vec_ctu, __builtin_vsx_xvcvdpuxws)
VSX_IMPL_CONVERT(vec_uint4, vec_float4,  vec_ctu)
// converts single and double precision to signed doubleword
# ifdef vec_ctsl
# undef vec_ctsl
# endif
VSX_IMPL_CONVERT(vec_dword2, vec_double2, vec_ctsl)
// __builtin_convertvector can't be used here and clang lacks the xvcvspsxds
// builtin, so go through double precision instead
FORCE_INLINE(vec_dword2) vec_ctslo(const vec_float4& a)
{ return vec_ctsl(vec_cvfo(a)); }

FORCE_INLINE(vec_dword2) vec_ctsl(const vec_float4& a)
{ return vec_ctsl(vec_cvf(a)); }
// converts single and double precision to unsigned doubleword
# ifdef vec_ctul
# undef vec_ctul
# endif
VSX_IMPL_CONVERT(vec_udword2, vec_double2, vec_ctul)
// __builtin_convertvector can't be used here and clang lacks the xvcvspuxds
// builtin, so go through double precision instead
FORCE_INLINE(vec_udword2) vec_ctulo(const vec_float4& a)
{ return vec_ctul(vec_cvfo(a)); }

FORCE_INLINE(vec_udword2) vec_ctul(const vec_float4& a)
{ return vec_ctul(vec_cvf(a)); }
# endif // CLANG VSX compatibility
/*
 * XLC VSX compatibility
**/
# if defined(__IBMCPP__)
// vector population count
# define vec_popcntu vec_popcnt
// overload and redirect, fixing the second argument to zero,
// since we only support conversions without the second argument
# define VSX_IMPL_OVERLOAD_Z2(rt, rg, fnm) \
FORCE_INLINE(rt) fnm(const rg& a) { return fnm(a, 0); }
VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_int4,    vec_ctd)
VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_uint4,   vec_ctd)
VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_dword2,  vec_ctd)
VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_udword2, vec_ctd)
VSX_IMPL_OVERLOAD_Z2(vec_float4,  vec_int4,    vec_ctf)
VSX_IMPL_OVERLOAD_Z2(vec_float4,  vec_uint4,   vec_ctf)
VSX_IMPL_OVERLOAD_Z2(vec_float4,  vec_dword2,  vec_ctf)
VSX_IMPL_OVERLOAD_Z2(vec_float4,  vec_udword2, vec_ctf)
VSX_IMPL_OVERLOAD_Z2(vec_int4,    vec_double2, vec_cts)
VSX_IMPL_OVERLOAD_Z2(vec_int4,    vec_float4,  vec_cts)
VSX_IMPL_OVERLOAD_Z2(vec_uint4,   vec_double2, vec_ctu)
VSX_IMPL_OVERLOAD_Z2(vec_uint4,   vec_float4,  vec_ctu)
VSX_IMPL_OVERLOAD_Z2(vec_dword2,  vec_double2, vec_ctsl)
VSX_IMPL_OVERLOAD_Z2(vec_dword2,  vec_float4,  vec_ctsl)
VSX_IMPL_OVERLOAD_Z2(vec_udword2, vec_double2, vec_ctul)
VSX_IMPL_OVERLOAD_Z2(vec_udword2, vec_float4,  vec_ctul)
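// Hand-expanded for illustration (editor's note):
// VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_int4, vec_ctd) produces
//   extern inline vec_double2 __attribute__((always_inline))
//   vec_ctd(const vec_int4& a)
//   { return vec_ctd(a, 0); }
// forwarding to XLC's two-argument intrinsic with the scale fixed at zero.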
// FIXME: implement conversions of odd-numbered elements in a dirty way,
// since xlc doesn't support VSX registers as operands in inline asm
# define VSX_IMPL_DIRTY_ODD(rt, rg, fnm, fn2) \
FORCE_INLINE(rt) fnm(const rg& a) { return fn2(vec_sldw(a, a, 3)); }
VSX_IMPL_DIRTY_ODD(vec_double2, vec_float4, vec_cvfo,  vec_cvf)
VSX_IMPL_DIRTY_ODD(vec_double2, vec_int4,   vec_ctdo,  vec_ctd)
VSX_IMPL_DIRTY_ODD(vec_double2, vec_uint4,  vec_ctdo,  vec_ctd)
VSX_IMPL_DIRTY_ODD(vec_dword2,  vec_float4, vec_ctslo, vec_ctsl)
VSX_IMPL_DIRTY_ODD(vec_udword2, vec_float4, vec_ctulo, vec_ctul)
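// Editor's note (reasoning, not from the patch): rotating the four 32-bit
// lanes by an odd word count flips every lane's parity, so after
// vec_sldw(a, a, 3) the odd-numbered elements sit where the plain conversion
// reads its input; the GCC path uses a rotation of 1 for the same effect,
// just in the opposite direction.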
# endif // XLC VSX compatibility
// ignore GCC warnings caused by -Wunused-but-set-variable in rare cases
# if defined(__GNUG__) && !defined(__clang__)
# define VSX_UNUSED(Tvec) Tvec __attribute__((__unused__))
# else // CLANG, XLC
# define VSX_UNUSED(Tvec) Tvec
# endif
// gcc can find its way in casting long int, while XLC and CLANG find it ambiguous
# if defined(__clang__) || defined(__IBMCPP__)
FORCE_INLINE(vec_udword2) vec_splats(uint64 v)
{ return vec_splats((unsigned long long) v); }

FORCE_INLINE(vec_dword2) vec_splats(int64 v)
{ return vec_splats((long long) v); }
# endif
/*
 * implement vsx_ld(offset, pointer), vsx_st(vector, offset, pointer)
 * load and store using an offset that depends on the pointer type
@@ -468,75 +626,6 @@ VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs)
{ vsx_stf(vec, VSX_OFFSET(o, p), (long long*)p); }
# endif
// Implement store vector bool char for XLC
# if defined(__IBMCPP__) && defined(__clang__)
FORCE_INLINE(void) vec_xst(const vec_bchar16& vec, long o, uchar* p)
{ vec_xst(vec_uchar16_c(vec), VSX_OFFSET(o, p), p); }
# endif
// load 4 unsigned bytes into uint4 vector
# define vec_ld_buw(p) vec_uint4_set((p)[0], (p)[1], (p)[2], (p)[3])
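// usage sketch (editor's example; vec_uint4_set is the per-lane constructor
// used in the define above):
//   const uchar px[4] = {1, 2, 3, 255};
//   vec_uint4 v = vec_ld_buw(px);   // {1, 2, 3, 255}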
@@ -566,14 +655,14 @@ FORCE_INLINE(Tvec) vec_ldz_l8(const Tp *p)
    return vec_and(vec_ld_l8(p), (Tvec)mask); \
}
VSX_IMPL_LOAD_L8(vec_uchar16, uchar)
VSX_IMPL_LOAD_L8(vec_char16,  schar)
VSX_IMPL_LOAD_L8(vec_ushort8, ushort)
VSX_IMPL_LOAD_L8(vec_short8,  short)
VSX_IMPL_LOAD_L8(vec_uint4,   uint)
VSX_IMPL_LOAD_L8(vec_int4,    int)
VSX_IMPL_LOAD_L8(vec_float4,  float)
VSX_IMPL_LOAD_L8(vec_udword2, uint64)
VSX_IMPL_LOAD_L8(vec_dword2,  int64)
VSX_IMPL_LOAD_L8(vec_double2, double)
// logical not
@@ -601,41 +690,45 @@ FORCE_INLINE(rt) vec_unpackhu(const rg& a) \
{ return reinterpret_cast<rt>(vec_mergeh(a, zero)); }
VSX_IMPL_UNPACKU(vec_ushort8, vec_uchar16, vec_uchar16_z)
VSX_IMPL_UNPACKU(vec_uint4,   vec_ushort8, vec_ushort8_z)
VSX_IMPL_UNPACKU(vec_udword2, vec_uint4,   vec_uint4_z)
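// Editor's note (hand-worked, assuming little-endian byte order): merging
// with a zero vector interleaves each narrow lane with a zero lane, which
// reinterpreted at double width is exactly a zero-extension, e.g.
//   vec_unpackhu({a0..a15}) -> {(ushort)a0, ..., (ushort)a7}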
/*
 * Implement vec_mergesqe and vec_mergesqo
 * Merges the sequence values of the even and odd elements of two vectors
*/
# define VSX_IMPL_PERM(rt, fnm, ...) \
FORCE_INLINE(rt) fnm(const rt& a, const rt& b) \
{ static const vec_uchar16 perm = {__VA_ARGS__}; return vec_perm(a, b, perm); }
// 16
# define perm16_mergesqe 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
# define perm16_mergesqo 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
VSX_IMPL_PERM(vec_uchar16, vec_mergesqe, perm16_mergesqe)
VSX_IMPL_PERM(vec_uchar16, vec_mergesqo, perm16_mergesqo)
VSX_IMPL_PERM(vec_char16,  vec_mergesqe, perm16_mergesqe)
VSX_IMPL_PERM(vec_char16,  vec_mergesqo, perm16_mergesqo)
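// Worked example (editor's note): with a = {a0..a15} and b = {b0..b15},
//   vec_mergesqe(a, b) -> {a0, a2, ..., a14, b0, b2, ..., b14}
//   vec_mergesqo(a, b) -> {a1, a3, ..., a15, b1, b3, ..., b15}
// since vec_perm indices 0-15 address a and 16-31 address b.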
// 8
# define perm8_mergesqe 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
# define perm8_mergesqo 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
VSX_IMPL_PERM(vec_ushort8, vec_mergesqe, perm8_mergesqe)
VSX_IMPL_PERM(vec_ushort8, vec_mergesqo, perm8_mergesqo)
VSX_IMPL_PERM(vec_short8,  vec_mergesqe, perm8_mergesqe)
VSX_IMPL_PERM(vec_short8,  vec_mergesqo, perm8_mergesqo)
// 4
# define perm4_mergesqe 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
# define perm4_mergesqo 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
VSX_IMPL_PERM(vec_uint4,  vec_mergesqe, perm4_mergesqe)
VSX_IMPL_PERM(vec_uint4,  vec_mergesqo, perm4_mergesqo)
VSX_IMPL_PERM(vec_int4,   vec_mergesqe, perm4_mergesqe)
VSX_IMPL_PERM(vec_int4,   vec_mergesqo, perm4_mergesqo)
VSX_IMPL_PERM(vec_float4, vec_mergesqe, perm4_mergesqe)
VSX_IMPL_PERM(vec_float4, vec_mergesqo, perm4_mergesqo)
// 2
VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqe, vec_mergeh)
VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqo, vec_mergel)
VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mergesqe, vec_mergeh)
VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mergesqo, vec_mergel)
VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqe, vec_mergeh)
VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqo, vec_mergel)
@@ -657,8 +750,8 @@ VSX_IMPL_MERGESQHL(vec_int4)
VSX_IMPL_MERGESQHL(vec_float4)
VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqh, vec_mergeh)
VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesql, vec_mergel)
VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mergesqh, vec_mergeh)
VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mergesql, vec_mergel)
VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqh, vec_mergeh)
VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesql, vec_mergel)
@@ -682,13 +775,13 @@ FORCE_INLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, \
    vsx_stf(vec_mergeh(ac, bd), 32, ptr); \
    vsx_stf(vec_mergel(ac, bd), 48, ptr); \
}
VSX_IMPL_ST_INTERLEAVE(uchar,  vec_uchar16)
VSX_IMPL_ST_INTERLEAVE(schar,  vec_char16)
VSX_IMPL_ST_INTERLEAVE(ushort, vec_ushort8)
VSX_IMPL_ST_INTERLEAVE(short,  vec_short8)
VSX_IMPL_ST_INTERLEAVE(uint,   vec_uint4)
VSX_IMPL_ST_INTERLEAVE(int,    vec_int4)
VSX_IMPL_ST_INTERLEAVE(float,  vec_float4)
// 2 and 4 channels deinterleave for 16 lanes
# define VSX_IMPL_ST_DINTERLEAVE_8(Tp, Tvec) \
@@ -748,7 +841,7 @@ FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, \
    d = vec_mergesql(cd0, cd1); \
}
VSX_IMPL_ST_DINTERLEAVE_16(ushort, vec_ushort8)
VSX_IMPL_ST_DINTERLEAVE_16(short,  vec_short8)
// 2 and 4 channels deinterleave for 4 lanes
# define VSX_IMPL_ST_DINTERLEAVE_32(Tp, Tvec) \
@@ -777,8 +870,8 @@ FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, \
    c = vec_mergeh(m0, m1); \
    d = vec_mergel(m0, m1); \
}
VSX_IMPL_ST_DINTERLEAVE_32(uint,  vec_uint4)
VSX_IMPL_ST_DINTERLEAVE_32(int,   vec_int4)
VSX_IMPL_ST_DINTERLEAVE_32(float, vec_float4)
// 2 and 4 channels interleave and deinterleave for 2 lanes
@@ -815,9 +908,9 @@ FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, \
    c = vec_mergeh(v0, v1); \
    d = vec_mergel(v0, v1); \
}
VSX_IMPL_ST_D_INTERLEAVE_64(int64,  vec_dword2,  vsx_ld2, vsx_st2)
VSX_IMPL_ST_D_INTERLEAVE_64(uint64, vec_udword2, vsx_ld2, vsx_st2)
VSX_IMPL_ST_D_INTERLEAVE_64(double, vec_double2, vsx_ld,  vsx_st)
/* 3 channels */
# define VSX_IMPL_ST_INTERLEAVE_3CH_16(Tp, Tvec) \
@@ -882,7 +975,7 @@ FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, Tvec& c)
    c = vec_perm(vec_perm(v1, v2, c12_perm), v3, c123_perm); \
}
VSX_IMPL_ST_INTERLEAVE_3CH_8(ushort, vec_ushort8)
VSX_IMPL_ST_INTERLEAVE_3CH_8(short,  vec_short8)
# define VSX_IMPL_ST_INTERLEAVE_3CH_4(Tp, Tvec) \
FORCE_INLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, \
@@ -907,8 +1000,8 @@ FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, Tvec& c)
    b = vec_perm(v2, vec_sld(v1, v3, 8), flp2); \
    c = vec_perm(vec_sld(v2, v1, 8), v3, flp); \
}
VSX_IMPL_ST_INTERLEAVE_3CH_4(uint,  vec_uint4)
VSX_IMPL_ST_INTERLEAVE_3CH_4(int,   vec_int4)
VSX_IMPL_ST_INTERLEAVE_3CH_4(float, vec_float4)
# define VSX_IMPL_ST_INTERLEAVE_3CH_2(Tp, Tvec, ld_func, st_func) \
@@ -929,9 +1022,9 @@ FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, \
    b = vec_permi(v1, v3, 2); \
    c = vec_permi(v2, v3, 1); \
}
VSX_IMPL_ST_INTERLEAVE_3CH_2(int64,  vec_dword2,  vsx_ld2, vsx_st2)
VSX_IMPL_ST_INTERLEAVE_3CH_2(uint64, vec_udword2, vsx_ld2, vsx_st2)
VSX_IMPL_ST_INTERLEAVE_3CH_2(double, vec_double2, vsx_ld,  vsx_st)
# endif // CV_VSX