|
|
|
@ -12,6 +12,7 @@ |
|
|
|
|
//
|
|
|
|
|
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
|
|
|
|
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
|
|
|
|
|
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
|
|
|
|
|
// Third party copyrights are property of their respective owners.
|
|
|
|
|
//
|
|
|
|
|
// Redistribution and use in source and binary forms, with or without modification,
|
|
|
|
@ -62,8 +63,11 @@ template<typename T> struct VSplit4; |
|
|
|
|
|
|
|
|
|
#define SPLIT2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ |
|
|
|
|
template<> \
|
|
|
|
|
struct name<data_type>{ \
|
|
|
|
|
void operator()(const data_type* src, data_type* dst0, data_type* dst1){ \
|
|
|
|
|
struct name<data_type> \
|
|
|
|
|
{ \
|
|
|
|
|
void operator()(const data_type* src, data_type* dst0, \
|
|
|
|
|
data_type* dst1) const \
|
|
|
|
|
{ \
|
|
|
|
|
reg_type r = load_func(src); \
|
|
|
|
|
store_func(dst0, r.val[0]); \
|
|
|
|
|
store_func(dst1, r.val[1]); \
|
|
|
|
@ -72,9 +76,11 @@ template<typename T> struct VSplit4; |
|
|
|
|
|
|
|
|
|
#define SPLIT3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ |
|
|
|
|
template<> \
|
|
|
|
|
struct name<data_type>{ \
|
|
|
|
|
struct name<data_type> \
|
|
|
|
|
{ \
|
|
|
|
|
void operator()(const data_type* src, data_type* dst0, data_type* dst1, \
|
|
|
|
|
data_type* dst2){ \
|
|
|
|
|
data_type* dst2) const \
|
|
|
|
|
{ \
|
|
|
|
|
reg_type r = load_func(src); \
|
|
|
|
|
store_func(dst0, r.val[0]); \
|
|
|
|
|
store_func(dst1, r.val[1]); \
|
|
|
|
@ -84,9 +90,11 @@ template<typename T> struct VSplit4; |
|
|
|
|
|
|
|
|
|
#define SPLIT4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ |
|
|
|
|
template<> \
|
|
|
|
|
struct name<data_type>{ \
|
|
|
|
|
struct name<data_type> \
|
|
|
|
|
{ \
|
|
|
|
|
void operator()(const data_type* src, data_type* dst0, data_type* dst1, \
|
|
|
|
|
data_type* dst2, data_type* dst3){ \
|
|
|
|
|
data_type* dst2, data_type* dst3) const \
|
|
|
|
|
{ \
|
|
|
|
|
reg_type r = load_func(src); \
|
|
|
|
|
store_func(dst0, r.val[0]); \
|
|
|
|
|
store_func(dst1, r.val[1]); \
|
|
|
|
@ -96,28 +104,174 @@ template<typename T> struct VSplit4; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
SPLIT2_KERNEL_TEMPLATE(VSplit2, uchar , uint8x16x2_t, vld2q_u8 , vst1q_u8 ); |
|
|
|
|
SPLIT2_KERNEL_TEMPLATE(VSplit2, schar , int8x16x2_t, vld2q_s8 , vst1q_s8 ); |
|
|
|
|
SPLIT2_KERNEL_TEMPLATE(VSplit2, ushort, uint16x8x2_t, vld2q_u16, vst1q_u16); |
|
|
|
|
SPLIT2_KERNEL_TEMPLATE(VSplit2, short , int16x8x2_t, vld2q_s16, vst1q_s16); |
|
|
|
|
SPLIT2_KERNEL_TEMPLATE(VSplit2, int , int32x4x2_t, vld2q_s32, vst1q_s32); |
|
|
|
|
SPLIT2_KERNEL_TEMPLATE(VSplit2, float , float32x4x2_t, vld2q_f32, vst1q_f32); |
|
|
|
|
SPLIT2_KERNEL_TEMPLATE(VSplit2, int64 , int64x1x2_t, vld2_s64 , vst1_s64 ); |
|
|
|
|
|
|
|
|
|
SPLIT3_KERNEL_TEMPLATE(VSplit3, uchar , uint8x16x3_t, vld3q_u8 , vst1q_u8 ); |
|
|
|
|
SPLIT3_KERNEL_TEMPLATE(VSplit3, schar , int8x16x3_t, vld3q_s8 , vst1q_s8 ); |
|
|
|
|
SPLIT3_KERNEL_TEMPLATE(VSplit3, ushort, uint16x8x3_t, vld3q_u16, vst1q_u16); |
|
|
|
|
SPLIT3_KERNEL_TEMPLATE(VSplit3, short , int16x8x3_t, vld3q_s16, vst1q_s16); |
|
|
|
|
SPLIT3_KERNEL_TEMPLATE(VSplit3, int , int32x4x3_t, vld3q_s32, vst1q_s32); |
|
|
|
|
SPLIT3_KERNEL_TEMPLATE(VSplit3, float , float32x4x3_t, vld3q_f32, vst1q_f32); |
|
|
|
|
SPLIT3_KERNEL_TEMPLATE(VSplit3, int64 , int64x1x3_t, vld3_s64 , vst1_s64 ); |
|
|
|
|
|
|
|
|
|
SPLIT4_KERNEL_TEMPLATE(VSplit4, uchar , uint8x16x4_t, vld4q_u8 , vst1q_u8 ); |
|
|
|
|
SPLIT4_KERNEL_TEMPLATE(VSplit4, schar , int8x16x4_t, vld4q_s8 , vst1q_s8 ); |
|
|
|
|
SPLIT4_KERNEL_TEMPLATE(VSplit4, ushort, uint16x8x4_t, vld4q_u16, vst1q_u16); |
|
|
|
|
SPLIT4_KERNEL_TEMPLATE(VSplit4, short , int16x8x4_t, vld4q_s16, vst1q_s16); |
|
|
|
|
SPLIT4_KERNEL_TEMPLATE(VSplit4, int , int32x4x4_t, vld4q_s32, vst1q_s32); |
|
|
|
|
SPLIT4_KERNEL_TEMPLATE(VSplit4, float , float32x4x4_t, vld4q_f32, vst1q_f32); |
|
|
|
|
SPLIT4_KERNEL_TEMPLATE(VSplit4, int64 , int64x1x4_t, vld4_s64 , vst1_s64 ); |
|
|
|
|
|
|
|
|
|
#elif CV_SSE2 |
|
|
|
|
|
|
|
|
|
template <typename T> |
|
|
|
|
struct VSplit2 |
|
|
|
|
{ |
|
|
|
|
VSplit2() : support(false) { } |
|
|
|
|
void operator()(const T *, T *, T *) const { } |
|
|
|
|
|
|
|
|
|
bool support; |
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
template <typename T> |
|
|
|
|
struct VSplit3 |
|
|
|
|
{ |
|
|
|
|
VSplit3() : support(false) { } |
|
|
|
|
void operator()(const T *, T *, T *, T *) const { } |
|
|
|
|
|
|
|
|
|
bool support; |
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
template <typename T> |
|
|
|
|
struct VSplit4 |
|
|
|
|
{ |
|
|
|
|
VSplit4() : support(false) { } |
|
|
|
|
void operator()(const T *, T *, T *, T *, T *) const { } |
|
|
|
|
|
|
|
|
|
bool support; |
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
#define SPLIT2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \ |
|
|
|
|
template <> \
|
|
|
|
|
struct VSplit2<data_type> \
|
|
|
|
|
{ \
|
|
|
|
|
enum \ |
|
|
|
|
{ \
|
|
|
|
|
ELEMS_IN_VEC = 16 / sizeof(data_type) \
|
|
|
|
|
}; \
|
|
|
|
|
\
|
|
|
|
|
VSplit2() \
|
|
|
|
|
{ \
|
|
|
|
|
support = true; \
|
|
|
|
|
} \
|
|
|
|
|
\
|
|
|
|
|
void operator()(const data_type * src, \
|
|
|
|
|
data_type * dst0, data_type * dst1) const \
|
|
|
|
|
{ \
|
|
|
|
|
reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \
|
|
|
|
|
reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \
|
|
|
|
|
reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
|
|
|
|
|
reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
|
|
|
|
|
\
|
|
|
|
|
_mm_deinterleave(v_src0, v_src1, v_src2, v_src3); \
|
|
|
|
|
\
|
|
|
|
|
_mm_storeu_##flavor((cast_type *)(dst0), v_src0); \
|
|
|
|
|
_mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \
|
|
|
|
|
_mm_storeu_##flavor((cast_type *)(dst1), v_src2); \
|
|
|
|
|
_mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \
|
|
|
|
|
} \
|
|
|
|
|
\
|
|
|
|
|
bool support; \
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
#define SPLIT3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \ |
|
|
|
|
template <> \
|
|
|
|
|
struct VSplit3<data_type> \
|
|
|
|
|
{ \
|
|
|
|
|
enum \ |
|
|
|
|
{ \
|
|
|
|
|
ELEMS_IN_VEC = 16 / sizeof(data_type) \
|
|
|
|
|
}; \
|
|
|
|
|
\
|
|
|
|
|
VSplit3() \
|
|
|
|
|
{ \
|
|
|
|
|
support = true; \
|
|
|
|
|
} \
|
|
|
|
|
\
|
|
|
|
|
void operator()(const data_type * src, \
|
|
|
|
|
data_type * dst0, data_type * dst1, data_type * dst2) const \
|
|
|
|
|
{ \
|
|
|
|
|
reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \
|
|
|
|
|
reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \
|
|
|
|
|
reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
|
|
|
|
|
reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
|
|
|
|
|
reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \
|
|
|
|
|
reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \
|
|
|
|
|
\
|
|
|
|
|
_mm_deinterleave(v_src0, v_src1, v_src2, \
|
|
|
|
|
v_src3, v_src4, v_src5); \
|
|
|
|
|
\
|
|
|
|
|
_mm_storeu_##flavor((cast_type *)(dst0), v_src0); \
|
|
|
|
|
_mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \
|
|
|
|
|
_mm_storeu_##flavor((cast_type *)(dst1), v_src2); \
|
|
|
|
|
_mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \
|
|
|
|
|
_mm_storeu_##flavor((cast_type *)(dst2), v_src4); \
|
|
|
|
|
_mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \
|
|
|
|
|
} \
|
|
|
|
|
\
|
|
|
|
|
bool support; \
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
#define SPLIT4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \ |
|
|
|
|
template <> \
|
|
|
|
|
struct VSplit4<data_type> \
|
|
|
|
|
{ \
|
|
|
|
|
enum \ |
|
|
|
|
{ \
|
|
|
|
|
ELEMS_IN_VEC = 16 / sizeof(data_type) \
|
|
|
|
|
}; \
|
|
|
|
|
\
|
|
|
|
|
VSplit4() \
|
|
|
|
|
{ \
|
|
|
|
|
support = true; \
|
|
|
|
|
} \
|
|
|
|
|
\
|
|
|
|
|
void operator()(const data_type * src, data_type * dst0, data_type * dst1, \
|
|
|
|
|
data_type * dst2, data_type * dst3) const \
|
|
|
|
|
{ \
|
|
|
|
|
reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \
|
|
|
|
|
reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \
|
|
|
|
|
reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
|
|
|
|
|
reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
|
|
|
|
|
reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \
|
|
|
|
|
reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \
|
|
|
|
|
reg_type v_src6 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 6)); \
|
|
|
|
|
reg_type v_src7 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 7)); \
|
|
|
|
|
\
|
|
|
|
|
_mm_deinterleave(v_src0, v_src1, v_src2, v_src3, \
|
|
|
|
|
v_src4, v_src5, v_src6, v_src7); \
|
|
|
|
|
\
|
|
|
|
|
_mm_storeu_##flavor((cast_type *)(dst0), v_src0); \
|
|
|
|
|
_mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \
|
|
|
|
|
_mm_storeu_##flavor((cast_type *)(dst1), v_src2); \
|
|
|
|
|
_mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \
|
|
|
|
|
_mm_storeu_##flavor((cast_type *)(dst2), v_src4); \
|
|
|
|
|
_mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \
|
|
|
|
|
_mm_storeu_##flavor((cast_type *)(dst3), v_src6); \
|
|
|
|
|
_mm_storeu_##flavor((cast_type *)(dst3 + ELEMS_IN_VEC), v_src7); \
|
|
|
|
|
} \
|
|
|
|
|
\
|
|
|
|
|
bool support; \
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
SPLIT2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128); |
|
|
|
|
SPLIT2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128); |
|
|
|
|
SPLIT2_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps); |
|
|
|
|
|
|
|
|
|
SPLIT3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128); |
|
|
|
|
SPLIT3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128); |
|
|
|
|
SPLIT3_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps); |
|
|
|
|
|
|
|
|
|
SPLIT4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128); |
|
|
|
|
SPLIT4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128); |
|
|
|
|
SPLIT4_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps); |
|
|
|
|
|
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
template<typename T> static void |
|
|
|
@ -154,6 +308,19 @@ split_( const T* src, T** dst, int len, int cn ) |
|
|
|
|
for( ; i < len - inc_i; i += inc_i, j += inc_j) |
|
|
|
|
vsplit(src + j, dst0 + i, dst1 + i); |
|
|
|
|
} |
|
|
|
|
#elif CV_SSE2 |
|
|
|
|
if (cn == 2) |
|
|
|
|
{ |
|
|
|
|
int inc_i = 32/sizeof(T); |
|
|
|
|
int inc_j = 2 * inc_i; |
|
|
|
|
|
|
|
|
|
VSplit2<T> vsplit; |
|
|
|
|
if (vsplit.support) |
|
|
|
|
{ |
|
|
|
|
for( ; i <= len - inc_i; i += inc_i, j += inc_j) |
|
|
|
|
vsplit(src + j, dst0 + i, dst1 + i); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
#endif |
|
|
|
|
for( ; i < len; i++, j += cn ) |
|
|
|
|
{ |
|
|
|
@ -176,6 +343,20 @@ split_( const T* src, T** dst, int len, int cn ) |
|
|
|
|
for( ; i <= len - inc_i; i += inc_i, j += inc_j) |
|
|
|
|
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i); |
|
|
|
|
} |
|
|
|
|
#elif CV_SSE2 |
|
|
|
|
if (cn == 3) |
|
|
|
|
{ |
|
|
|
|
int inc_i = 32/sizeof(T); |
|
|
|
|
int inc_j = 3 * inc_i; |
|
|
|
|
|
|
|
|
|
VSplit3<T> vsplit; |
|
|
|
|
|
|
|
|
|
if (vsplit.support) |
|
|
|
|
{ |
|
|
|
|
for( ; i <= len - inc_i; i += inc_i, j += inc_j) |
|
|
|
|
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
#endif |
|
|
|
|
for( ; i < len; i++, j += cn ) |
|
|
|
|
{ |
|
|
|
@ -199,6 +380,19 @@ split_( const T* src, T** dst, int len, int cn ) |
|
|
|
|
for( ; i <= len - inc_i; i += inc_i, j += inc_j) |
|
|
|
|
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i); |
|
|
|
|
} |
|
|
|
|
#elif CV_SSE2 |
|
|
|
|
if (cn == 4) |
|
|
|
|
{ |
|
|
|
|
int inc_i = 32/sizeof(T); |
|
|
|
|
int inc_j = 4 * inc_i; |
|
|
|
|
|
|
|
|
|
VSplit4<T> vsplit; |
|
|
|
|
if (vsplit.support) |
|
|
|
|
{ |
|
|
|
|
for( ; i <= len - inc_i; i += inc_i, j += inc_j) |
|
|
|
|
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
#endif |
|
|
|
|
for( ; i < len; i++, j += cn ) |
|
|
|
|
{ |
|
|
|
@ -265,27 +459,18 @@ template<typename T> struct VMerge4; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
MERGE2_KERNEL_TEMPLATE(VMerge2, uchar , uint8x16x2_t, vld1q_u8 , vst2q_u8 ); |
|
|
|
|
MERGE2_KERNEL_TEMPLATE(VMerge2, schar , int8x16x2_t, vld1q_s8 , vst2q_s8 ); |
|
|
|
|
MERGE2_KERNEL_TEMPLATE(VMerge2, ushort, uint16x8x2_t, vld1q_u16, vst2q_u16); |
|
|
|
|
MERGE2_KERNEL_TEMPLATE(VMerge2, short , int16x8x2_t, vld1q_s16, vst2q_s16); |
|
|
|
|
MERGE2_KERNEL_TEMPLATE(VMerge2, int , int32x4x2_t, vld1q_s32, vst2q_s32); |
|
|
|
|
MERGE2_KERNEL_TEMPLATE(VMerge2, float , float32x4x2_t, vld1q_f32, vst2q_f32); |
|
|
|
|
MERGE2_KERNEL_TEMPLATE(VMerge2, int64 , int64x1x2_t, vld1_s64 , vst2_s64 ); |
|
|
|
|
|
|
|
|
|
MERGE3_KERNEL_TEMPLATE(VMerge3, uchar , uint8x16x3_t, vld1q_u8 , vst3q_u8 ); |
|
|
|
|
MERGE3_KERNEL_TEMPLATE(VMerge3, schar , int8x16x3_t, vld1q_s8 , vst3q_s8 ); |
|
|
|
|
MERGE3_KERNEL_TEMPLATE(VMerge3, ushort, uint16x8x3_t, vld1q_u16, vst3q_u16); |
|
|
|
|
MERGE3_KERNEL_TEMPLATE(VMerge3, short , int16x8x3_t, vld1q_s16, vst3q_s16); |
|
|
|
|
MERGE3_KERNEL_TEMPLATE(VMerge3, int , int32x4x3_t, vld1q_s32, vst3q_s32); |
|
|
|
|
MERGE3_KERNEL_TEMPLATE(VMerge3, float , float32x4x3_t, vld1q_f32, vst3q_f32); |
|
|
|
|
MERGE3_KERNEL_TEMPLATE(VMerge3, int64 , int64x1x3_t, vld1_s64 , vst3_s64 ); |
|
|
|
|
|
|
|
|
|
MERGE4_KERNEL_TEMPLATE(VMerge4, uchar , uint8x16x4_t, vld1q_u8 , vst4q_u8 ); |
|
|
|
|
MERGE4_KERNEL_TEMPLATE(VMerge4, schar , int8x16x4_t, vld1q_s8 , vst4q_s8 ); |
|
|
|
|
MERGE4_KERNEL_TEMPLATE(VMerge4, ushort, uint16x8x4_t, vld1q_u16, vst4q_u16); |
|
|
|
|
MERGE4_KERNEL_TEMPLATE(VMerge4, short , int16x8x4_t, vld1q_s16, vst4q_s16); |
|
|
|
|
MERGE4_KERNEL_TEMPLATE(VMerge4, int , int32x4x4_t, vld1q_s32, vst4q_s32); |
|
|
|
|
MERGE4_KERNEL_TEMPLATE(VMerge4, float , float32x4x4_t, vld1q_f32, vst4q_f32); |
|
|
|
|
MERGE4_KERNEL_TEMPLATE(VMerge4, int64 , int64x1x4_t, vld1_s64 , vst4_s64 ); |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|