Merge pull request #14099 from seiko2plus:vsx_improvements_3

pull/14110/head
Alexander Alekhin 6 years ago
commit 6c862fae13
  1. 115
      modules/core/include/opencv2/core/hal/intrin_vsx.hpp

@ -1077,100 +1077,129 @@ inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) { return v_loa
// Gathers four ints from tab: result lane i is tab[idxvec lane i].
inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
{
    // Read the indices directly out of the vector; idx is declared exactly
    // once (a second declaration in the same scope is a redefinition).
    const int idx[4] = {
        vec_extract(idxvec.val, 0),
        vec_extract(idxvec.val, 1),
        vec_extract(idxvec.val, 2),
        vec_extract(idxvec.val, 3)
    };
    return v_int32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
}
// Gathers four unsigned values from tab: result lane i is tab[idxvec lane i].
inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
{
    // Read the indices directly out of the vector; idx is declared exactly
    // once (a second declaration in the same scope is a redefinition).
    const int idx[4] = {
        vec_extract(idxvec.val, 0),
        vec_extract(idxvec.val, 1),
        vec_extract(idxvec.val, 2),
        vec_extract(idxvec.val, 3)
    };
    return v_uint32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
}
// Gathers four floats from tab: result lane i is tab[idxvec lane i].
inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
{
    // Read the indices directly out of the vector; idx is declared exactly
    // once (a second declaration in the same scope is a redefinition).
    const int idx[4] = {
        vec_extract(idxvec.val, 0),
        vec_extract(idxvec.val, 1),
        vec_extract(idxvec.val, 2),
        vec_extract(idxvec.val, 3)
    };
    return v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
}
// Gathers two doubles from tab using only lanes 0 and 1 of idxvec:
// result lane i is tab[idxvec lane i]; lanes 2 and 3 of idxvec are not read.
inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
{
    // Only two indices are needed, so only two lanes are extracted. idx is
    // declared once; two declarations with different sizes cannot coexist.
    const int idx[2] = {
        vec_extract(idxvec.val, 0),
        vec_extract(idxvec.val, 1)
    };
    return v_float64x2(tab[idx[0]], tab[idx[1]]);
}
// Gathers four adjacent (x, y) float pairs from tab and splits them into two
// vectors: x lane i = tab[idxvec lane i], y lane i = tab[idxvec lane i + 1].
inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
{
    // NOTE(review): vec_ld_l8 is assumed to load the 8 bytes (one x,y pair)
    // at the given address into the first two lanes - confirm in vsx_utils.hpp.
    vec_float4 xy0 = vec_ld_l8(tab + vec_extract(idxvec.val, 0));
    vec_float4 xy1 = vec_ld_l8(tab + vec_extract(idxvec.val, 1));
    vec_float4 xy2 = vec_ld_l8(tab + vec_extract(idxvec.val, 2));
    vec_float4 xy3 = vec_ld_l8(tab + vec_extract(idxvec.val, 3));

    // Two rounds of merging transpose the pairs into an x row and a y row.
    vec_float4 xy02 = vec_mergeh(xy0, xy2); // x0, x2, y0, y2
    vec_float4 xy13 = vec_mergeh(xy1, xy3); // x1, x3, y1, y3
    x.val = vec_mergeh(xy02, xy13);         // x0, x1, x2, x3
    y.val = vec_mergel(xy02, xy13);         // y0, y1, y2, y3
}
// Gathers two adjacent (x, y) double pairs from tab using lanes 0 and 1 of
// idxvec: x lane i = tab[idxvec lane i], y lane i = tab[idxvec lane i + 1].
inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
{
    // NOTE(review): vsx_ld(offset, ptr) is assumed to take the offset in
    // elements (not bytes) and to allow unaligned loads - confirm in vsx_utils.hpp.
    vec_double2 xy0 = vsx_ld(vec_extract(idxvec.val, 0), tab); // x0, y0
    vec_double2 xy1 = vsx_ld(vec_extract(idxvec.val, 1), tab); // x1, y1
    x.val = vec_mergeh(xy0, xy1);                              // x0, x1
    y.val = vec_mergel(xy0, xy1);                              // y0, y1
}
// Swaps the two middle lanes of every group of four 8-bit lanes:
// output lane order is 0,2,1,3, 4,6,5,7, 8,10,9,11, 12,14,13,15.
inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
{
    // Each entry names the source byte for that output byte; both vec_perm
    // operands are the same vector, so indices 0..15 address vec itself.
    static const vec_uchar16 perm = {0, 2, 1, 3, 4, 6, 5, 7, 8, 10, 9, 11, 12, 14, 13, 15};
    return v_int8x16(vec_perm(vec.val, vec.val, perm));
}
// Unsigned 8-bit overload: reinterprets the lanes as signed, applies the
// signed v_interleave_pairs, and reinterprets the result back.
inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec)
{ return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
// Interleaves the two 4-lane halves of every group of eight 8-bit lanes:
// output lane order is 0,4,1,5,2,6,3,7, 8,12,9,13,10,14,11,15.
inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
{
    // Each entry names the source byte for that output byte; both vec_perm
    // operands are the same vector, so indices 0..15 address vec itself.
    static const vec_uchar16 perm = {0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15};
    return v_int8x16(vec_perm(vec.val, vec.val, perm));
}
// Unsigned 8-bit overload: reinterprets the lanes as signed, applies the
// signed v_interleave_quads, and reinterprets the result back.
inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec)
{ return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
// Swaps the two middle lanes of every group of four 16-bit lanes:
// output lane order is 0,2,1,3, 4,6,5,7.
inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
{
    // Byte-level permutation: each 16-bit lane is moved as a pair of
    // consecutive byte indices.
    static const vec_uchar16 perm = {0,1, 4,5, 2,3, 6,7, 8,9, 12,13, 10,11, 14,15};
    return v_int16x8(vec_perm(vec.val, vec.val, perm));
}
// Unsigned 16-bit overload: reinterprets the lanes as signed, applies the
// signed v_interleave_pairs, and reinterprets the result back.
inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec)
{ return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
// Interleaves the low and high halves of the eight 16-bit lanes:
// output lane order is 0,4,1,5,2,6,3,7.
inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
{
    // Byte-level permutation: each 16-bit lane is moved as a pair of
    // consecutive byte indices.
    static const vec_uchar16 perm = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15};
    return v_int16x8(vec_perm(vec.val, vec.val, perm));
}
// Unsigned 16-bit overload: reinterprets the lanes as signed, applies the
// signed v_interleave_quads, and reinterprets the result back.
inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec)
{ return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
// Swaps the two middle 32-bit lanes: output lane order is 0,2,1,3.
inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
{
    // Byte-level permutation: each 32-bit lane is moved as a run of four
    // consecutive byte indices.
    static const vec_uchar16 perm = {0,1,2,3, 8,9,10,11, 4,5,6,7, 12,13,14,15};
    return v_int32x4(vec_perm(vec.val, vec.val, perm));
}
// Unsigned 32-bit overload: reinterprets the lanes as signed, applies the
// signed v_interleave_pairs, and reinterprets the result back.
inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec)
{ return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
// float32 overload: same lane shuffle, routed through the signed 32-bit
// implementation via bit reinterpretation (no value conversion call is made).
inline v_float32x4 v_interleave_pairs(const v_float32x4& vec)
{ return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
// Packs the first three lanes of each group of four 8-bit lanes together:
// output lanes are 0,1,2, 4,5,6, 8,9,10, 12,13,14, followed by lane 15
// repeated four times as tail filler (source lanes 3, 7 and 11 are dropped).
inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
{
    // Single in-register permute; each entry names the source byte for that
    // output byte, and both vec_perm operands are the same vector.
    static const vec_uchar16 perm = {0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15, 15, 15, 15};
    return v_int8x16(vec_perm(vec.val, vec.val, perm));
}
// Unsigned 8-bit overload: reinterprets the lanes as signed, applies the
// signed v_pack_triplets, and reinterprets the result back.
inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec)
{ return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
// Packs the first three lanes of each group of four 16-bit lanes together:
// output lanes are 0,1,2, 4,5,6, then lane 7 twice as tail filler
// (source lane 3 is dropped).
inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
{
    // Byte-level permutation: each 16-bit lane is moved as a pair of
    // consecutive byte indices.
    static const vec_uchar16 perm = {0,1, 2,3, 4,5, 8,9, 10,11, 12,13, 14,15, 14,15};
    return v_int16x8(vec_perm(vec.val, vec.val, perm));
}
// Unsigned 16-bit overload: reinterprets the lanes as signed, applies the
// signed v_pack_triplets, and reinterprets the result back.
inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec)
{ return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
// 32-bit overloads: the vector is returned unchanged, as no lane needs to
// move for a four-lane vector.
inline v_int32x4 v_pack_triplets(const v_int32x4& vec)
{ return vec; }
inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec)
{ return vec; }
inline v_float32x4 v_pack_triplets(const v_float32x4& vec)
{ return vec; }
/////// FP16 support ////////

Loading…
Cancel
Save