|
|
|
@@ -1661,6 +1661,198 @@ YUV2PACKEDWRAPPER(yuv2, 422, yuyv422, AV_PIX_FMT_YUYV422)
YUV2PACKEDWRAPPER(yuv2, 422, yvyu422, AV_PIX_FMT_YVYU422)
YUV2PACKEDWRAPPER(yuv2, 422, uyvy422, AV_PIX_FMT_UYVY422)
|
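/*
 * Fast bilinear horizontal luma scaling; each iteration produces 16 output
 * pixels. The 16.16 fixed-point source positions are computed per lane:
 * their integer parts (relative to xpos >> 16) form a vec_perm gather mask,
 * and the top 7 bits of their fractional parts become the blend weights,
 * giving dst[i] = (src[xx] << 7) + (src[xx + 1] - src[xx]) * alpha.
 */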
static void hyscale_fast_vsx(SwsContext *c, int16_t *dst, int dstWidth,
                             const uint8_t *src, int srcW, int xInc)
{
    int i;
    unsigned int xpos = 0, xx;
    vector uint8_t vin, vin2, vperm;
    vector int8_t vmul, valpha;
    vector int16_t vtmp, vtmp2, vtmp3, vtmp4;
    vector uint16_t vd_l, vd_r, vcoord16[2];
    vector uint32_t vcoord[4];
    const vector uint32_t vadd = (vector uint32_t) {
        0,
        xInc * 1,
        xInc * 2,
        xInc * 3,
    };
    const vector uint16_t vadd16 = (vector uint16_t) { // Modulo math
        0,
        xInc * 1,
        xInc * 2,
        xInc * 3,
        xInc * 4,
        xInc * 5,
        xInc * 6,
        xInc * 7,
    };
    const vector uint32_t vshift16 = vec_splats((uint32_t) 16);
    const vector uint16_t vshift9 = vec_splat_u16(9);
    const vector uint8_t vzero = vec_splat_u8(0);
    const vector uint16_t vshift = vec_splat_u16(7);

    for (i = 0; i < dstWidth; i += 16) {
        vcoord16[0] = vec_splats((uint16_t) xpos);
        vcoord16[1] = vec_splats((uint16_t) (xpos + xInc * 8));

        vcoord16[0] = vec_add(vcoord16[0], vadd16);
        vcoord16[1] = vec_add(vcoord16[1], vadd16);

        vcoord16[0] = vec_sr(vcoord16[0], vshift9);
        vcoord16[1] = vec_sr(vcoord16[1], vshift9);
        valpha = (vector int8_t) vec_pack(vcoord16[0], vcoord16[1]);

        xx = xpos >> 16;
        vin = vec_vsx_ld(0, &src[xx]);

        vcoord[0] = vec_splats(xpos & 0xffff);
        vcoord[1] = vec_splats((xpos & 0xffff) + xInc * 4);
        vcoord[2] = vec_splats((xpos & 0xffff) + xInc * 8);
        vcoord[3] = vec_splats((xpos & 0xffff) + xInc * 12);

        vcoord[0] = vec_add(vcoord[0], vadd);
        vcoord[1] = vec_add(vcoord[1], vadd);
        vcoord[2] = vec_add(vcoord[2], vadd);
        vcoord[3] = vec_add(vcoord[3], vadd);

        vcoord[0] = vec_sr(vcoord[0], vshift16);
        vcoord[1] = vec_sr(vcoord[1], vshift16);
        vcoord[2] = vec_sr(vcoord[2], vshift16);
        vcoord[3] = vec_sr(vcoord[3], vshift16);

        vcoord16[0] = vec_pack(vcoord[0], vcoord[1]);
        vcoord16[1] = vec_pack(vcoord[2], vcoord[3]);
        vperm = vec_pack(vcoord16[0], vcoord16[1]);

        vin = vec_perm(vin, vin, vperm);

        vin2 = vec_vsx_ld(1, &src[xx]);
        vin2 = vec_perm(vin2, vin2, vperm);

        vmul = (vector int8_t) vec_sub(vin2, vin);
        vtmp = vec_mule(vmul, valpha);
        vtmp2 = vec_mulo(vmul, valpha);
        vtmp3 = vec_mergeh(vtmp, vtmp2);
        vtmp4 = vec_mergel(vtmp, vtmp2);

        vd_l = (vector uint16_t) vec_mergeh(vin, vzero);
        vd_r = (vector uint16_t) vec_mergel(vin, vzero);
        vd_l = vec_sl(vd_l, vshift);
        vd_r = vec_sl(vd_r, vshift);

        vd_l = vec_add(vd_l, (vector uint16_t) vtmp3);
        vd_r = vec_add(vd_r, (vector uint16_t) vtmp4);

        vec_st((vector int16_t) vd_l, 0, &dst[i]);
        vec_st((vector int16_t) vd_r, 0, &dst[i + 8]);

        xpos += xInc * 16;
    }
    for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
        dst[i] = src[srcW-1]*128;
}
|
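/*
 * HCSCALE runs one gather-and-interpolate pass over a single chroma plane,
 * reusing the vperm and valpha vectors computed in the enclosing loop; the
 * blend is src[xx] * (alpha ^ 127) + src[xx + 1] * alpha.
 */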
#define HCSCALE(in, out) \
    vin = vec_vsx_ld(0, &in[xx]); \
    vin = vec_perm(vin, vin, vperm); \
\
    vin2 = vec_vsx_ld(1, &in[xx]); \
    vin2 = vec_perm(vin2, vin2, vperm); \
\
    vtmp = vec_mule(vin, valphaxor); \
    vtmp2 = vec_mulo(vin, valphaxor); \
    vtmp3 = vec_mergeh(vtmp, vtmp2); \
    vtmp4 = vec_mergel(vtmp, vtmp2); \
\
    vtmp = vec_mule(vin2, valpha); \
    vtmp2 = vec_mulo(vin2, valpha); \
    vd_l = vec_mergeh(vtmp, vtmp2); \
    vd_r = vec_mergel(vtmp, vtmp2); \
\
    vd_l = vec_add(vd_l, vtmp3); \
    vd_r = vec_add(vd_r, vtmp4); \
\
    vec_st((vector int16_t) vd_l, 0, &out[i]); \
    vec_st((vector int16_t) vd_r, 0, &out[i + 8])
|
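/* Chroma counterpart of hyscale_fast_vsx: identical position and weight
 * setup, with HCSCALE applied to both chroma planes per iteration. */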
static void hcscale_fast_vsx(SwsContext *c, int16_t *dst1, int16_t *dst2,
                             int dstWidth, const uint8_t *src1,
                             const uint8_t *src2, int srcW, int xInc)
{
    int i;
    unsigned int xpos = 0, xx;
    vector uint8_t vin, vin2, vperm;
    vector uint8_t valpha, valphaxor;
    vector uint16_t vtmp, vtmp2, vtmp3, vtmp4;
    vector uint16_t vd_l, vd_r, vcoord16[2];
    vector uint32_t vcoord[4];
    const vector uint8_t vxor = vec_splats((uint8_t) 127);
    const vector uint32_t vadd = (vector uint32_t) {
        0,
        xInc * 1,
        xInc * 2,
        xInc * 3,
    };
    const vector uint16_t vadd16 = (vector uint16_t) { // Modulo math
        0,
        xInc * 1,
        xInc * 2,
        xInc * 3,
        xInc * 4,
        xInc * 5,
        xInc * 6,
        xInc * 7,
    };
    const vector uint32_t vshift16 = vec_splats((uint32_t) 16);
    const vector uint16_t vshift9 = vec_splat_u16(9);

    for (i = 0; i < dstWidth; i += 16) {
        vcoord16[0] = vec_splats((uint16_t) xpos);
        vcoord16[1] = vec_splats((uint16_t) (xpos + xInc * 8));

        vcoord16[0] = vec_add(vcoord16[0], vadd16);
        vcoord16[1] = vec_add(vcoord16[1], vadd16);

        vcoord16[0] = vec_sr(vcoord16[0], vshift9);
        vcoord16[1] = vec_sr(vcoord16[1], vshift9);
        valpha = vec_pack(vcoord16[0], vcoord16[1]);
        valphaxor = vec_xor(valpha, vxor);

        xx = xpos >> 16;

        vcoord[0] = vec_splats(xpos & 0xffff);
        vcoord[1] = vec_splats((xpos & 0xffff) + xInc * 4);
        vcoord[2] = vec_splats((xpos & 0xffff) + xInc * 8);
        vcoord[3] = vec_splats((xpos & 0xffff) + xInc * 12);

        vcoord[0] = vec_add(vcoord[0], vadd);
        vcoord[1] = vec_add(vcoord[1], vadd);
        vcoord[2] = vec_add(vcoord[2], vadd);
        vcoord[3] = vec_add(vcoord[3], vadd);

        vcoord[0] = vec_sr(vcoord[0], vshift16);
        vcoord[1] = vec_sr(vcoord[1], vshift16);
        vcoord[2] = vec_sr(vcoord[2], vshift16);
        vcoord[3] = vec_sr(vcoord[3], vshift16);

        vcoord16[0] = vec_pack(vcoord[0], vcoord[1]);
        vcoord16[1] = vec_pack(vcoord[2], vcoord[3]);
        vperm = vec_pack(vcoord16[0], vcoord16[1]);

        HCSCALE(src1, dst1);
        HCSCALE(src2, dst2);

        xpos += xInc * 16;
    }
    for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
        dst1[i] = src1[srcW-1]*128;
        dst2[i] = src2[srcW-1]*128;
    }
}
|
#undef HCSCALE

#endif /* !HAVE_BIGENDIAN */

#endif /* HAVE_VSX */
@@ -1677,6 +1869,10 @@ av_cold void ff_sws_init_swscale_vsx(SwsContext *c)
#if !HAVE_BIGENDIAN
    if (c->srcBpc == 8 && c->dstBpc <= 14) {
        c->hyScale = c->hcScale = hScale_real_vsx;
        if (c->flags & SWS_FAST_BILINEAR && c->dstW >= c->srcW && c->chrDstW >= c->chrSrcW) {
            c->hyscale_fast = hyscale_fast_vsx;
            c->hcscale_fast = hcscale_fast_vsx;
        }
    }

    if (!is16BPS(dstFormat) && !isNBPS(dstFormat) &&
        dstFormat != AV_PIX_FMT_NV12 && dstFormat != AV_PIX_FMT_NV21 &&