@@ -25,11 +25,13 @@ static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height)
 {
-    int32_t ht_cnt;
+    int32_t ht_cnt = height >> 2;
+    int res = (height & 0x03);
     v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+    v8u16 zero = { 0 };
     v8u16 sad = { 0 };
 
-    for (ht_cnt = (height >> 2); ht_cnt--;) {
+    for (; ht_cnt--;) {
         LD_UB4(src, src_stride, src0, src1, src2, src3);
         src += (4 * src_stride);
         LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
@@ -39,6 +41,16 @@ static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride,
                    src0, src1, ref0, ref1);
         sad += SAD_UB2_UH(src0, src1, ref0, ref1);
     }
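+    /* handle the height % 4 leftover rows one at a time; ilvr_d with
+     * zero clears the upper half so only the 8 valid bytes accumulate */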
+    for (; res--;) {
+        v16u8 diff;
+        src0 = LD_UB(src);
+        ref0 = LD_UB(ref);
+        src += src_stride;
+        ref += ref_stride;
+        diff = __msa_asub_u_b((v16u8) src0, (v16u8) ref0);
+        diff = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) diff);
+        sad += __msa_hadd_u_h((v16u8) diff, (v16u8) diff);
+    }
 
     return (HADD_UH_U32(sad));
 }
@@ -47,11 +59,12 @@ static uint32_t sad_16width_msa(const uint8_t *src, int32_t src_stride,
                                 const uint8_t *ref, int32_t ref_stride,
                                 int32_t height)
 {
-    int32_t ht_cnt;
+    int32_t ht_cnt = height >> 2;
+    int res = (height & 0x03);
     v16u8 src0, src1, ref0, ref1;
     v8u16 sad = { 0 };
 
-    for (ht_cnt = (height >> 2); ht_cnt--;) {
+    for (; ht_cnt--;) {
         LD_UB2(src, src_stride, src0, src1);
         src += (2 * src_stride);
         LD_UB2(ref, ref_stride, ref0, ref1);
@@ -64,7 +77,15 @@ static uint32_t sad_16width_msa(const uint8_t *src, int32_t src_stride,
         ref += (2 * ref_stride);
         sad += SAD_UB2_UH(src0, src1, ref0, ref1);
     }
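+    /* handle the height % 4 leftover rows, one full 16-pixel row per
+     * iteration */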
+    for (; res > 0; res--) {
+        v16u8 diff;
+        src0 = LD_UB(src);
+        ref0 = LD_UB(ref);
+        src += src_stride;
+        ref += ref_stride;
+        diff = __msa_asub_u_b((v16u8) src0, (v16u8) ref0);
+        sad += __msa_hadd_u_h((v16u8) diff, (v16u8) diff);
+    }
 
     return (HADD_UH_U32(sad));
 }
@@ -74,12 +95,14 @@ static uint32_t sad_horiz_bilinear_filter_8width_msa(const uint8_t *src,
                                                      int32_t ref_stride,
                                                      int32_t height)
 {
-    int32_t ht_cnt;
+    int32_t ht_cnt = height >> 3;
+    int32_t res = height & 0x07;
     v16u8 src0, src1, src2, src3, comp0, comp1;
     v16u8 ref0, ref1, ref2, ref3, ref4, ref5;
+    v8u16 zero = { 0 };
     v8u16 sad = { 0 };
 
-    for (ht_cnt = (height >> 3); ht_cnt--;) {
+    for (; ht_cnt--;) {
         LD_UB4(src, src_stride, src0, src1, src2, src3);
         src += (4 * src_stride);
         LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
@@ -107,6 +130,18 @@ static uint32_t sad_horiz_bilinear_filter_8width_msa(const uint8_t *src,
         sad += SAD_UB2_UH(src0, src1, comp0, comp1);
     }
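+    /* leftover rows: average each pixel with its right neighbour, then
+     * accumulate the SAD of the 8 valid bytes */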
+    for (; res--;) {
+        v16u8 diff;
+        src0 = LD_UB(src);
+        ref0 = LD_UB(ref);
+        ref1 = LD_UB(ref + 1);
+        src += src_stride;
+        ref += ref_stride;
+        comp0 = (v16u8) __msa_aver_u_b((v16u8) ref0, (v16u8) ref1);
+        diff = __msa_asub_u_b((v16u8) src0, (v16u8) comp0);
+        diff = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) diff);
+        sad += __msa_hadd_u_h((v16u8) diff, (v16u8) diff);
+    }
 
     return (HADD_UH_U32(sad));
 }
@@ -116,12 +151,13 @@ static uint32_t sad_horiz_bilinear_filter_16width_msa(const uint8_t *src,
                                                       int32_t ref_stride,
                                                       int32_t height)
 {
-    int32_t ht_cnt;
+    int32_t ht_cnt = height >> 3;
+    int32_t res = height & 0x07;
     v16u8 src0, src1, src2, src3, comp0, comp1;
     v16u8 ref00, ref10, ref20, ref30, ref01, ref11, ref21, ref31;
     v8u16 sad = { 0 };
 
-    for (ht_cnt = (height >> 3); ht_cnt--;) {
+    for (; ht_cnt--;) {
         LD_UB4(src, src_stride, src0, src1, src2, src3);
         src += (4 * src_stride);
         LD_UB4(ref, ref_stride, ref00, ref10, ref20, ref30);
@@ -145,6 +181,17 @@ static uint32_t sad_horiz_bilinear_filter_16width_msa(const uint8_t *src,
         sad += SAD_UB2_UH(src2, src3, comp0, comp1);
     }
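+    /* leftover rows: average each pixel with its right neighbour and
+     * accumulate a full 16-pixel SAD per row */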
+    for (; res--;) {
+        v16u8 diff;
+        src0 = LD_UB(src);
+        ref00 = LD_UB(ref);
+        ref01 = LD_UB(ref + 1);
+        src += src_stride;
+        ref += ref_stride;
+        comp0 = (v16u8) __msa_aver_u_b((v16u8) ref00, (v16u8) ref01);
+        diff = __msa_asub_u_b((v16u8) src0, (v16u8) comp0);
+        sad += __msa_hadd_u_h((v16u8) diff, (v16u8) diff);
+    }
 
     return (HADD_UH_U32(sad));
 }
@@ -154,12 +201,14 @@ static uint32_t sad_vert_bilinear_filter_8width_msa(const uint8_t *src,
                                                     int32_t ref_stride,
                                                     int32_t height)
 {
-    int32_t ht_cnt;
+    int32_t ht_cnt = height >> 3;
+    int32_t res = height & 0x07;
     v16u8 src0, src1, src2, src3, comp0, comp1;
     v16u8 ref0, ref1, ref2, ref3, ref4;
+    v8u16 zero = { 0 };
     v8u16 sad = { 0 };
 
-    for (ht_cnt = (height >> 3); ht_cnt--;) {
+    for (; ht_cnt--;) {
         LD_UB4(src, src_stride, src0, src1, src2, src3);
         src += (4 * src_stride);
         LD_UB5(ref, ref_stride, ref0, ref1, ref2, ref3, ref4);
@@ -183,6 +232,17 @@ static uint32_t sad_vert_bilinear_filter_8width_msa(const uint8_t *src,
         sad += SAD_UB2_UH(src0, src1, comp0, comp1);
     }
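+    /* leftover rows: average each pixel with the pixel one row below,
+     * then accumulate the SAD of the 8 valid bytes */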
+    for (; res--;) {
+        v16u8 diff;
+        src0 = LD_UB(src);
+        LD_UB2(ref, ref_stride, ref0, ref1);
+        src += src_stride;
+        ref += ref_stride;
+        comp0 = (v16u8) __msa_aver_u_b((v16u8) ref0, (v16u8) ref1);
+        diff = __msa_asub_u_b((v16u8) src0, (v16u8) comp0);
+        diff = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) diff);
+        sad += __msa_hadd_u_h((v16u8) diff, (v16u8) diff);
+    }
 
     return (HADD_UH_U32(sad));
 }
@@ -192,12 +252,13 @@ static uint32_t sad_vert_bilinear_filter_16width_msa(const uint8_t *src,
                                                      int32_t ref_stride,
                                                      int32_t height)
 {
-    int32_t ht_cnt;
+    int32_t ht_cnt = height >> 3;
+    int32_t res = height & 0x07;
     v16u8 src0, src1, src2, src3, comp0, comp1;
     v16u8 ref0, ref1, ref2, ref3, ref4;
     v8u16 sad = { 0 };
 
-    for (ht_cnt = (height >> 3); ht_cnt--;) {
+    for (; ht_cnt--;) {
         LD_UB5(ref, ref_stride, ref4, ref0, ref1, ref2, ref3);
         ref += (5 * ref_stride);
         LD_UB4(src, src_stride, src0, src1, src2, src3);
@@ -221,6 +282,16 @@ static uint32_t sad_vert_bilinear_filter_16width_msa(const uint8_t *src,
         sad += SAD_UB2_UH(src2, src3, comp0, comp1);
     }
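+    /* leftover rows: average each pixel with the pixel one row below */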
+    for (; res--;) {
+        v16u8 diff;
+        src0 = LD_UB(src);
+        LD_UB2(ref, ref_stride, ref0, ref1);
+        src += src_stride;
+        ref += ref_stride;
+        comp0 = (v16u8) __msa_aver_u_b((v16u8) ref0, (v16u8) ref1);
+        diff = __msa_asub_u_b((v16u8) src0, (v16u8) comp0);
+        sad += __msa_hadd_u_h((v16u8) diff, (v16u8) diff);
+    }
 
     return (HADD_UH_U32(sad));
 }
@@ -230,11 +301,13 @@ static uint32_t sad_hv_bilinear_filter_8width_msa(const uint8_t *src,
                                                   int32_t ref_stride,
                                                   int32_t height)
 {
-    int32_t ht_cnt;
+    int32_t ht_cnt = height >> 2;
+    int32_t res = height & 0x03;
     v16u8 src0, src1, src2, src3, temp0, temp1, diff;
     v16u8 ref0, ref1, ref2, ref3, ref4;
     v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
     v8u16 comp0, comp1, comp2, comp3;
+    v8u16 zero = { 0 };
     v8u16 sad = { 0 };
 
     for (ht_cnt = (height >> 2); ht_cnt--;) {
@@ -277,6 +350,22 @@ static uint32_t sad_hv_bilinear_filter_8width_msa(const uint8_t *src,
         sad += __msa_hadd_u_h(diff, diff);
     }
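+    /* leftover rows: build the rounded 2x2 average from horizontal
+     * pair sums of two adjacent reference rows */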
+    for (; res--;) {
+        src0 = LD_UB(src);
+        LD_UB2(ref, ref_stride, ref0, ref1);
+        temp0 = (v16u8) __msa_vshf_b(mask, (v16i8) ref0, (v16i8) ref0);
+        temp1 = (v16u8) __msa_vshf_b(mask, (v16i8) ref1, (v16i8) ref1);
+        src += src_stride;
+        ref += ref_stride;
+        comp0 = __msa_hadd_u_h(temp0, temp0);
+        comp2 = __msa_hadd_u_h(temp1, temp1);
+        comp2 += comp0;
+        comp2 = (v8u16) __msa_srari_h((v8i16) comp2, 2);
+        comp0 = (v8u16) __msa_pckev_b((v16i8) zero, (v16i8) comp2);
+        diff = __msa_asub_u_b(src0, (v16u8) comp0);
+        diff = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) diff);
+        sad += __msa_hadd_u_h(diff, diff);
+    }
 
     return (HADD_UH_U32(sad));
 }
@@ -286,14 +375,15 @@ static uint32_t sad_hv_bilinear_filter_16width_msa(const uint8_t *src,
                                                    int32_t ref_stride,
                                                    int32_t height)
 {
-    int32_t ht_cnt;
+    int32_t ht_cnt = height >> 3;
+    int32_t res = height & 0x07;
     v16u8 src0, src1, src2, src3, comp, diff;
     v16u8 temp0, temp1, temp2, temp3;
     v16u8 ref00, ref01, ref02, ref03, ref04, ref10, ref11, ref12, ref13, ref14;
     v8u16 comp0, comp1, comp2, comp3;
     v8u16 sad = { 0 };
 
-    for (ht_cnt = (height >> 3); ht_cnt--;) {
+    for (; ht_cnt--;) {
         LD_UB4(src, src_stride, src0, src1, src2, src3);
         src += (4 * src_stride);
         LD_UB5(ref, ref_stride, ref04, ref00, ref01, ref02, ref03);
@@ -389,6 +479,25 @@ static uint32_t sad_hv_bilinear_filter_16width_msa(const uint8_t *src,
         diff = __msa_asub_u_b(src3, comp);
         sad += __msa_hadd_u_h(diff, diff);
     }
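+    /* leftover rows: rounded 2x2 average from vertical pair sums of
+     * the reference rows at ref and ref + 1 */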
+    for (; res--;) {
+        src0 = LD_UB(src);
+        LD_UB2(ref, ref_stride, ref00, ref10);
+        LD_UB2(ref + 1, ref_stride, ref01, ref11);
+        src += src_stride;
+        ref += ref_stride;
+        ILVRL_B2_UB(ref10, ref00, temp0, temp1);
+        ILVRL_B2_UB(ref11, ref01, temp2, temp3);
+        comp0 = __msa_hadd_u_h(temp0, temp0);
+        comp1 = __msa_hadd_u_h(temp1, temp1);
+        comp2 = __msa_hadd_u_h(temp2, temp2);
+        comp3 = __msa_hadd_u_h(temp3, temp3);
+        comp2 += comp0;
+        comp3 += comp1;
+        SRARI_H2_UH(comp2, comp3, 2);
+        comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
+        diff = __msa_asub_u_b(src0, comp);
+        sad += __msa_hadd_u_h(diff, diff);
+    }
 
     return (HADD_UH_U32(sad));
 }
@@ -407,15 +516,17 @@ static uint32_t sse_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *ref_ptr, int32_t ref_stride,
                                int32_t height)
 {
-    int32_t ht_cnt;
+    int32_t ht_cnt = height >> 2;
+    int32_t res = height & 0x03;
     uint32_t sse;
     uint32_t src0, src1, src2, src3;
    uint32_t ref0, ref1, ref2, ref3;
-    v16u8 src = { 0 };
-    v16u8 ref = { 0 };
-    v4i32 var = { 0 };
+    v16u8 src = { 0 };
+    v16u8 ref = { 0 };
+    v16u8 zero = { 0 };
+    v4i32 var = { 0 };
 
-    for (ht_cnt = (height >> 2); ht_cnt--;) {
+    for (; ht_cnt--;) {
         LW4(src_ptr, src_stride, src0, src1, src2, src3);
         src_ptr += (4 * src_stride);
         LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
@@ -426,6 +537,20 @@ static uint32_t sse_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
         CALC_MSE_B(src, ref, var);
     }
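+    /* leftover rows: insert one 4-byte row into lane 0, interleave src
+     * with ref and accumulate the squared differences */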
+    for (; res--;) {
+        v16u8 reg0;
+        v8i16 tmp0;
+        src0 = LW(src_ptr);
+        ref0 = LW(ref_ptr);
+        src_ptr += src_stride;
+        ref_ptr += ref_stride;
+        src = (v16u8) __msa_insert_w((v4i32) src, 0, src0);
+        ref = (v16u8) __msa_insert_w((v4i32) ref, 0, ref0);
+        reg0 = (v16u8) __msa_ilvr_b((v16i8) src, (v16i8) ref);
+        reg0 = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) reg0);
+        tmp0 = (v8i16) __msa_hsub_u_h((v16u8) reg0, (v16u8) reg0);
+        var = (v4i32) __msa_dpadd_s_w((v4i32) var, (v8i16) tmp0, (v8i16) tmp0);
+    }
 
     sse = HADD_SW_S32(var);
     return sse;
@@ -435,13 +560,14 @@ static uint32_t sse_8width_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height)
 {
-    int32_t ht_cnt;
+    int32_t ht_cnt = height >> 2;
+    int32_t res = height & 0x03;
     uint32_t sse;
     v16u8 src0, src1, src2, src3;
     v16u8 ref0, ref1, ref2, ref3;
     v4i32 var = { 0 };
 
-    for (ht_cnt = (height >> 2); ht_cnt--;) {
+    for (; ht_cnt--;) {
         LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
         src_ptr += (4 * src_stride);
         LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
@@ -453,6 +579,16 @@ static uint32_t sse_8width_msa(const uint8_t *src_ptr, int32_t src_stride,
         CALC_MSE_B(src1, ref1, var);
     }
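+    /* leftover rows: interleave the 8 valid src/ref bytes and
+     * accumulate the squared differences */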
+    for (; res--;) {
+        v8i16 tmp0;
+        src0 = LD_UB(src_ptr);
+        ref0 = LD_UB(ref_ptr);
+        src_ptr += src_stride;
+        ref_ptr += ref_stride;
+        ref1 = (v16u8) __msa_ilvr_b((v16i8) src0, (v16i8) ref0);
+        tmp0 = (v8i16) __msa_hsub_u_h((v16u8) ref1, (v16u8) ref1);
+        var = (v4i32) __msa_dpadd_s_w((v4i32) var, (v8i16) tmp0, (v8i16) tmp0);
+    }
 
     sse = HADD_SW_S32(var);
     return sse;
@@ -462,12 +598,13 @@ static uint32_t sse_16width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                 const uint8_t *ref_ptr, int32_t ref_stride,
                                 int32_t height)
 {
-    int32_t ht_cnt;
+    int32_t ht_cnt = height >> 2;
+    int32_t res = height & 0x03;
     uint32_t sse;
     v16u8 src, ref;
     v4i32 var = { 0 };
 
-    for (ht_cnt = (height >> 2); ht_cnt--;) {
+    for (; ht_cnt--;) {
         src = LD_UB(src_ptr);
         src_ptr += src_stride;
         ref = LD_UB(ref_ptr);
@@ -493,6 +630,14 @@ static uint32_t sse_16width_msa(const uint8_t *src_ptr, int32_t src_stride,
         CALC_MSE_B(src, ref, var);
     }
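+    /* leftover rows reuse CALC_MSE_B on one 16-byte row at a time */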
+    for (; res--;) {
+        src = LD_UB(src_ptr);
+        src_ptr += src_stride;
+        ref = LD_UB(ref_ptr);
+        ref_ptr += ref_stride;
+        CALC_MSE_B(src, ref, var);
+    }
 
     sse = HADD_SW_S32(var);
     return sse;
@@ -544,7 +689,7 @@ static int32_t hadamard_diff_8x8_msa(const uint8_t *src, int32_t src_stride,
 }
 
 static int32_t hadamard_intra_8x8_msa(const uint8_t *src, int32_t src_stride,
-                                      const uint8_t *ref, int32_t ref_stride)
+                                      const uint8_t *dumy, int32_t ref_stride)
 {
     int32_t sum_res = 0;
     v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
@@ -659,10 +804,10 @@ int ff_hadamard8_diff8x8_msa(MpegEncContext *s, const uint8_t *dst, const uint8_t *src,
     return hadamard_diff_8x8_msa(src, stride, dst, stride);
 }
 
-int ff_hadamard8_intra8x8_msa(MpegEncContext *s, const uint8_t *dst, const uint8_t *src,
+int ff_hadamard8_intra8x8_msa(MpegEncContext *s, const uint8_t *src, const uint8_t *dummy,
                               ptrdiff_t stride, int h)
 {
-    return hadamard_intra_8x8_msa(src, stride, dst, stride);
+    return hadamard_intra_8x8_msa(src, stride, dummy, stride);
 }
 
 /* Hadamard Transform functions */