@@ -526,8 +526,7 @@ static void avc_chroma_vt_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
     }
 }
 
-static void avc_chroma_hv_2x2_msa(uint8_t *src, int32_t src_stride,
-                                  uint8_t *dst, int32_t dst_stride,
+static void avc_chroma_hv_2x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                   uint32_t coef_hor0, uint32_t coef_hor1,
                                   uint32_t coef_ver0, uint32_t coef_ver1)
 {
@@ -544,7 +543,7 @@ static void avc_chroma_hv_2x2_msa(uint8_t *src, int32_t src_stride,
 
     mask = LD_SB(&chroma_mask_arr[48]);
 
-    LD_UB3(src, src_stride, src0, src1, src2);
+    LD_UB3(src, stride, src0, src1, src2);
     VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
     DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
     MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
@@ -558,12 +557,11 @@ static void avc_chroma_hv_2x2_msa(uint8_t *src, int32_t src_stride,
     out1 = __msa_copy_u_h(res_vert, 1);
 
     SH(out0, dst);
-    dst += dst_stride;
+    dst += stride;
     SH(out1, dst);
 }
 
-static void avc_chroma_hv_2x4_msa(uint8_t *src, int32_t src_stride,
-                                  uint8_t *dst, int32_t dst_stride,
+static void avc_chroma_hv_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                   uint32_t coef_hor0, uint32_t coef_hor1,
                                   uint32_t coef_ver0, uint32_t coef_ver1)
 {
@@ -580,7 +578,7 @@ static void avc_chroma_hv_2x4_msa(uint8_t *src, int32_t src_stride,
 
     mask = LD_SB(&chroma_mask_arr[48]);
 
-    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+    LD_UB5(src, stride, src0, src1, src2, src3, src4);
 
     VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
     VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
@@ -588,86 +586,30 @@ static void avc_chroma_hv_2x4_msa(uint8_t *src, int32_t src_stride,
     DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
     MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
 
     res_vt0 += res_vt1;
     res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
     res_vt0 = __msa_sat_u_h(res_vt0, 7);
 
     res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
 
-    ST2x4_UB(res, 0, dst, dst_stride);
-}
-
-static void avc_chroma_hv_2x8_msa(uint8_t *src, int32_t src_stride,
-                                  uint8_t *dst, int32_t dst_stride,
-                                  uint32_t coef_hor0, uint32_t coef_hor1,
-                                  uint32_t coef_ver0, uint32_t coef_ver1)
-{
-    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
-    v16u8 tmp0, tmp1, tmp2, tmp3;
-    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
-    v8i16 res;
-    v16i8 mask;
-    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
-    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
-    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
-    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
-    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
-
-    mask = LD_SB(&chroma_mask_arr[48]);
-
-    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
-    src += (5 * src_stride);
-    LD_UB4(src, src_stride, src5, src6, src7, src8);
-
-    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
-    VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
-    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
-    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, tmp0, tmp1);
-    VSHF_B2_UB(src5, src6, src7, src8, mask, mask, tmp2, tmp3);
-    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src4, src5);
-    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
-    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
-
-    res_vt0 += res_vt1;
-    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
-    res_vt0 = __msa_sat_u_h(res_vt0, 7);
-    res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
-
-    ST2x4_UB(res, 0, dst, dst_stride);
-    dst += (4 * dst_stride);
-
-    DOTP_UB2_UH(src4, src5, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
-    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
-
-    res_vt0 += res_vt1;
-    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
-    res_vt0 = __msa_sat_u_h(res_vt0, 7);
-    res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
-
-    ST2x4_UB(res, 0, dst, dst_stride);
+    ST2x4_UB(res, 0, dst, stride);
 }
 
-static void avc_chroma_hv_2w_msa(uint8_t *src, int32_t src_stride,
-                                 uint8_t *dst, int32_t dst_stride,
+static void avc_chroma_hv_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1,
                                  int32_t height)
 {
     if (2 == height) {
-        avc_chroma_hv_2x2_msa(src, src_stride, dst, dst_stride, coef_hor0,
-                              coef_hor1, coef_ver0, coef_ver1);
+        avc_chroma_hv_2x2_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
+                              coef_ver1);
     } else if (4 == height) {
-        avc_chroma_hv_2x4_msa(src, src_stride, dst, dst_stride, coef_hor0,
-                              coef_hor1, coef_ver0, coef_ver1);
-    } else if (8 == height) {
-        avc_chroma_hv_2x8_msa(src, src_stride, dst, dst_stride, coef_hor0,
-                              coef_hor1, coef_ver0, coef_ver1);
+        avc_chroma_hv_2x4_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
+                              coef_ver1);
     }
 }
 
-static void avc_chroma_hv_4x2_msa(uint8_t *src, int32_t src_stride,
-                                  uint8_t *dst, int32_t dst_stride,
+static void avc_chroma_hv_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                   uint32_t coef_hor0, uint32_t coef_hor1,
                                   uint32_t coef_ver0, uint32_t coef_ver1)
 {
@@ -682,7 +624,7 @@ static void avc_chroma_hv_4x2_msa(uint8_t *src, int32_t src_stride,
     v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
 
     mask = LD_SB(&chroma_mask_arr[0]);
 
-    LD_UB3(src, src_stride, src0, src1, src2);
+    LD_UB3(src, stride, src0, src1, src2);
     VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
     DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
     MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
@@ -692,18 +634,13 @@ static void avc_chroma_hv_4x2_msa(uint8_t *src, int32_t src_stride,
     res_vt0 = __msa_sat_u_h(res_vt0, 7);
     res = (v4i32) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
 
-    ST4x2_UB(res, dst, dst_stride);
+    ST4x2_UB(res, dst, stride);
 }
 
-static void avc_chroma_hv_4x4multiple_msa(uint8_t *src, int32_t src_stride,
-                                          uint8_t *dst, int32_t dst_stride,
-                                          uint32_t coef_hor0,
-                                          uint32_t coef_hor1,
-                                          uint32_t coef_ver0,
-                                          uint32_t coef_ver1,
-                                          int32_t height)
+static void avc_chroma_hv_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                                  uint32_t coef_hor0, uint32_t coef_hor1,
+                                  uint32_t coef_ver0, uint32_t coef_ver1)
 {
-    uint32_t row;
     v16u8 src0, src1, src2, src3, src4;
     v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
     v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
@@ -717,55 +654,82 @@ static void avc_chroma_hv_4x4multiple_msa(uint8_t *src, int32_t src_stride,
 
     mask = LD_SB(&chroma_mask_arr[0]);
 
-    src0 = LD_UB(src);
-    src += src_stride;
+    LD_UB5(src, stride, src0, src1, src2, src3, src4);
+    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
+    VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
+    DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
+                coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
+                res_hz3);
+    MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
+         res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
+    ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
+    SRARI_H2_UH(res_vt0, res_vt1, 6);
+    SAT_UH2_UH(res_vt0, res_vt1, 7);
+    PCKEV_B2_SW(res_vt0, res_vt0, res_vt1, res_vt1, res0, res1);
+    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, stride);
+}
 
-    for (row = (height >> 2); row--;) {
-        LD_UB4(src, src_stride, src1, src2, src3, src4);
-        src += (4 * src_stride);
+static void avc_chroma_hv_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                                  uint32_t coef_hor0, uint32_t coef_hor1,
+                                  uint32_t coef_ver0, uint32_t coef_ver1)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, res0, res1;
+    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4, res_hz5, res_hz6, res_hz7;
+    v8u16 res_vt0, res_vt1, res_vt2, res_vt3, res_vt4, res_vt5, res_vt6, res_vt7;
+    v16i8 mask;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
 
-        VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
-        VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
-        DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
-                    coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
-                    res_hz3);
-        MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
-             coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
-             res_vt3);
-        ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
-        SRARI_H2_UH(res_vt0, res_vt1, 6);
-        SAT_UH2_UH(res_vt0, res_vt1, 7);
-        PCKEV_B2_SW(res_vt0, res_vt0, res_vt1, res_vt1, res0, res1);
+    mask = LD_SB(&chroma_mask_arr[0]);
 
-        ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
-        dst += (4 * dst_stride);
-        src0 = src4;
-    }
+    LD_UB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+    LD_UB4(src, stride, src5, src6, src7, src8);
+    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
+    VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
+    VSHF_B2_UB(src4, src5, src5, src6, mask, mask, src4, src5);
+    VSHF_B2_UB(src6, src7, src7, src8, mask, mask, src6, src7);
+    DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
+                coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3);
+    DOTP_UB4_UH(src4, src5, src6, src7, coeff_hz_vec, coeff_hz_vec,
+                coeff_hz_vec, coeff_hz_vec, res_hz4, res_hz5, res_hz6, res_hz7);
+    MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
+         res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
+    MUL4(res_hz4, coeff_vt_vec1, res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec1,
+         res_hz7, coeff_vt_vec0, res_vt4, res_vt5, res_vt6, res_vt7);
+    ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
+    ADD2(res_vt4, res_vt5, res_vt6, res_vt7, res_vt2, res_vt3);
+    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
+    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
+    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, res0, res1);
+    ST4x8_UB(res0, res1, dst, stride);
 }
 
-static void avc_chroma_hv_4w_msa(uint8_t *src, int32_t src_stride,
-                                 uint8_t *dst, int32_t dst_stride,
+static void avc_chroma_hv_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1,
                                  int32_t height)
 {
     if (2 == height) {
-        avc_chroma_hv_4x2_msa(src, src_stride, dst, dst_stride, coef_hor0,
-                              coef_hor1, coef_ver0, coef_ver1);
-    } else {
-        avc_chroma_hv_4x4multiple_msa(src, src_stride, dst, dst_stride,
-                                      coef_hor0, coef_hor1, coef_ver0,
-                                      coef_ver1, height);
+        avc_chroma_hv_4x2_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
+                              coef_ver1);
+    } else if (4 == height) {
+        avc_chroma_hv_4x4_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
+                              coef_ver1);
+    } else if (8 == height) {
+        avc_chroma_hv_4x8_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
+                              coef_ver1);
     }
 }
 
-static void avc_chroma_hv_8w_msa(uint8_t *src, int32_t src_stride,
-                                 uint8_t *dst, int32_t dst_stride,
-                                 uint32_t coef_hor0, uint32_t coef_hor1,
-                                 uint32_t coef_ver0, uint32_t coef_ver1,
-                                 int32_t height)
+static void avc_chroma_hv_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                                  uint32_t coef_hor0, uint32_t coef_hor1,
+                                  uint32_t coef_ver0, uint32_t coef_ver1)
 {
-    uint32_t row;
     v16u8 src0, src1, src2, src3, src4, out0, out1;
     v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
     v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
@@ -779,37 +743,99 @@ static void avc_chroma_hv_8w_msa(uint8_t *src, int32_t src_stride,
 
     mask = LD_SB(&chroma_mask_arr[32]);
 
     src0 = LD_UB(src);
-    src += src_stride;
+    src += stride;
 
     src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
     res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
 
-    for (row = (height >> 2); row--;) {
-        LD_UB4(src, src_stride, src1, src2, src3, src4);
-        src += (4 * src_stride);
+    LD_UB4(src, stride, src1, src2, src3, src4);
+    src += (4 * stride);
 
-        VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
-        VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
-        DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
-                    coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
-                    res_hz4);
-        MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
-             coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
-             res_vt3);
+    VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
+    VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
+    DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
+                coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3, res_hz4);
+    MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3, coeff_vt_vec0,
+         res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
 
-        res_vt0 += (res_hz0 * coeff_vt_vec1);
-        res_vt1 += (res_hz1 * coeff_vt_vec1);
-        res_vt2 += (res_hz2 * coeff_vt_vec1);
-        res_vt3 += (res_hz3 * coeff_vt_vec1);
+    res_vt0 += (res_hz0 * coeff_vt_vec1);
+    res_vt1 += (res_hz1 * coeff_vt_vec1);
+    res_vt2 += (res_hz2 * coeff_vt_vec1);
+    res_vt3 += (res_hz3 * coeff_vt_vec1);
 
-        SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
-        SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
-        PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
-        ST8x4_UB(out0, out1, dst, dst_stride);
+    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
+    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
+    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
+    ST8x4_UB(out0, out1, dst, stride);
+}
 
-        dst += (4 * dst_stride);
+static void avc_chroma_hv_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                                  uint32_t coef_hor0, uint32_t coef_hor1,
+                                  uint32_t coef_ver0, uint32_t coef_ver1)
+{
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16u8 out0, out1, out2, out3;
+    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
+    v8u16 res_hz5, res_hz6, res_hz7, res_hz8;
+    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
+    v8u16 res_vt4, res_vt5, res_vt6, res_vt7;
+    v16i8 mask;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
 
-        res_hz0 = res_hz4;
-    }
+    mask = LD_SB(&chroma_mask_arr[32]);
+
+    LD_UB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+    LD_UB4(src, stride, src5, src6, src7, src8);
+    src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
+    VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
+    VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
+    VSHF_B2_UB(src5, src5, src6, src6, mask, mask, src5, src6);
+    VSHF_B2_UB(src7, src7, src8, src8, mask, mask, src7, src8);
+    res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
+    DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
+                coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
+                res_hz4);
+    DOTP_UB4_UH(src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
+                coeff_hz_vec, coeff_hz_vec, res_hz5, res_hz6, res_hz7, res_hz8);
+    MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
+         coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
+         res_vt3);
+    MUL4(res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec0, res_hz7,
+         coeff_vt_vec0, res_hz8, coeff_vt_vec0, res_vt4, res_vt5, res_vt6,
+         res_vt7);
+    res_vt0 += (res_hz0 * coeff_vt_vec1);
+    res_vt1 += (res_hz1 * coeff_vt_vec1);
+    res_vt2 += (res_hz2 * coeff_vt_vec1);
+    res_vt3 += (res_hz3 * coeff_vt_vec1);
+    res_vt4 += (res_hz4 * coeff_vt_vec1);
+    res_vt5 += (res_hz5 * coeff_vt_vec1);
+    res_vt6 += (res_hz6 * coeff_vt_vec1);
+    res_vt7 += (res_hz7 * coeff_vt_vec1);
+    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
+    SRARI_H4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 6);
+    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
+    SAT_UH4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 7);
+    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
+    PCKEV_B2_UB(res_vt5, res_vt4, res_vt7, res_vt6, out2, out3);
+    ST8x8_UB(out0, out1, out2, out3, dst, stride);
+}
+
+static void avc_chroma_hv_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
+                                 uint32_t coef_hor0, uint32_t coef_hor1,
+                                 uint32_t coef_ver0, uint32_t coef_ver1,
+                                 int32_t height)
+{
+    if (4 == height) {
+        avc_chroma_hv_8x4_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
+                              coef_ver1);
+    } else if (8 == height) {
+        avc_chroma_hv_8x8_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
+                              coef_ver1);
+    }
 }
@@ -1896,8 +1922,7 @@ void ff_put_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src,
     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
 
     if (x && y) {
-        avc_chroma_hv_8w_msa(src, stride, dst,
-                             stride, x, (8 - x), y, (8 - y), height);
+        avc_chroma_hv_8w_msa(src, dst, stride, x, (8 - x), y, (8 - y), height);
     } else if (x) {
         avc_chroma_hz_8w_msa(src, dst, stride, x, (8 - x), height);
     } else if (y) {
@@ -1915,8 +1940,7 @@ void ff_put_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src,
     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
 
     if (x && y) {
-        avc_chroma_hv_4w_msa(src, stride, dst,
-                             stride, x, (8 - x), y, (8 - y), height);
+        avc_chroma_hv_4w_msa(src, dst, stride, x, (8 - x), y, (8 - y), height);
    } else if (x) {
         avc_chroma_hz_4w_msa(src, dst, stride, x, (8 - x), height);
     } else if (y) {
@@ -1939,8 +1963,7 @@ void ff_put_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src,
     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
 
     if (x && y) {
-        avc_chroma_hv_2w_msa(src, stride, dst,
-                             stride, x, (8 - x), y, (8 - y), height);
+        avc_chroma_hv_2w_msa(src, dst, stride, x, (8 - x), y, (8 - y), height);
     } else if (x) {
         avc_chroma_hz_2w_msa(src, dst, stride, x, (8 - x), height);
     } else if (y) {