@@ -1408,15 +1408,15 @@ static void avc_chroma_vt_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst,
     }
 }
 
-static void avc_chroma_hv_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride,
-                                               uint8_t *dst, int32_t dst_stride,
+static void avc_chroma_hv_and_aver_dst_2x2_msa(uint8_t *src, uint8_t *dst,
+                                               int32_t stride,
                                                uint32_t coef_hor0,
                                                uint32_t coef_hor1,
                                                uint32_t coef_ver0,
                                                uint32_t coef_ver1)
 {
     uint16_t out0, out1;
-    v16u8 dst0, dst1;
+    v16u8 dst0 = { 0 };
     v16u8 src0, src1, src2;
     v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
     v16i8 res, mask;
@ -1428,8 +1428,11 @@ static void avc_chroma_hv_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride,
mask = LD_SB ( & chroma_mask_arr [ 48 ] ) ;
LD_UB3 ( src , src_stride , src0 , src1 , src2 ) ;
LD_UB2 ( dst , dst_stride , dst0 , dst1 ) ;
LD_UB3 ( src , stride , src0 , src1 , src2 ) ;
out0 = LH ( dst ) ;
out1 = LH ( dst + stride ) ;
dst0 = ( v16u8 ) __msa_insert_h ( ( v8i16 ) dst0 , 0 , out0 ) ;
dst0 = ( v16u8 ) __msa_insert_h ( ( v8i16 ) dst0 , 1 , out1 ) ;
VSHF_B2_UB ( src0 , src1 , src1 , src2 , mask , mask , src0 , src1 ) ;
DOTP_UB2_UH ( src0 , src1 , coeff_hz_vec , coeff_hz_vec , res_hz0 , res_hz1 ) ;
MUL2 ( res_hz0 , coeff_vt_vec1 , res_hz1 , coeff_vt_vec0 , res_vt0 , res_vt1 ) ;
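This hunk shows the pattern the rest of the patch repeats at every block size: instead of pulling destination rows in with full vector loads (the removed LD_UB2 reads 16 bytes per row for a block only 2 pixels wide), the rows are fetched as scalars and inserted into lanes of one zero-initialized vector. A minimal scalar sketch of what the LH plus __msa_insert_h sequence builds; the helper is illustrative only, not part of the patch:

    #include <stdint.h>
    #include <string.h>

    /* Pack two 2-byte dst rows into lanes 0 and 1 of a 16-byte buffer,
     * mirroring "v16u8 dst0 = { 0 }" followed by LH + __msa_insert_h. */
    static void gather_dst_2x2(const uint8_t *dst, int stride, uint8_t vec[16])
    {
        memset(vec, 0, 16);               /* dst0 = { 0 } */
        memcpy(vec + 0, dst, 2);          /* lane 0 <- dst row 0 */
        memcpy(vec + 2, dst + stride, 2); /* lane 1 <- dst row 1 */
    }

Besides avoiding reads past the 2-pixel block, this keeps both rows in one register, so a single __msa_aver_u_b averages the whole block at once.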
@@ -1438,67 +1441,26 @@ static void avc_chroma_hv_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride,
     res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
     res_vt0 = __msa_sat_u_h(res_vt0, 7);
     res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
-    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
     dst0 = __msa_aver_u_b((v16u8) res, dst0);
-
     out0 = __msa_copy_u_h((v8i16) dst0, 0);
     out1 = __msa_copy_u_h((v8i16) dst0, 1);
 
     SH(out0, dst);
-    dst += dst_stride;
+    dst += stride;
     SH(out1, dst);
 }
 
-static void avc_chroma_hv_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride,
-                                               uint8_t *dst, int32_t dst_stride,
+static void avc_chroma_hv_and_aver_dst_2x4_msa(uint8_t *src, uint8_t *dst,
+                                               int32_t stride,
                                                uint32_t coef_hor0,
                                                uint32_t coef_hor1,
                                                uint32_t coef_ver0,
                                                uint32_t coef_ver1)
 {
+    uint16_t tp0, tp1, tp2, tp3;
     v16u8 src0, src1, src2, src3, src4;
     v16u8 tmp0, tmp1, tmp2, tmp3;
-    v16u8 dst0, dst1, dst2, dst3;
-    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
-    v16i8 res, mask;
-    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
-    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
-    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
-    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
-    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
-
-    mask = LD_SB(&chroma_mask_arr[48]);
-
-    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
-    VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
-    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
-    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
-    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
-
-    res_vt0 += res_vt1;
-    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
-    res_vt0 = __msa_sat_u_h(res_vt0, 7);
-    res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
-    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
-    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2);
-    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3);
-    dst0 = __msa_aver_u_b((v16u8) res, dst0);
-
-    ST2x4_UB(dst0, 0, dst, dst_stride);
-}
-
-static void avc_chroma_hv_and_aver_dst_2x8_msa(uint8_t *src, int32_t src_stride,
-                                               uint8_t *dst, int32_t dst_stride,
-                                               uint32_t coef_hor0,
-                                               uint32_t coef_hor1,
-                                               uint32_t coef_ver0,
-                                               uint32_t coef_ver1)
-{
-    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
-    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
-    v16u8 tmp0, tmp1, tmp2, tmp3;
+    v16u8 dst0 = { 0 };
     v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
     v16i8 res, mask;
     v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
@@ -1509,26 +1471,18 @@ static void avc_chroma_hv_and_aver_dst_2x8_msa(uint8_t *src, int32_t src_stride,
 
     mask = LD_SB(&chroma_mask_arr[48]);
 
-    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
-    src += (5 * src_stride);
-    LD_UB4(src, src_stride, src5, src6, src7, src8);
-    LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
-
-    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
-    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2);
-    dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3);
-
-    dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 1, (v8i16) dst5);
-    dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 2, (v8i16) dst6);
-    dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 3, (v8i16) dst7);
-
+    LD_UB5(src, stride, src0, src1, src2, src3, src4);
+    tp0 = LH(dst);
+    tp1 = LH(dst + stride);
+    tp2 = LH(dst + 2 * stride);
+    tp3 = LH(dst + 3 * stride);
+    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 0, tp0);
+    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 1, tp1);
+    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 2, tp2);
+    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 3, tp3);
     VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
     VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
     ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
-    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, tmp0, tmp1);
-    VSHF_B2_UB(src5, src6, src7, src8, mask, mask, tmp2, tmp3);
-    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src4, src5);
     DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
     MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
@@ -1538,23 +1492,11 @@ static void avc_chroma_hv_and_aver_dst_2x8_msa(uint8_t *src, int32_t src_stride,
     res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
     dst0 = __msa_aver_u_b((v16u8) res, dst0);
 
-    ST2x4_UB(dst0, 0, dst, dst_stride);
-    dst += (4 * dst_stride);
-
-    DOTP_UB2_UH(src4, src5, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
-    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
-
-    res_vt0 += res_vt1;
-    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
-    res_vt0 = __msa_sat_u_h(res_vt0, 7);
-    res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
-    dst4 = __msa_aver_u_b((v16u8) res, dst4);
-
-    ST2x4_UB(dst4, 0, dst, dst_stride);
+    ST2x4_UB(dst0, 0, dst, stride);
 }
 
-static void avc_chroma_hv_and_aver_dst_2w_msa(uint8_t *src, int32_t src_stride,
-                                              uint8_t *dst, int32_t dst_stride,
+static void avc_chroma_hv_and_aver_dst_2w_msa(uint8_t *src, uint8_t *dst,
+                                              int32_t stride,
                                               uint32_t coef_hor0,
                                               uint32_t coef_hor1,
                                               uint32_t coef_ver0,
@@ -1562,31 +1504,26 @@ static void avc_chroma_hv_and_aver_dst_2w_msa(uint8_t *src, int32_t src_stride,
                                               int32_t height)
 {
     if (2 == height) {
-        avc_chroma_hv_and_aver_dst_2x2_msa(src, src_stride, dst, dst_stride,
-                                           coef_hor0, coef_hor1,
-                                           coef_ver0, coef_ver1);
+        avc_chroma_hv_and_aver_dst_2x2_msa(src, dst, stride, coef_hor0,
+                                           coef_hor1, coef_ver0, coef_ver1);
     } else if (4 == height) {
-        avc_chroma_hv_and_aver_dst_2x4_msa(src, src_stride, dst, dst_stride,
-                                           coef_hor0, coef_hor1,
-                                           coef_ver0, coef_ver1);
-    } else if (8 == height) {
-        avc_chroma_hv_and_aver_dst_2x8_msa(src, src_stride, dst, dst_stride,
-                                           coef_hor0, coef_hor1,
-                                           coef_ver0, coef_ver1);
+        avc_chroma_hv_and_aver_dst_2x4_msa(src, dst, stride, coef_hor0,
+                                           coef_hor1, coef_ver0, coef_ver1);
     }
 }
 
-static void avc_chroma_hv_and_aver_dst_4x2_msa(uint8_t *src, int32_t src_stride,
-                                               uint8_t *dst, int32_t dst_stride,
+static void avc_chroma_hv_and_aver_dst_4x2_msa(uint8_t *src, uint8_t *dst,
+                                               int32_t stride,
                                                uint32_t coef_hor0,
                                                uint32_t coef_hor1,
                                                uint32_t coef_ver0,
                                                uint32_t coef_ver1)
 {
+    uint32_t tp0, tp1;
     v16u8 src0, src1, src2;
-    v16u8 dst0, dst1;
+    v16u8 dst0, dst_data = { 0 };
     v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
-    v16i8 res, mask;
+    v16i8 mask;
     v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
     v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
     v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
@@ -1595,8 +1532,9 @@ static void avc_chroma_hv_and_aver_dst_4x2_msa(uint8_t *src, int32_t src_stride,
 
     mask = LD_SB(&chroma_mask_arr[0]);
 
-    LD_UB3(src, src_stride, src0, src1, src2);
-    LD_UB2(dst, dst_stride, dst0, dst1);
+    LD_UB3(src, stride, src0, src1, src2);
+    LW2(dst, stride, tp0, tp1);
+    INSERT_W2_UB(tp0, tp1, dst_data);
     VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
     DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
     MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
@@ -1604,26 +1542,22 @@ static void avc_chroma_hv_and_aver_dst_4x2_msa(uint8_t *src, int32_t src_stride,
     res_vt0 += res_vt1;
     res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
     res_vt0 = __msa_sat_u_h(res_vt0, 7);
-    res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
-    dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1);
-    dst0 = __msa_aver_u_b((v16u8) res, dst0);
+    dst0 = (v16u8) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
+    dst0 = __msa_aver_u_b(dst0, dst_data);
 
-    ST4x2_UB(dst0, dst, dst_stride);
+    ST4x2_UB(dst0, dst, stride);
 }
 
-static void avc_chroma_hv_and_aver_dst_4x4mul_msa(uint8_t *src,
-                                                  int32_t src_stride,
-                                                  uint8_t *dst,
-                                                  int32_t dst_stride,
-                                                  uint32_t coef_hor0,
-                                                  uint32_t coef_hor1,
-                                                  uint32_t coef_ver0,
-                                                  uint32_t coef_ver1,
-                                                  int32_t height)
+static void avc_chroma_hv_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst,
+                                               int32_t stride,
+                                               uint32_t coef_hor0,
+                                               uint32_t coef_hor1,
+                                               uint32_t coef_ver0,
+                                               uint32_t coef_ver1)
 {
-    uint32_t row;
+    uint32_t tp0, tp1, tp2, tp3;
     v16u8 src0, src1, src2, src3, src4;
-    v16u8 dst0, dst1, dst2, dst3;
+    v16u8 out, dst_data = { 0 };
     v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
     v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
     v16i8 mask;
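Every path in this patch computes the same per-pixel value; only the block geometry and register packing change. The mask gathers horizontal neighbor pairs, DOTP applies the interleaved (8 - x, x) weights, the vertical MUL/ADD stage applies (8 - y) and y, __msa_srari_h(..., 6) rounds and shifts, and __msa_aver_u_b takes the rounding average with dst. Here is a scalar sketch of that arithmetic for one pixel, written from the standard H.264 chroma interpolation formula (an assumption for cross-checking, not code from this patch):

    #include <stdint.h>

    /* One chroma sample at fractional offset (x, y), 0 <= x, y < 8,
     * followed by the rounding average with the existing dst pixel,
     * as the *_and_aver_dst_* functions do. */
    static uint8_t chroma_hv_avg_pixel(const uint8_t *src, int stride,
                                       uint8_t dst_px, int x, int y)
    {
        int top  = (8 - x) * src[0]      + x * src[1];          /* horizontal DOTP, row i     */
        int bot  = (8 - x) * src[stride] + x * src[stride + 1]; /* horizontal DOTP, row i + 1 */
        int pred = ((8 - y) * top + y * bot + 32) >> 6;         /* vertical MUL/ADD + srari 6 */
        return (uint8_t) ((pred + dst_px + 1) >> 1);            /* aver_u_b                   */
    }

With these coefficient ranges pred never exceeds 255 (worst case (8 * 8 * 255 + 32) >> 6 = 255), so the __msa_sat_u_h(..., 7) step is only a guard before packing down to bytes.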
@@ -1632,45 +1566,78 @@ static void avc_chroma_hv_and_aver_dst_4x4mul_msa(uint8_t *src,
     v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
     v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
     v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
-    v16u8 res0, res1;
 
     mask = LD_SB(&chroma_mask_arr[0]);
 
-    src0 = LD_UB(src);
-    src += src_stride;
-
-    for (row = (height >> 2); row--;) {
-        LD_UB4(src, src_stride, src1, src2, src3, src4);
-        src += (4 * src_stride);
-
-        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-
-        VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
-        VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
-        DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
-                    coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
-                    res_hz3);
-        MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
-             coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
-             res_vt3);
-        ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
-        SRARI_H2_UH(res_vt0, res_vt1, 6);
-        SAT_UH2_UH(res_vt0, res_vt1, 7);
-        PCKEV_B2_UB(res_vt0, res_vt0, res_vt1, res_vt1, res0, res1);
-
-        dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1);
-        dst1 = (v16u8) __msa_insve_w((v4i32) dst2, 1, (v4i32) dst3);
-        AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1);
-        ST4x4_UB(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
-        dst += (4 * dst_stride);
-
-        src0 = src4;
-    }
+    LD_UB5(src, stride, src0, src1, src2, src3, src4);
+    LW4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst_data);
+    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
+    VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
+    DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
+                coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
+                res_hz3);
+    MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
+         res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
+    ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
+    SRARI_H2_UH(res_vt0, res_vt1, 6);
+    SAT_UH2_UH(res_vt0, res_vt1, 7);
+    out = (v16u8) __msa_pckev_b((v16i8) res_vt1, (v16i8) res_vt0);
+    out = __msa_aver_u_b(out, dst_data);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
 }
 
+static void avc_chroma_hv_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst,
+                                               int32_t stride,
+                                               uint32_t coef_hor0,
+                                               uint32_t coef_hor1,
+                                               uint32_t coef_ver0,
+                                               uint32_t coef_ver1)
+{
+    uint32_t tp0, tp1, tp2, tp3;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, res0, res1;
+    v16u8 dst0 = { 0 }, dst1 = { 0 };
+    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4, res_hz5, res_hz6, res_hz7;
+    v8u16 res_vt0, res_vt1, res_vt2, res_vt3, res_vt4, res_vt5, res_vt6, res_vt7;
+    v16i8 mask;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+    mask = LD_SB(&chroma_mask_arr[0]);
+
+    LD_UB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+    LD_UB4(src, stride, src5, src6, src7, src8);
+    LW4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+    LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
+    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
+    VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
+    VSHF_B2_UB(src4, src5, src5, src6, mask, mask, src4, src5);
+    VSHF_B2_UB(src6, src7, src7, src8, mask, mask, src6, src7);
+    DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
+                coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3);
+    DOTP_UB4_UH(src4, src5, src6, src7, coeff_hz_vec, coeff_hz_vec,
+                coeff_hz_vec, coeff_hz_vec, res_hz4, res_hz5, res_hz6, res_hz7);
+    MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
+         res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
+    MUL4(res_hz4, coeff_vt_vec1, res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec1,
+         res_hz7, coeff_vt_vec0, res_vt4, res_vt5, res_vt6, res_vt7);
+    ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
+    ADD2(res_vt4, res_vt5, res_vt6, res_vt7, res_vt2, res_vt3);
+    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
+    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
+    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, res0, res1);
+    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+    ST4x8_UB(res0, res1, dst, stride);
+}
+
-static void avc_chroma_hv_and_aver_dst_4w_msa(uint8_t *src, int32_t src_stride,
-                                              uint8_t *dst, int32_t dst_stride,
+static void avc_chroma_hv_and_aver_dst_4w_msa(uint8_t *src, uint8_t *dst,
+                                              int32_t stride,
                                               uint32_t coef_hor0,
                                               uint32_t coef_hor1,
                                               uint32_t coef_ver0,
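After saturation each u16 lane holds a byte-range result, and PCKEV_B2_UB / __msa_pckev_b collapse pairs of halfword vectors to bytes by keeping the even-indexed bytes, which on the little-endian configurations this file targets are the low bytes of each u16 lane. A scalar sketch of that pack, assuming little-endian layout (the helper name is invented for illustration):

    #include <stdint.h>

    /* pckev_b(a, b): even-indexed bytes of b fill out[0..7], those of a
     * fill out[8..15]; for u16 lanes <= 255 this truncates each lane to u8. */
    static void pckev_b_ref(const uint16_t a[8], const uint16_t b[8],
                            uint8_t out[16])
    {
        int i;

        for (i = 0; i < 8; i++) {
            out[i]     = (uint8_t) b[i]; /* low byte of lane = even byte */
            out[8 + i] = (uint8_t) a[i];
        }
    }

That is why the result vectors pair up as pckev_b(res_vt1, res_vt0): the earlier rows land in the low half and the later rows in the high half before the average with dst.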
@@ -1678,30 +1645,30 @@ static void avc_chroma_hv_and_aver_dst_4w_msa(uint8_t *src, int32_t src_stride,
                                               int32_t height)
 {
     if (2 == height) {
-        avc_chroma_hv_and_aver_dst_4x2_msa(src, src_stride, dst, dst_stride,
-                                           coef_hor0, coef_hor1,
-                                           coef_ver0, coef_ver1);
-    } else {
-        avc_chroma_hv_and_aver_dst_4x4mul_msa(src, src_stride, dst, dst_stride,
-                                              coef_hor0, coef_hor1,
-                                              coef_ver0, coef_ver1, height);
+        avc_chroma_hv_and_aver_dst_4x2_msa(src, dst, stride, coef_hor0,
+                                           coef_hor1, coef_ver0, coef_ver1);
+    } else if (4 == height) {
+        avc_chroma_hv_and_aver_dst_4x4_msa(src, dst, stride, coef_hor0,
+                                           coef_hor1, coef_ver0, coef_ver1);
+    } else if (8 == height) {
+        avc_chroma_hv_and_aver_dst_4x8_msa(src, dst, stride, coef_hor0,
+                                           coef_hor1, coef_ver0, coef_ver1);
     }
 }
 
-static void avc_chroma_hv_and_aver_dst_8w_msa(uint8_t *src, int32_t src_stride,
-                                              uint8_t *dst, int32_t dst_stride,
-                                              uint32_t coef_hor0,
-                                              uint32_t coef_hor1,
-                                              uint32_t coef_ver0,
-                                              uint32_t coef_ver1,
-                                              int32_t height)
+static void avc_chroma_hv_and_aver_dst_8x4_msa(uint8_t *src, uint8_t *dst,
+                                               int32_t stride,
+                                               uint32_t coef_hor0,
+                                               uint32_t coef_hor1,
+                                               uint32_t coef_ver0,
+                                               uint32_t coef_ver1)
 {
-    uint32_t row;
+    uint64_t tp0, tp1, tp2, tp3;
     v16u8 src0, src1, src2, src3, src4, out0, out1;
     v8u16 res_hz0, res_hz1, res_hz2;
     v8u16 res_hz3, res_hz4;
     v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
-    v16u8 dst0, dst1, dst2, dst3;
+    v16u8 dst0 = { 0 }, dst1 = { 0 };
     v16i8 mask;
     v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
     v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
@@ -1712,40 +1679,114 @@ static void avc_chroma_hv_and_aver_dst_8w_msa(uint8_t *src, int32_t src_stride,
     mask = LD_SB(&chroma_mask_arr[32]);
 
     src0 = LD_UB(src);
-    src += src_stride;
+    src += stride;
     src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
     res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
+
+    LD_UB4(src, stride, src1, src2, src3, src4);
+    src += (4 * stride);
+    LD4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst0);
+    INSERT_D2_UB(tp2, tp3, dst1);
+    VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
+    VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
+    DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
+                coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3, res_hz4);
+    MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3, coeff_vt_vec0,
+         res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
+    res_vt0 += (res_hz0 * coeff_vt_vec1);
+    res_vt1 += (res_hz1 * coeff_vt_vec1);
+    res_vt2 += (res_hz2 * coeff_vt_vec1);
+    res_vt3 += (res_hz3 * coeff_vt_vec1);
+    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
+    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
+    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
+    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
+    ST8x4_UB(out0, out1, dst, stride);
+}
+
+static void avc_chroma_hv_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst,
+                                               int32_t stride,
+                                               uint32_t coef_hor0,
+                                               uint32_t coef_hor1,
+                                               uint32_t coef_ver0,
+                                               uint32_t coef_ver1)
+{
+    uint64_t tp0, tp1, tp2, tp3;
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+    v16u8 out0, out1, out2, out3;
+    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
+    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
+    v8u16 res_hz5, res_hz6, res_hz7, res_hz8;
+    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
+    v8u16 res_vt4, res_vt5, res_vt6, res_vt7;
+    v16i8 mask;
+    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
+    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
+    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
+    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
+    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
+
+    mask = LD_SB(&chroma_mask_arr[32]);
+
+    LD_UB5(src, stride, src0, src1, src2, src3, src4);
+    src += (5 * stride);
+    LD_UB4(src, stride, src5, src6, src7, src8);
+    src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
+    VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
+    VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
+    VSHF_B2_UB(src5, src5, src6, src6, mask, mask, src5, src6);
+    VSHF_B2_UB(src7, src7, src8, src8, mask, mask, src7, src8);
+    res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
+    DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
+                coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
+                res_hz4);
+    DOTP_UB4_UH(src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
+                coeff_hz_vec, coeff_hz_vec, res_hz5, res_hz6, res_hz7, res_hz8);
+    MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
+         coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
+         res_vt3);
+    MUL4(res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec0, res_hz7,
+         coeff_vt_vec0, res_hz8, coeff_vt_vec0, res_vt4, res_vt5, res_vt6,
+         res_vt7);
+    LD4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst0);
+    INSERT_D2_UB(tp2, tp3, dst1);
+    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst2);
+    INSERT_D2_UB(tp2, tp3, dst3);
+    res_vt0 += (res_hz0 * coeff_vt_vec1);
+    res_vt1 += (res_hz1 * coeff_vt_vec1);
+    res_vt2 += (res_hz2 * coeff_vt_vec1);
+    res_vt3 += (res_hz3 * coeff_vt_vec1);
+    res_vt4 += (res_hz4 * coeff_vt_vec1);
+    res_vt5 += (res_hz5 * coeff_vt_vec1);
+    res_vt6 += (res_hz6 * coeff_vt_vec1);
+    res_vt7 += (res_hz7 * coeff_vt_vec1);
+    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
+    SRARI_H4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 6);
+    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
+    SAT_UH4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 7);
+    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
+    PCKEV_B2_UB(res_vt5, res_vt4, res_vt7, res_vt6, out2, out3);
+    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
+    AVER_UB2_UB(out2, dst2, out3, dst3, out2, out3);
+    ST8x8_UB(out0, out1, out2, out3, dst, stride);
+}
 
-    for (row = (height >> 2); row--;) {
-        LD_UB4(src, src_stride, src1, src2, src3, src4);
-        src += (4 * src_stride);
-
-        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-
-        VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
-        VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
-        DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
-                    coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
-                    res_hz4);
-        MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
-             coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
-             res_vt3);
-        res_vt0 += (res_hz0 * coeff_vt_vec1);
-        res_vt1 += (res_hz1 * coeff_vt_vec1);
-        res_vt2 += (res_hz2 * coeff_vt_vec1);
-        res_vt3 += (res_hz3 * coeff_vt_vec1);
-        SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
-        SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
-        PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
-        PCKEV_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
-        AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
-        ST8x4_UB(out0, out1, dst, dst_stride);
-        dst += (4 * dst_stride);
-
-        res_hz0 = res_hz4;
-    }
+static void avc_chroma_hv_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst,
+                                              int32_t stride,
+                                              uint32_t coef_hor0,
+                                              uint32_t coef_hor1,
+                                              uint32_t coef_ver0,
+                                              uint32_t coef_ver1,
+                                              int32_t height)
+{
+    if (4 == height) {
+        avc_chroma_hv_and_aver_dst_8x4_msa(src, dst, stride, coef_hor0,
+                                           coef_hor1, coef_ver0, coef_ver1);
+    } else if (8 == height) {
+        avc_chroma_hv_and_aver_dst_8x8_msa(src, dst, stride, coef_hor0,
+                                           coef_hor1, coef_ver0, coef_ver1);
+    }
 }
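The 8-pixel-wide paths scale the same dst-gathering trick up: LD4 fetches four 8-byte rows as 64-bit scalars and INSERT_D2_UB packs them pairwise, so an 8x4 block needs only two vector averages and 8x8 needs four. A hypothetical scalar picture of the packing (helper name invented for illustration):

    #include <stdint.h>
    #include <string.h>

    /* Pack four 8-byte dst rows into two 16-byte buffers: rows 0-1 into
     * vec[0], rows 2-3 into vec[1], mirroring LD4 + INSERT_D2_UB above. */
    static void gather_dst_8x4(const uint8_t *dst, int stride,
                               uint8_t vec[2][16])
    {
        int row;

        for (row = 0; row < 4; row++)
            memcpy(vec[row / 2] + 8 * (row % 2), dst + row * stride, 8);
    }

Note also that the new 8w dispatcher only accepts heights 4 and 8, where the removed loop handled any multiple of 4; for the avg MC call sites below, 4 and 8 are the only heights that occur, which is presumably why the loop could go.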
@@ -1923,8 +1964,7 @@ void ff_avg_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src,
     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
 
     if (x && y) {
-        avc_chroma_hv_and_aver_dst_8w_msa(src, stride, dst,
-                                          stride, x, (8 - x), y,
+        avc_chroma_hv_and_aver_dst_8w_msa(src, dst, stride, x, (8 - x), y,
                                           (8 - y), height);
     } else if (x) {
         avc_chroma_hz_and_aver_dst_8w_msa(src, dst, stride, x, (8 - x), height);
@@ -1941,8 +1981,7 @@ void ff_avg_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src,
     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
 
     if (x && y) {
-        avc_chroma_hv_and_aver_dst_4w_msa(src, stride, dst,
-                                          stride, x, (8 - x), y,
+        avc_chroma_hv_and_aver_dst_4w_msa(src, dst, stride, x, (8 - x), y,
                                           (8 - y), height);
     } else if (x) {
         avc_chroma_hz_and_aver_dst_4w_msa(src, dst, stride, x, (8 - x), height);
@@ -1961,8 +2000,7 @@ void ff_avg_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src,
     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
 
     if (x && y) {
-        avc_chroma_hv_and_aver_dst_2w_msa(src, stride, dst,
-                                          stride, x, (8 - x), y,
+        avc_chroma_hv_and_aver_dst_2w_msa(src, dst, stride, x, (8 - x), y,
                                           (8 - y), height);
     } else if (x) {
         avc_chroma_hz_and_aver_dst_2w_msa(src, dst, stride, x, (8 - x), height);