@@ -62,10 +62,17 @@
     b2 = vec_mergeh(a1, a3); \
     b3 = vec_mergel(a1, a3)
 
-#define VEC_LOAD_U8_ADD_S16_STORE_U8(va)                \
+#if HAVE_BIGENDIAN
+#define vdst_load(d)                                    \
     vdst_orig = vec_ld(0, dst);                         \
-    vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask);    \
-    vdst_ss = (vec_s16) vec_mergeh(zero_u8v, vdst);     \
+    vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask)
+#else
+#define vdst_load(d) vdst = vec_vsx_ld(0, dst)
+#endif
+
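+/* vdst_load() fetches the destination pixels: the big-endian variant needs an
+ * aligned vec_ld() plus a vec_perm() with vdst_mask, the little-endian variant
+ * a single unaligned vec_vsx_ld().  VEC_LOAD_U8_ADD_S16_STORE_U8() then widens
+ * the pixels to s16, adds the residual and stores four saturated bytes back. */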
+#define VEC_LOAD_U8_ADD_S16_STORE_U8(va)                \
+    vdst_load();                                        \
+    vdst_ss = (vec_s16) VEC_MERGEH(zero_u8v, vdst);     \
     va = vec_add(va, vdst_ss);                          \
     va_u8 = vec_packsu(va, zero_s16v);                  \
     va_u32 = vec_splat((vec_u32) va_u8, 0);             \
@@ -165,26 +172,43 @@ static void h264_idct_add_altivec(uint8_t *dst, int16_t *block, int stride)
     d7 = vec_sub(b0v, b7v); \
 }
 
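+/* helpers for ALTIVEC_STORE_SUM_CLIP() below: on big-endian the unaligned
+ * destination row is accessed through vec_lvsl()/vec_lvsr() permute vectors
+ * and pairs of aligned vec_ld()/vec_st(); on little-endian, vec_vsx_ld() and
+ * vec_vsx_st() handle the unaligned access directly. */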
+#if HAVE_BIGENDIAN
+#define GET_2PERM(ldv, stv, d)  \
+    ldv = vec_lvsl(0, d);       \
+    stv = vec_lvsr(8, d);
+#define dstv_load(d)            \
+    vec_u8 hv = vec_ld(0, d);   \
+    vec_u8 lv = vec_ld(7, d);   \
+    vec_u8 dstv = vec_perm(hv, lv, (vec_u8) perm_ldv);
+#define dest_unligned_store(d)                                 \
+    vec_u8 edgehv;                                             \
+    vec_u8 bodyv  = vec_perm(idstsum8, idstsum8, perm_stv);    \
+    vec_u8 edgelv = vec_perm(sel, zero_u8v, perm_stv);         \
+    lv = vec_sel(lv, bodyv, edgelv);                           \
+    vec_st(lv, 7, d);                                          \
+    hv = vec_ld(0, d);                                         \
+    edgehv = vec_perm(zero_u8v, sel, perm_stv);                \
+    hv = vec_sel(hv, bodyv, edgehv);                           \
+    vec_st(hv, 0, d);
+#else
+#define GET_2PERM(ldv, stv, d) {}
+#define dstv_load(d) vec_u8 dstv = vec_vsx_ld(0, d)
+#define dest_unligned_store(d)                                            \
+    vec_u8 dst8 = vec_perm((vec_u8) idstsum8, dstv, vcprm(2, 3, s2, s3)); \
+    vec_vsx_st(dst8, 0, d)
+#endif /* HAVE_BIGENDIAN */
+
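+/* ALTIVEC_STORE_SUM_CLIP(): add the IDCT output, scaled down by 6 bits, to
+ * the destination pixels and store them back with unsigned saturation. */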
 #define ALTIVEC_STORE_SUM_CLIP(dest, idctv, perm_ldv, perm_stv, sel) { \
     /* unaligned load */                                               \
-    vec_u8 hv = vec_ld(0, dest);                                       \
-    vec_u8 lv = vec_ld(7, dest);                                       \
-    vec_u8 dstv = vec_perm(hv, lv, (vec_u8) perm_ldv);                 \
+    dstv_load(dest);                                                   \
     vec_s16 idct_sh6 = vec_sra(idctv, sixv);                           \
-    vec_u16 dst16 = (vec_u16) vec_mergeh(zero_u8v, dstv);              \
+    vec_u16 dst16 = (vec_u16) VEC_MERGEH(zero_u8v, dstv);              \
     vec_s16 idstsum = vec_adds(idct_sh6, (vec_s16) dst16);             \
     vec_u8 idstsum8 = vec_packsu(zero_s16v, idstsum);                  \
-    vec_u8 edgehv;                                                     \
     /* unaligned store */                                              \
-    vec_u8 bodyv = vec_perm(idstsum8, idstsum8, perm_stv);             \
-    vec_u8 edgelv = vec_perm(sel, zero_u8v, perm_stv);                 \
-    lv = vec_sel(lv, bodyv, edgelv);                                   \
-    vec_st(lv, 7, dest);                                               \
-    hv = vec_ld(0, dest);                                              \
-    edgehv = vec_perm(zero_u8v, sel, perm_stv);                        \
-    hv = vec_sel(hv, bodyv, edgehv);                                   \
-    vec_st(hv, 0, dest);                                               \
- }
+    dest_unligned_store(dest);                                         \
+}
 
 static void h264_idct8_add_altivec(uint8_t *dst, int16_t *dct, int stride)
 {
@@ -192,8 +216,8 @@ static void h264_idct8_add_altivec(uint8_t *dst, int16_t *dct, int stride)
     vec_s16 d0, d1, d2, d3, d4, d5, d6, d7;
     vec_s16 idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7;
 
-    vec_u8 perm_ldv = vec_lvsl(0, dst);
-    vec_u8 perm_stv = vec_lvsr(8, dst);
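+    /* the unaligned load/store permute vectors are only needed on
+     * big-endian; GET_2PERM() expands to an empty block on little-endian */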
+    vec_u8 perm_ldv, perm_stv;
+    GET_2PERM(perm_ldv, perm_stv, dst);
 
     const vec_u16 onev = vec_splat_u16(1);
     const vec_u16 twov = vec_splat_u16(2);
@@ -236,20 +260,25 @@ static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, int16_t *block, int stride, int size)
 {
     vec_s16 dc16;
     vec_u8 dcplus, dcminus, v0, v1, v2, v3, aligner;
+    vec_s32 v_dc32;
     LOAD_ZERO;
     DECLARE_ALIGNED(16, int, dc);
     int i;
 
     dc = (block[0] + 32) >> 6;
     block[0] = 0;
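+    /* load the scalar dc into a vector and splat/shift it into position;
+     * VEC_SPLAT16() and VEC_SLD16() are endian-aware wrappers */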
-    dc16 = vec_splat((vec_s16) vec_lde(0, &dc), 1);
+    v_dc32 = vec_lde(0, &dc);
+    dc16 = VEC_SPLAT16((vec_s16) v_dc32, 1);
 
     if (size == 4)
-        dc16 = vec_sld(dc16, zero_s16v, 8);
+        dc16 = VEC_SLD16(dc16, zero_s16v, 8);
     dcplus = vec_packsu(dc16, zero_s16v);
     dcminus = vec_packsu(vec_sub(zero_s16v, dc16), zero_s16v);
 
     aligner = vec_lvsr(0, dst);
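+    /* on little-endian the vec_lvsr() permute vector is adjusted with
+     * vcswapc() so the vec_perm() below still shifts the packed dc bytes
+     * to the destination alignment */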
+#if !HAVE_BIGENDIAN
+    aligner = vec_perm(aligner, zero_u8v, vcswapc());
+#endif
 
     dcplus = vec_perm(dcplus, dcplus, aligner);
     dcminus = vec_perm(dcminus, dcminus, aligner);
@@ -633,6 +662,9 @@ void weight_h264_W_altivec(uint8_t *block, int stride, int height,
     temp[2] = offset;
 
     vtemp = (vec_s16) vec_ld(0, temp);
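+    /* the parameters were stored to temp[] as 32-bit ints; on little-endian
+     * the halfwords are permuted so the vec_splat() indices below pick the
+     * same elements as on big-endian */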
+#if !HAVE_BIGENDIAN
+    vtemp = (vec_s16) vec_perm(vtemp, vtemp, vcswapi2s(0, 1, 2, 3));
+#endif
     vlog2_denom = (vec_u16) vec_splat(vtemp, 1);
     vweight = vec_splat(vtemp, 3);
     voffset = vec_splat(vtemp, 5);
@@ -641,8 +673,8 @@ void weight_h264_W_altivec(uint8_t *block, int stride, int height,
     for (y = 0; y < height; y++) {
         vblock = vec_ld(0, block);
 
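+        /* zero-extend the u8 pixels to s16 via the endian-aware
+         * VEC_MERGEH()/VEC_MERGEL() wrappers */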
-        v0 = (vec_s16) vec_mergeh(zero_u8v, vblock);
-        v1 = (vec_s16) vec_mergel(zero_u8v, vblock);
+        v0 = (vec_s16) VEC_MERGEH(zero_u8v, vblock);
+        v1 = (vec_s16) VEC_MERGEL(zero_u8v, vblock);
 
         if (w == 16 || aligned) {
             v0 = vec_mladd(v0, vweight, zero_s16v);
@@ -679,6 +711,9 @@ void biweight_h264_W_altivec(uint8_t *dst, uint8_t *src, int stride, int height,
     temp[3] = offset;
 
     vtemp = (vec_s16) vec_ld(0, temp);
+#if !HAVE_BIGENDIAN
+    vtemp = (vec_s16) vec_perm(vtemp, vtemp, vcswapi2s(0, 1, 2, 3));
+#endif
     vlog2_denom = (vec_u16) vec_splat(vtemp, 1);
     vweights = vec_splat(vtemp, 3);
     vweightd = vec_splat(vtemp, 5);
@@ -690,10 +725,10 @@ void biweight_h264_W_altivec(uint8_t *dst, uint8_t *src, int stride, int height,
         vdst = vec_ld(0, dst);
         vsrc = vec_ld(0, src);
 
-        v0 = (vec_s16) vec_mergeh(zero_u8v, vdst);
-        v1 = (vec_s16) vec_mergel(zero_u8v, vdst);
-        v2 = (vec_s16) vec_mergeh(zero_u8v, vsrc);
-        v3 = (vec_s16) vec_mergel(zero_u8v, vsrc);
+        v0 = (vec_s16) VEC_MERGEH(zero_u8v, vdst);
+        v1 = (vec_s16) VEC_MERGEL(zero_u8v, vdst);
+        v2 = (vec_s16) VEC_MERGEH(zero_u8v, vsrc);
+        v3 = (vec_s16) VEC_MERGEL(zero_u8v, vsrc);
 
         if (w == 8) {
             if (src_aligned)