@@ -19,12 +19,14 @@
  */
 
 #include "libavutil/mem.h"
+#include "libavutil/ppc/types_altivec.h"
 #include "libavutil/ppc/util_altivec.h"
 
 /* this code assume that stride % 16 == 0 */
+
 #define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
-        vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
-        vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
+        vsrc2ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc2uc);\
+        vsrc3ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc3uc);\
 \
         psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
         psum = vec_mladd(vB, vsrc1ssH, psum);\
@@ -49,8 +51,8 @@
 
 #define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
 \
-        vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
-        vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
+        vsrc0ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc0uc);\
+        vsrc1ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc1uc);\
 \
         psum = vec_mladd(vA, vsrc0ssH, v32ss);\
         psum = vec_mladd(vE, vsrc1ssH, psum);\
@@ -70,6 +72,43 @@
 #define noop(a) a
 #define add28(a) vec_add(v28ss, a)
 
+#if HAVE_BIGENDIAN
+#define GET_VSRC1(vs0, off, b, perm0, s){    \
+    vec_u8 vsrcCuc, vsrcDuc;                 \
+    vsrcCuc = vec_ld(off, s);                \
+    if (loadSecond){                         \
+        vsrcDuc = vec_ld(off + b, s);        \
+    } else                                   \
+        vsrcDuc = vsrcCuc;                   \
+                                             \
+    vs0 = vec_perm(vsrcCuc, vsrcDuc, perm0); \
+}
+#define GET_VSRC(vs0, vs1, off, b, perm0, perm1, s){ \
+    vec_u8 vsrcCuc, vsrcDuc;                         \
+    vsrcCuc = vec_ld(off, s);                        \
+    if (loadSecond){                                 \
+        vsrcDuc = vec_ld(off + b, s);                \
+    } else                                           \
+        vsrcDuc = vsrcCuc;                           \
+                                                     \
+    vs0 = vec_perm(vsrcCuc, vsrcDuc, perm0);         \
+    if (reallyBadAlign){                             \
+        vs1 = vsrcDuc;                               \
+    } else                                           \
+        vs1 = vec_perm(vsrcCuc, vsrcDuc, perm1);     \
+}
+#else
+#define GET_VSRC1(vs0, off, b, perm0, s){ \
+    vs0 = vec_vsx_ld(off, s);             \
+}
+#define GET_VSRC(vs0, vs1, off, b, perm0, perm1, s){ \
+    vs0 = vec_vsx_ld(off, s);                        \
+    vs1 = vec_vsx_ld(off + 1, s);                    \
+}
+#endif /* HAVE_BIGENDIAN */
+
 #ifdef PREFIX_h264_chroma_mc8_altivec
 static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                            int stride, int h, int x, int y) {
@@ -80,23 +119,27 @@ static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                          ((x) * (y))};
     register int i;
     vec_u8 fperm;
-    const vec_s32 vABCD = vec_ld(0, ABCD);
-    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
-    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
-    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
-    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
     LOAD_ZERO;
+    const vec_s32 vABCD = vec_ld(0, ABCD);
+    const vec_s16 vA = VEC_SPLAT16(vABCD, 1);
+    const vec_s16 vB = VEC_SPLAT16(vABCD, 3);
+    const vec_s16 vC = VEC_SPLAT16(vABCD, 5);
+    const vec_s16 vD = VEC_SPLAT16(vABCD, 7);
     const vec_s16 v32ss = vec_sl(vec_splat_s16(1), vec_splat_u16(5));
     const vec_u16 v6us = vec_splat_u16(6);
-    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
-    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
 
-    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
+    vec_u8 vsrcperm0, vsrcperm1;
     vec_u8 vsrc0uc, vsrc1uc;
     vec_s16 vsrc0ssH, vsrc1ssH;
-    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
+    vec_u8 vsrc2uc, vsrc3uc;
     vec_s16 vsrc2ssH, vsrc3ssH, psum;
     vec_u8 vdst, ppsum, vfdst, fsum;
+#if HAVE_BIGENDIAN
+    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
+    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
+    vsrcperm0 = vec_lvsl(0, src);
+    vsrcperm1 = vec_lvsl(1, src);
+#endif
 
     if (((unsigned long)dst) % 16 == 0) {
         fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
@@ -110,89 +153,28 @@ static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                          0x1C, 0x1D, 0x1E, 0x1F};
     }
 
-    vsrcAuc = vec_ld(0, src);
-
-    if (loadSecond)
-        vsrcBuc = vec_ld(16, src);
-    vsrcperm0 = vec_lvsl(0, src);
-    vsrcperm1 = vec_lvsl(1, src);
-
-    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
-    if (reallyBadAlign)
-        vsrc1uc = vsrcBuc;
-    else
-        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
+    GET_VSRC(vsrc0uc, vsrc1uc, 0, 16, vsrcperm0, vsrcperm1, src);
 
-    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
-    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);
+    vsrc0ssH = (vec_s16)VEC_MERGEH(zero_u8v, (vec_u8)vsrc0uc);
+    vsrc1ssH = (vec_s16)VEC_MERGEH(zero_u8v, (vec_u8)vsrc1uc);
 
     if (ABCD[3]) {
-        if (!loadSecond) {// -> !reallyBadAlign
-            for (i = 0 ; i < h ; i++) {
-                vsrcCuc = vec_ld(stride + 0, src);
-                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
-                vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
-
-                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
-            }
-        } else {
-            vec_u8 vsrcDuc;
-            for (i = 0 ; i < h ; i++) {
-                vsrcCuc = vec_ld(stride + 0, src);
-                vsrcDuc = vec_ld(stride + 16, src);
-                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
-                if (reallyBadAlign)
-                    vsrc3uc = vsrcDuc;
-                else
-                    vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
-
-                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
-            }
+        for (i = 0 ; i < h ; i++) {
+            GET_VSRC(vsrc2uc, vsrc3uc, stride, 16, vsrcperm0, vsrcperm1, src);
+            CHROMA_MC8_ALTIVEC_CORE(v32ss, noop);
         }
     } else {
         const vec_s16 vE = vec_add(vB, vC);
         if (ABCD[2]) { // x == 0 B == 0
-            if (!loadSecond) {// -> !reallyBadAlign
-                for (i = 0 ; i < h ; i++) {
-                    vsrcCuc = vec_ld(stride + 0, src);
-                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
-                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
-
-                    vsrc0uc = vsrc1uc;
-                }
-            } else {
-                vec_u8 vsrcDuc;
-                for (i = 0 ; i < h ; i++) {
-                    vsrcCuc = vec_ld(stride + 0, src);
-                    vsrcDuc = vec_ld(stride + 15, src);
-                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
-                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
-
-                    vsrc0uc = vsrc1uc;
-                }
+            for (i = 0 ; i < h ; i++) {
+                GET_VSRC1(vsrc1uc, stride, 15, vsrcperm0, src);
+                CHROMA_MC8_ALTIVEC_CORE_SIMPLE;
+                vsrc0uc = vsrc1uc;
             }
         } else { // y == 0 C == 0
-            if (!loadSecond) {// -> !reallyBadAlign
-                for (i = 0 ; i < h ; i++) {
-                    vsrcCuc = vec_ld(0, src);
-                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
-                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
-                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
-                }
-            } else {
-                vec_u8 vsrcDuc;
-                for (i = 0 ; i < h ; i++) {
-                    vsrcCuc = vec_ld(0, src);
-                    vsrcDuc = vec_ld(15, src);
-                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
-                    if (reallyBadAlign)
-                        vsrc1uc = vsrcDuc;
-                    else
-                        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
-                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
-                }
+            for (i = 0 ; i < h ; i++) {
+                GET_VSRC(vsrc0uc, vsrc1uc, 0, 15, vsrcperm0, vsrcperm1, src);
+                CHROMA_MC8_ALTIVEC_CORE_SIMPLE;
             }
         }
     }
@@ -209,23 +191,27 @@ static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, i
                          ((x) * (y))};
     register int i;
     vec_u8 fperm;
-    const vec_s32 vABCD = vec_ld(0, ABCD);
-    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
-    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
-    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
-    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
     LOAD_ZERO;
+    const vec_s32 vABCD = vec_ld(0, ABCD);
+    const vec_s16 vA = VEC_SPLAT16(vABCD, 1);
+    const vec_s16 vB = VEC_SPLAT16(vABCD, 3);
+    const vec_s16 vC = VEC_SPLAT16(vABCD, 5);
+    const vec_s16 vD = VEC_SPLAT16(vABCD, 7);
     const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1), vec_splat_u16(5)), vec_splat_s16(4));
     const vec_u16 v6us = vec_splat_u16(6);
-    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
-    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
 
-    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
+    vec_u8 vsrcperm0, vsrcperm1;
     vec_u8 vsrc0uc, vsrc1uc;
     vec_s16 vsrc0ssH, vsrc1ssH;
-    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
+    vec_u8 vsrc2uc, vsrc3uc;
     vec_s16 vsrc2ssH, vsrc3ssH, psum;
     vec_u8 vdst, ppsum, vfdst, fsum;
+#if HAVE_BIGENDIAN
+    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
+    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
+    vsrcperm0 = vec_lvsl(0, src);
+    vsrcperm1 = vec_lvsl(1, src);
+#endif
 
     if (((unsigned long)dst) % 16 == 0) {
         fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
@@ -239,47 +225,14 @@ static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, i
                          0x1C, 0x1D, 0x1E, 0x1F};
     }
 
-    vsrcAuc = vec_ld(0, src);
-
-    if (loadSecond)
-        vsrcBuc = vec_ld(16, src);
-    vsrcperm0 = vec_lvsl(0, src);
-    vsrcperm1 = vec_lvsl(1, src);
-
-    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
-    if (reallyBadAlign)
-        vsrc1uc = vsrcBuc;
-    else
-        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
+    GET_VSRC(vsrc0uc, vsrc1uc, 0, 16, vsrcperm0, vsrcperm1, src);
 
-    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
-    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);
+    vsrc0ssH = (vec_s16)VEC_MERGEH(zero_u8v, (vec_u8)vsrc0uc);
+    vsrc1ssH = (vec_s16)VEC_MERGEH(zero_u8v, (vec_u8)vsrc1uc);
 
-    if (!loadSecond) {// -> !reallyBadAlign
-        for (i = 0 ; i < h ; i++) {
-            vsrcCuc = vec_ld(stride + 0, src);
-            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
-            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
-            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
-        }
-    } else {
-        vec_u8 vsrcDuc;
-        for (i = 0 ; i < h ; i++) {
-            vsrcCuc = vec_ld(stride + 0, src);
-            vsrcDuc = vec_ld(stride + 16, src);
-            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
-            if (reallyBadAlign)
-                vsrc3uc = vsrcDuc;
-            else
-                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
-            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
-        }
+    for (i = 0 ; i < h ; i++) {
+        GET_VSRC(vsrc2uc, vsrc3uc, stride, 16, vsrcperm0, vsrcperm1, src);
+        CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28);
     }
 }
 #endif
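Note on the GET_VSRC/GET_VSRC1 macros introduced above: they encapsulate the classic AltiVec misaligned-load idiom that the removed loop bodies spelled out inline. The following is a minimal standalone sketch of the two strategies, not part of the patch; the helper names are hypothetical, and it assumes <altivec.h> on a PowerPC compiler (with VSX enabled for the little-endian path):

    #include <altivec.h>

    typedef vector unsigned char vec_u8;

    /* Big-endian AltiVec has no unaligned vector load: fetch the two
     * aligned 16-byte blocks straddling s and splice them with a
     * vec_lvsl-generated permute.  Using 15 rather than 16 as the
     * second offset (as some GET_VSRC1/GET_VSRC call sites do) keeps
     * the second load in bounds when fewer trailing bytes are valid. */
    static vec_u8 load_unaligned_be(const unsigned char *s)
    {
        vec_u8 lo   = vec_ld(0, s);    /* aligned block containing s      */
        vec_u8 hi   = vec_ld(15, s);   /* aligned block containing s + 15 */
        vec_u8 perm = vec_lvsl(0, s);  /* byte rotation = s % 16          */
        return vec_perm(lo, hi, perm);
    }

    /* Little-endian POWER with VSX: vec_vsx_ld handles any alignment
     * directly, which is why the #else branch of the macros is a single
     * load and needs no permute vectors. */
    static vec_u8 load_unaligned_le(const unsigned char *s)
    {
        return vec_vsx_ld(0, s);
    }

This also explains why loadSecond and reallyBadAlign now exist only under HAVE_BIGENDIAN: when (src % 16) <= 7, the nine source bytes an 8-pixel bilinear row needs fit inside the first aligned block and the second vec_ld can be skipped, and when src % 16 == 15, vec_lvsl(1, src) wraps to a zero shift, so vec_perm would return the first block and vs1 must instead be taken from vsrcDuc directly.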