@ -34,7 +34,9 @@ static int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size
int i ;
int s ;
const vector unsigned char zero = ( const vector unsigned char ) vec_splat_u8 ( 0 ) ;
vector unsigned char * tv ;
vector unsigned char perm1 = vec_lvsl ( 0 , pix2 ) ;
vector unsigned char perm2 = vec_add ( perm1 , vec_splat_u8 ( 1 ) ) ;
vector unsigned char pix2l , pix2r ;
vector unsigned char pix1v , pix2v , pix2iv , avgv , t5 ;
vector unsigned int sad ;
vector signed int sumdiffs ;
@ -45,14 +47,11 @@ static int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size
/* Read unaligned pixels into our vectors. The vectors are as follows:
   pix1v: pix1[0]-pix1[15]
   pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16] */
tv = ( vector unsigned char * ) pix1 ;
pix1v = vec_perm ( tv [ 0 ] , tv [ 1 ] , vec_lvsl ( 0 , pix1 ) ) ;
tv = ( vector unsigned char * ) & pix2 [ 0 ] ;
pix2v = vec_perm ( tv [ 0 ] , tv [ 1 ] , vec_lvsl ( 0 , & pix2 [ 0 ] ) ) ;
tv = ( vector unsigned char * ) & pix2 [ 1 ] ;
pix2iv = vec_perm ( tv [ 0 ] , tv [ 1 ] , vec_lvsl ( 0 , & pix2 [ 1 ] ) ) ;
pix1v = vec_ld ( 0 , pix1 ) ;
pix2l = vec_ld ( 0 , pix2 ) ;
pix2r = vec_ld ( 16 , pix2 ) ;
pix2v = vec_perm ( pix2l , pix2r , perm1 ) ;
pix2iv = vec_perm ( pix2l , pix2r , perm2 ) ;
/* Calculate the average vector */
avgv = vec_avg ( pix2v , pix2iv ) ;
@ -79,7 +78,8 @@ static int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size
int i ;
int s ;
const vector unsigned char zero = ( const vector unsigned char ) vec_splat_u8 ( 0 ) ;
vector unsigned char * tv ;
vector unsigned char perm = vec_lvsl ( 0 , pix2 ) ;
vector unsigned char pix2l , pix2r ;
vector unsigned char pix1v , pix2v , pix3v , avgv , t5 ;
vector unsigned int sad ;
vector signed int sumdiffs ;
@ -95,18 +95,19 @@ static int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size
Read unaligned pixels into our vectors. The vectors are as follows:
pix2v: pix2[0]-pix2[15]
Split the pixel vectors into shorts */
tv = ( vector unsigned char * ) & pix2 [ 0 ] ;
pix2v = vec_perm ( tv [ 0 ] , tv [ 1 ] , vec_lvsl ( 0 , & pix2 [ 0 ] ) ) ;
pix2l = vec_ld ( 0 , pix2 ) ;
pix2r = vec_ld ( 15 , pix2 ) ;
pix2v = vec_perm ( pix2l , pix2r , perm ) ;
for ( i = 0 ; i < h ; i + + ) {
/* Read unaligned pixels into our vectors. The vectors are as follows:
   pix1v: pix1[0]-pix1[15]
   pix3v: pix3[0]-pix3[15] */
tv = ( vector unsigned char * ) pix1 ;
pix1v = vec_perm ( tv [ 0 ] , tv [ 1 ] , vec_lvsl ( 0 , pix1 ) ) ;
pix1v = vec_ld ( 0 , pix1 ) ;
tv = ( vector unsigned char * ) & pix3 [ 0 ] ;
pix3v = vec_perm ( tv [ 0 ] , tv [ 1 ] , vec_lvsl ( 0 , & pix3 [ 0 ] ) ) ;
pix2l = vec_ld ( 0 , pix3 ) ;
pix2r = vec_ld ( 15 , pix3 ) ;
pix3v = vec_perm ( pix2l , pix2r , perm ) ;
/* Calculate the average vector */
avgv = vec_avg ( pix2v , pix3v ) ;
@ -137,7 +138,10 @@ static int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_siz
uint8_t * pix3 = pix2 + line_size ;
const vector unsigned char zero = ( const vector unsigned char ) vec_splat_u8 ( 0 ) ;
const vector unsigned short two = ( const vector unsigned short ) vec_splat_u16 ( 2 ) ;
vector unsigned char * tv , avgv , t5 ;
vector unsigned char avgv , t5 ;
vector unsigned char perm1 = vec_lvsl ( 0 , pix2 ) ;
vector unsigned char perm2 = vec_add ( perm1 , vec_splat_u8 ( 1 ) ) ;
vector unsigned char pix2l , pix2r ;
vector unsigned char pix1v , pix2v , pix3v , pix2iv , pix3iv ;
vector unsigned short pix2lv , pix2hv , pix2ilv , pix2ihv ;
vector unsigned short pix3lv , pix3hv , pix3ilv , pix3ihv ;
@ -157,11 +161,10 @@ static int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_siz
Read unaligned pixels into our vectors. The vectors are as follows:
pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16]
Split the pixel vectors into shorts */
tv = ( vector unsigned char * ) & pix2 [ 0 ] ;
pix2v = vec_perm ( tv [ 0 ] , tv [ 1 ] , vec_lvsl ( 0 , & pix2 [ 0 ] ) ) ;
tv = ( vector unsigned char * ) & pix2 [ 1 ] ;
pix2iv = vec_perm ( tv [ 0 ] , tv [ 1 ] , vec_lvsl ( 0 , & pix2 [ 1 ] ) ) ;
pix2l = vec_ld ( 0 , pix2 ) ;
pix2r = vec_ld ( 16 , pix2 ) ;
pix2v = vec_perm ( pix2l , pix2r , perm1 ) ;
pix2iv = vec_perm ( pix2l , pix2r , perm2 ) ;
pix2hv = ( vector unsigned short ) vec_mergeh ( zero , pix2v ) ;
pix2lv = ( vector unsigned short ) vec_mergel ( zero , pix2v ) ;
@ -174,14 +177,12 @@ static int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_siz
/* Read unaligned pixels into our vectors. The vectors are as follows:
   pix1v: pix1[0]-pix1[15]
   pix3v: pix3[0]-pix3[15]      pix3iv: pix3[1]-pix3[16] */
tv = ( vector unsigned char * ) pix1 ;
pix1v = vec_perm ( tv [ 0 ] , tv [ 1 ] , vec_lvsl ( 0 , pix1 ) ) ;
tv = ( vector unsigned char * ) & pix3 [ 0 ] ;
pix3v = vec_perm ( tv [ 0 ] , tv [ 1 ] , vec_lvsl ( 0 , & pix3 [ 0 ] ) ) ;
pix1v = vec_ld ( 0 , pix1 ) ;
tv = ( vector unsigned char * ) & pix3 [ 1 ] ;
pix3iv = vec_perm ( tv [ 0 ] , tv [ 1 ] , vec_lvsl ( 0 , & pix3 [ 1 ] ) ) ;
pix2l = vec_ld ( 0 , pix3 ) ;
pix2r = vec_ld ( 16 , pix3 ) ;
pix3v = vec_perm ( pix2l , pix2r , perm1 ) ;
pix3iv = vec_perm ( pix2l , pix2r , perm2 ) ;
/* Note that AltiVec does have vec_avg, but this works on vector pairs
   and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding
@ -230,7 +231,7 @@ static int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, i
int i ;
int s ;
const vector unsigned int zero = ( const vector unsigned int ) vec_splat_u32 ( 0 ) ;
vector unsigned char perm1 , perm2 , pix1v_low , pix1v_high , pix2v_low , pix2v_high ;
vector unsigned char perm = vec_lvsl ( 0 , pix2 ) ;
vector unsigned char t1 , t2 , t3 , t4 , t5 ;
vector unsigned int sad ;
vector signed int sumdiffs ;
@ -240,14 +241,10 @@ static int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, i
for ( i = 0 ; i < h ; i + + ) {
/* Read potentially unaligned pixels into t1 and t2 */
perm1 = vec_lvsl ( 0 , pix1 ) ;
pix1v_high = vec_ld ( 0 , pix1 ) ;
pix1v_low = vec_ld ( 15 , pix1 ) ;
perm2 = vec_lvsl ( 0 , pix2 ) ;
pix2v_high = vec_ld ( 0 , pix2 ) ;
pix2v_low = vec_ld ( 15 , pix2 ) ;
t1 = vec_perm ( pix1v_high , pix1v_low , perm1 ) ;
t2 = vec_perm ( pix2v_high , pix2v_low , perm2 ) ;
vector unsigned char pix2l = vec_ld ( 0 , pix2 ) ;
vector unsigned char pix2r = vec_ld ( 15 , pix2 ) ;
t1 = vec_ld ( 0 , pix1 ) ;
t2 = vec_perm ( pix2l , pix2r , perm ) ;
/* Calculate a sum of abs differences vector */
t3 = vec_max ( t1 , t2 ) ;
@ -274,25 +271,25 @@ static int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, in
int i ;
int s ;
const vector unsigned int zero = ( const vector unsigned int ) vec_splat_u32 ( 0 ) ;
vector unsigned char perm1 , perm2 , permclear , * pix1v , * pix2v ;
const vector unsigned char permclear = ( vector unsigned char ) { 255 , 255 , 255 , 255 , 255 , 255 , 255 , 255 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ;
vector unsigned char perm1 = vec_lvsl ( 0 , pix1 ) ;
vector unsigned char perm2 = vec_lvsl ( 0 , pix2 ) ;
vector unsigned char t1 , t2 , t3 , t4 , t5 ;
vector unsigned int sad ;
vector signed int sumdiffs ;
sad = ( vector unsigned int ) vec_splat_u32 ( 0 ) ;
permclear = ( vector unsigned char ) { 255 , 255 , 255 , 255 , 255 , 255 , 255 , 255 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ;
for ( i = 0 ; i < h ; i + + ) {
/* Read potentially unaligned pixels into t1 and t2
   Since we're reading 16 pixels, and actually only want 8,
   mask out the last 8 pixels. The 0s don't change the sum. */
perm1 = vec_lvsl ( 0 , pix1 ) ;
pix1v = ( vector unsigned char * ) pix1 ;
perm2 = vec_lvsl ( 0 , pix2 ) ;
pix2v = ( vector unsigned char * ) pix2 ;
t1 = vec_and ( vec_perm ( pix1v [ 0 ] , pix1v [ 1 ] , perm1 ) , permclear ) ;
t2 = vec_and ( vec_perm ( pix2v [ 0 ] , pix2v [ 1 ] , perm2 ) , permclear ) ;
vector unsigned char pix1l = vec_ld ( 0 , pix1 ) ;
vector unsigned char pix1r = vec_ld ( 15 , pix1 ) ;
vector unsigned char pix2l = vec_ld ( 0 , pix2 ) ;
vector unsigned char pix2r = vec_ld ( 15 , pix2 ) ;
t1 = vec_and ( vec_perm ( pix1l , pix1r , perm1 ) , permclear ) ;
t2 = vec_and ( vec_perm ( pix2l , pix2r , perm2 ) , permclear ) ;
/* Calculate a sum of abs differences vector */
t3 = vec_max ( t1 , t2 ) ;
@ -319,7 +316,7 @@ static int pix_norm1_altivec(uint8_t *pix, int line_size)
int i ;
int s ;
const vector unsigned int zero = ( const vector unsigned int ) vec_splat_u32 ( 0 ) ;
vector unsigned char * tv ;
vector unsigned char perm = vec_lvsl ( 0 , pix ) ;
vector unsigned char pixv ;
vector unsigned int sv ;
vector signed int sum ;
@ -329,8 +326,9 @@ static int pix_norm1_altivec(uint8_t *pix, int line_size)
s = 0 ;
for ( i = 0 ; i < 16 ; i + + ) {
/* Read in the potentially unaligned pixels */
tv = ( vector unsigned char * ) pix ;
pixv = vec_perm ( tv [ 0 ] , tv [ 1 ] , vec_lvsl ( 0 , pix ) ) ;
vector unsigned char pixl = vec_ld ( 0 , pix ) ;
vector unsigned char pixr = vec_ld ( 15 , pix ) ;
pixv = vec_perm ( pixl , pixr , perm ) ;
/* Square the values, and add them to our sum */
sv = vec_msum ( pixv , pixv , sv ) ;
@ -355,26 +353,25 @@ static int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, in
int i ;
int s ;
const vector unsigned int zero = ( const vector unsigned int ) vec_splat_u32 ( 0 ) ;
vector unsigned char perm1 , perm2 , permclear , * pix1v , * pix2v ;
const vector unsigned char permclear = ( vector unsigned char ) { 255 , 255 , 255 , 255 , 255 , 255 , 255 , 255 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ;
vector unsigned char perm1 = vec_lvsl ( 0 , pix1 ) ;
vector unsigned char perm2 = vec_lvsl ( 0 , pix2 ) ;
vector unsigned char t1 , t2 , t3 , t4 , t5 ;
vector unsigned int sum ;
vector signed int sumsqr ;
sum = ( vector unsigned int ) vec_splat_u32 ( 0 ) ;
permclear = ( vector unsigned char ) { 255 , 255 , 255 , 255 , 255 , 255 , 255 , 255 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ;
for ( i = 0 ; i < h ; i + + ) {
/* Read potentially unaligned pixels into t1 and t2
   Since we're reading 16 pixels, and actually only want 8,
   mask out the last 8 pixels. The 0s don't change the sum. */
perm1 = vec_lvsl ( 0 , pix1 ) ;
pix1v = ( vector unsigned char * ) pix1 ;
perm2 = vec_lvsl ( 0 , pix2 ) ;
pix2v = ( vector unsigned char * ) pix2 ;
t1 = vec_and ( vec_perm ( pix1v [ 0 ] , pix1v [ 1 ] , perm1 ) , permclear ) ;
t2 = vec_and ( vec_perm ( pix2v [ 0 ] , pix2v [ 1 ] , perm2 ) , permclear ) ;
vector unsigned char pix1l = vec_ld ( 0 , pix1 ) ;
vector unsigned char pix1r = vec_ld ( 15 , pix1 ) ;
vector unsigned char pix2l = vec_ld ( 0 , pix2 ) ;
vector unsigned char pix2r = vec_ld ( 15 , pix2 ) ;
t1 = vec_and ( vec_perm ( pix1l , pix1r , perm1 ) , permclear ) ;
t2 = vec_and ( vec_perm ( pix2l , pix2r , perm2 ) , permclear ) ;
/* Since we want to use unsigned chars, we can take advantage
   of the fact that abs(a-b)^2 = (a-b)^2. */
@ -409,7 +406,7 @@ static int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, i
int i ;
int s ;
const vector unsigned int zero = ( const vector unsigned int ) vec_splat_u32 ( 0 ) ;
vector unsigned char perm1 , perm2 , * pix1v , * pix2v ;
vector unsigned char perm = vec_lvsl ( 0 , pix2 ) ;
vector unsigned char t1 , t2 , t3 , t4 , t5 ;
vector unsigned int sum ;
vector signed int sumsqr ;
@ -418,12 +415,10 @@ static int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, i
for ( i = 0 ; i < h ; i + + ) {
/* Read potentially unaligned pixels into t1 and t2 */
perm1 = vec_lvsl ( 0 , pix1 ) ;
pix1v = ( vector unsigned char * ) pix1 ;
perm2 = vec_lvsl ( 0 , pix2 ) ;
pix2v = ( vector unsigned char * ) pix2 ;
t1 = vec_perm ( pix1v [ 0 ] , pix1v [ 1 ] , perm1 ) ;
t2 = vec_perm ( pix2v [ 0 ] , pix2v [ 1 ] , perm2 ) ;
vector unsigned char pix2l = vec_ld ( 0 , pix2 ) ;
vector unsigned char pix2r = vec_ld ( 15 , pix2 ) ;
t1 = vec_ld ( 0 , pix1 ) ;
t2 = vec_perm ( pix2l , pix2r , perm ) ;
/* Since we want to use unsigned chars, we can take advantage
   of the fact that abs(a-b)^2 = (a-b)^2. */
@ -451,7 +446,7 @@ static int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, i
static int pix_sum_altivec ( uint8_t * pix , int line_size )
{
const vector unsigned int zero = ( const vector unsigned int ) vec_splat_u32 ( 0 ) ;
vector unsigned char perm , * pixv ;
vector unsigned char perm = vec_lvsl ( 0 , pix ) ;
vector unsigned char t1 ;
vector unsigned int sad ;
vector signed int sumdiffs ;
@ -463,9 +458,9 @@ static int pix_sum_altivec(uint8_t * pix, int line_size)
for ( i = 0 ; i < 16 ; i + + ) {
/* Read the potentially unaligned 16 pixels into t1 */
perm = vec_lvsl ( 0 , pix ) ;
pixv = ( vector unsigned char * ) pix ;
t1 = vec_perm ( pixv [ 0 ] , pixv [ 1 ] , perm ) ;
vector unsigned char pixl = vec_ld ( 0 , pix ) ;
vector unsigned char pixr = vec_ld ( 15 , pix ) ;
t1 = vec_perm ( pixl , pixr , perm ) ;
/* Add each 4 pixel group together and put 4 results into sad */
sad = vec_sum4s ( t1 , sad ) ;
@ -484,7 +479,8 @@ static int pix_sum_altivec(uint8_t * pix, int line_size)
static void get_pixels_altivec ( DCTELEM * restrict block , const uint8_t * pixels , int line_size )
{
int i ;
vector unsigned char perm , bytes , * pixv ;
vector unsigned char perm = vec_lvsl ( 0 , pixels ) ;
vector unsigned char bytes ;
const vector unsigned char zero = ( const vector unsigned char ) vec_splat_u8 ( 0 ) ;
vector signed short shorts ;
@ -492,9 +488,9 @@ static void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, i
// Read potentially unaligned pixels.
// We're reading 16 pixels, and actually only want 8,
// but we simply ignore the extras.
perm = vec_lvsl ( 0 , pixels ) ;
pixv = ( vector unsigned char * ) pixels ;
bytes = vec_perm ( pixv [ 0 ] , pixv [ 1 ] , perm ) ;
vector unsigned char pixl = vec_ld ( 0 , pixels ) ;
vector unsigned char pixr = vec_ld ( 15 , pixels ) ;
bytes = vec_perm ( pixl , pixr , perm ) ;
// convert the bytes into shorts
shorts = ( vector signed short ) vec_mergeh ( zero , bytes ) ;
@ -510,7 +506,9 @@ static void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
const uint8_t * s2 , int stride )
{
int i ;
vector unsigned char perm , bytes , * pixv ;
vector unsigned char perm1 = vec_lvsl ( 0 , s1 ) ;
vector unsigned char perm2 = vec_lvsl ( 0 , s2 ) ;
vector unsigned char bytes , pixl , pixr ;
const vector unsigned char zero = ( const vector unsigned char ) vec_splat_u8 ( 0 ) ;
vector signed short shorts1 , shorts2 ;
@ -518,17 +516,17 @@ static void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
// Read potentially unaligned pixels
// We're reading 16 pixels, and actually only want 8,
// but we simply ignore the extras.
perm = vec_lvsl ( 0 , s1 ) ;
pixv = ( vector unsigned char * ) s1 ;
bytes = vec_perm ( pixv [ 0 ] , pixv [ 1 ] , perm ) ;
pixl = vec_ld ( 0 , s1 ) ;
pixr = vec_ld ( 15 , s1 ) ;
bytes = vec_perm ( pixl , pixr , perm1 ) ;
// convert the bytes into shorts
shorts1 = ( vector signed short ) vec_mergeh ( zero , bytes ) ;
// Do the same for the second block of pixels
perm = vec_lvsl ( 0 , s2 ) ;
pixv = ( vector unsigned char * ) s2 ;
bytes = vec_perm ( pixv [ 0 ] , pixv [ 1 ] , perm ) ;
pixl = vec_ld ( 0 , s2 ) ;
pixr = vec_ld ( 15 , s2 ) ;
bytes = vec_perm ( pixl , pixr , perm2 ) ;
// convert the bytes into shorts
shorts2 = ( vector signed short ) vec_mergeh ( zero , bytes ) ;
@ -550,17 +548,17 @@ static void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
// Read potentially unaligned pixels
// We're reading 16 pixels, and actually only want 8,
// but we simply ignore the extras.
perm = vec_lvsl ( 0 , s1 ) ;
pixv = ( vector unsigned char * ) s1 ;
bytes = vec_perm ( pixv [ 0 ] , pixv [ 1 ] , perm ) ;
pixl = vec_ld ( 0 , s1 ) ;
pixr = vec_ld ( 15 , s1 ) ;
bytes = vec_perm ( pixl , pixr , perm1 ) ;
// convert the bytes into shorts
shorts1 = ( vector signed short ) vec_mergeh ( zero , bytes ) ;
// Do the same for the second block of pixels
perm = vec_lvsl ( 0 , s2 ) ;
pixv = ( vector unsigned char * ) s2 ;
bytes = vec_perm ( pixv [ 0 ] , pixv [ 1 ] , perm ) ;
pixl = vec_ld ( 0 , s2 ) ;
pixr = vec_ld ( 15 , s2 ) ;
bytes = vec_perm ( pixl , pixr , perm2 ) ;
// convert the bytes into shorts
shorts2 = ( vector signed short ) vec_mergeh ( zero , bytes ) ;