@@ -35,34 +35,30 @@
 static int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2,
                             int line_size, int h)
 {
-    int i;
-    int s;
+    int i, s = 0;
     const vector unsigned char zero =
         (const vector unsigned char) vec_splat_u8(0);
     vector unsigned char perm1 = vec_lvsl(0, pix2);
    vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
-    vector unsigned char pix2l, pix2r;
-    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
-    vector unsigned int sad;
+    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
     vector signed int sumdiffs;
-    s = 0;
-    sad = (vector unsigned int) vec_splat_u32(0);
     for (i = 0; i < h; i++) {
         /* Read unaligned pixels into our vectors. The vectors are as follows:
          * pix1v: pix1[0] - pix1[15]
          * pix2v: pix2[0] - pix2[15]  pix2iv: pix2[1] - pix2[16] */
-        pix1v  = vec_ld(0,  pix1);
-        pix2l  = vec_ld(0,  pix2);
-        pix2r  = vec_ld(16, pix2);
-        pix2v  = vec_perm(pix2l, pix2r, perm1);
-        pix2iv = vec_perm(pix2l, pix2r, perm2);
+        vector unsigned char pix1v  = vec_ld(0,  pix1);
+        vector unsigned char pix2l  = vec_ld(0,  pix2);
+        vector unsigned char pix2r  = vec_ld(16, pix2);
+        vector unsigned char pix2v  = vec_perm(pix2l, pix2r, perm1);
+        vector unsigned char pix2iv = vec_perm(pix2l, pix2r, perm2);
         /* Calculate the average vector. */
-        avgv = vec_avg(pix2v, pix2iv);
+        vector unsigned char avgv = vec_avg(pix2v, pix2iv);
         /* Calculate a sum of abs differences vector. */
-        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
+        vector unsigned char t5 = vec_sub(vec_max(pix1v, avgv),
+                                          vec_min(pix1v, avgv));
         /* Add each 4 pixel group together and put 4 results into sad. */
         sad = vec_sum4s(t5, sad);
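For reference, the computation this function vectorizes is the SAD of pix1 against the horizontal half-pel interpolation of pix2; vec_avg is a rounding average, hence the "+ 1" below. A scalar sketch (hypothetical helper, not part of this patch):

    #include <stdint.h>
    #include <stdlib.h>

    /* Scalar sketch of sad16_x2: SAD of pix1 against the rounded average
     * of each pix2 byte and its right-hand neighbour. */
    static int sad16_x2_c(const uint8_t *pix1, const uint8_t *pix2,
                          int line_size, int h)
    {
        int s = 0;
        for (int i = 0; i < h; i++) {
            for (int j = 0; j < 16; j++)
                s += abs(pix1[j] - ((pix2[j] + pix2[j + 1] + 1) >> 1));
            pix1 += line_size;
            pix2 += line_size;
        }
        return s;
    }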
@@ -81,20 +77,15 @@ static int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2,
 static int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2,
                             int line_size, int h)
 {
-    int i;
-    int s;
+    int i, s = 0;
     const vector unsigned char zero =
         (const vector unsigned char) vec_splat_u8(0);
     vector unsigned char perm = vec_lvsl(0, pix2);
-    vector unsigned char pix2l, pix2r;
-    vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
-    vector unsigned int sad;
+    vector unsigned char pix1v, pix3v, avgv, t5;
+    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
     vector signed int sumdiffs;
     uint8_t *pix3 = pix2 + line_size;
-    s = 0;
-    sad = (vector unsigned int) vec_splat_u32(0);
     /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one
      * iteration becomes pix2 in the next iteration. We can use this
      * fact to avoid a potentially expensive unaligned read, each
@@ -102,9 +93,9 @@ static int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2,
      * Read unaligned pixels into our vectors. The vectors are as follows:
      * pix2v: pix2[0] - pix2[15]
      * Split the pixel vectors into shorts. */
-    pix2l = vec_ld(0,  pix2);
-    pix2r = vec_ld(15, pix2);
-    pix2v = vec_perm(pix2l, pix2r, perm);
+    vector unsigned char pix2l = vec_ld(0,  pix2);
+    vector unsigned char pix2r = vec_ld(15, pix2);
+    vector unsigned char pix2v = vec_perm(pix2l, pix2r, perm);
     for (i = 0; i < h; i++) {
         /* Read unaligned pixels into our vectors. The vectors are as follows:
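All of these functions lean on the same AltiVec unaligned-load idiom: vec_ld only ever loads from 16-byte-aligned addresses, so two aligned loads bracket the wanted bytes and vec_perm with a vec_lvsl() mask splices them together. A minimal sketch of the idiom (hypothetical helper; the patch inlines the pattern). Note the offset of the second load: 15 suffices when only bytes p[0]..p[15] are needed, while the x2 variants load at offset 16 because they also read pix2[16].

    #include <stdint.h>
    #include <altivec.h>

    static inline vector unsigned char load_unaligned(const uint8_t *p)
    {
        vector unsigned char l = vec_ld(0,  p);  /* aligned block holding p      */
        vector unsigned char r = vec_ld(15, p);  /* aligned block holding p + 15 */
        return vec_perm(l, r, vec_lvsl(0, p));   /* splice out the 16 wanted bytes */
    }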
@@ -140,8 +131,7 @@ static int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2,
 static int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2,
                              int line_size, int h)
 {
-    int i;
-    int s;
+    int i, s = 0;
     uint8_t *pix3 = pix2 + line_size;
     const vector unsigned char zero =
         (const vector unsigned char) vec_splat_u8(0);
@@ -150,19 +140,12 @@ static int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2,
     vector unsigned char avgv, t5;
     vector unsigned char perm1 = vec_lvsl(0, pix2);
     vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
-    vector unsigned char pix2l, pix2r;
-    vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
-    vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
+    vector unsigned char pix1v, pix3v, pix3iv;
     vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
     vector unsigned short avghv, avglv;
-    vector unsigned short t1, t2, t3, t4;
-    vector unsigned int sad;
+    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
     vector signed int sumdiffs;
-    sad = (vector unsigned int) vec_splat_u32(0);
-    s = 0;
     /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one
      * iteration becomes pix2 in the next iteration. We can use this
      * fact to avoid a potentially expensive unaligned read, as well
@@ -170,17 +153,22 @@ static int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2,
      * Read unaligned pixels into our vectors. The vectors are as follows:
      * pix2v: pix2[0] - pix2[15]  pix2iv: pix2[1] - pix2[16]
      * Split the pixel vectors into shorts. */
-    pix2l  = vec_ld(0,  pix2);
-    pix2r  = vec_ld(16, pix2);
-    pix2v  = vec_perm(pix2l, pix2r, perm1);
-    pix2iv = vec_perm(pix2l, pix2r, perm2);
-    pix2hv  = (vector unsigned short) vec_mergeh(zero, pix2v);
-    pix2lv  = (vector unsigned short) vec_mergel(zero, pix2v);
-    pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
-    pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
-    t1 = vec_add(pix2hv, pix2ihv);
-    t2 = vec_add(pix2lv, pix2ilv);
+    vector unsigned char pix2l  = vec_ld(0,  pix2);
+    vector unsigned char pix2r  = vec_ld(16, pix2);
+    vector unsigned char pix2v  = vec_perm(pix2l, pix2r, perm1);
+    vector unsigned char pix2iv = vec_perm(pix2l, pix2r, perm2);
+    vector unsigned short pix2hv =
+        (vector unsigned short) vec_mergeh(zero, pix2v);
+    vector unsigned short pix2lv =
+        (vector unsigned short) vec_mergel(zero, pix2v);
+    vector unsigned short pix2ihv =
+        (vector unsigned short) vec_mergeh(zero, pix2iv);
+    vector unsigned short pix2ilv =
+        (vector unsigned short) vec_mergel(zero, pix2iv);
+    vector unsigned short t1 = vec_add(pix2hv, pix2ihv);
+    vector unsigned short t2 = vec_add(pix2lv, pix2ilv);
+    vector unsigned short t3, t4;
     for (i = 0; i < h; i++) {
         /* Read unaligned pixels into our vectors. The vectors are as follows:
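The 16-bit arithmetic above implements the two-dimensional half-pel average: t1/t2 cache the horizontal pair sums of the current pix2 row, so each loop iteration only has to compute the pix3 sums before rounding. Per pixel, the value being computed is, as a scalar sketch (hypothetical helper, not part of the patch):

    #include <stdint.h>

    /* One xy2 half-pel sample; the '+ 2' gives correct rounding for '>> 2'. */
    static inline int avg_xy2(const uint8_t *pix2, const uint8_t *pix3, int j)
    {
        return (pix2[j] + pix2[j + 1] + pix3[j] + pix3[j + 1] + 2) >> 2;
    }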
@@ -238,28 +226,24 @@ static int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2,
 static int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2,
                          int line_size, int h)
 {
-    int i;
-    int s;
+    int i, s;
     const vector unsigned int zero =
         (const vector unsigned int) vec_splat_u32(0);
     vector unsigned char perm = vec_lvsl(0, pix2);
-    vector unsigned char t1, t2, t3, t4, t5;
-    vector unsigned int sad;
+    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
     vector signed int sumdiffs;
-    sad = (vector unsigned int) vec_splat_u32(0);
     for (i = 0; i < h; i++) {
         /* Read potentially unaligned pixels into t1 and t2. */
         vector unsigned char pix2l = vec_ld(0,  pix2);
         vector unsigned char pix2r = vec_ld(15, pix2);
-        t1 = vec_ld(0, pix1);
-        t2 = vec_perm(pix2l, pix2r, perm);
+        vector unsigned char t1 = vec_ld(0, pix1);
+        vector unsigned char t2 = vec_perm(pix2l, pix2r, perm);
         /* Calculate a sum of abs differences vector. */
-        t3 = vec_max(t1, t2);
-        t4 = vec_min(t1, t2);
-        t5 = vec_sub(t3, t4);
+        vector unsigned char t3 = vec_max(t1, t2);
+        vector unsigned char t4 = vec_min(t1, t2);
+        vector unsigned char t5 = vec_sub(t3, t4);
         /* Add each 4 pixel group together and put 4 results into sad. */
         sad = vec_sum4s(t5, sad);
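The loop tails fall between hunks, but the declarations of sumdiffs and zero point at the usual AltiVec horizontal-reduction pattern. A sketch of that tail (assumed, not shown in this patch):

    /* Fold the four 32-bit partial sums in 'sad' into the scalar 's'. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3); /* vec_sums leaves the total in element 3 */
    vec_ste(sumdiffs, 0, &s);          /* store that element to the scalar */
    return s;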
@@ -279,8 +263,7 @@ static int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2,
 static int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2,
                         int line_size, int h)
 {
-    int i;
-    int s;
+    int i, s;
     const vector unsigned int zero =
         (const vector unsigned int) vec_splat_u32(0);
     const vector unsigned char permclear =
@@ -288,12 +271,9 @@ static int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2,
         { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 };
     vector unsigned char perm1 = vec_lvsl(0, pix1);
     vector unsigned char perm2 = vec_lvsl(0, pix2);
-    vector unsigned char t1, t2, t3, t4, t5;
-    vector unsigned int sad;
+    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
     vector signed int sumdiffs;
-    sad = (vector unsigned int) vec_splat_u32(0);
     for (i = 0; i < h; i++) {
         /* Read potentially unaligned pixels into t1 and t2.
          * Since we're reading 16 pixels, and actually only want 8,
@@ -302,13 +282,15 @@ static int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2,
         vector unsigned char pix1r = vec_ld(7, pix1);
         vector unsigned char pix2l = vec_ld(0, pix2);
         vector unsigned char pix2r = vec_ld(7, pix2);
-        t1 = vec_and(vec_perm(pix1l, pix1r, perm1), permclear);
-        t2 = vec_and(vec_perm(pix2l, pix2r, perm2), permclear);
+        vector unsigned char t1 = vec_and(vec_perm(pix1l, pix1r, perm1),
+                                          permclear);
+        vector unsigned char t2 = vec_and(vec_perm(pix2l, pix2r, perm2),
+                                          permclear);
         /* Calculate a sum of abs differences vector. */
-        t3 = vec_max(t1, t2);
-        t4 = vec_min(t1, t2);
-        t5 = vec_sub(t3, t4);
+        vector unsigned char t3 = vec_max(t1, t2);
+        vector unsigned char t4 = vec_min(t1, t2);
+        vector unsigned char t5 = vec_sub(t3, t4);
         /* Add each 4 pixel group together and put 4 results into sad. */
         sad = vec_sum4s(t5, sad);
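In the 8-pixel variants, the loads still bring in 16 bytes, but the vec_and with permclear zeroes lanes 8-15 so they contribute nothing to vec_sum4s; the second loads also use offset 7 since only bytes p[0]..p[7] are wanted. Scalar equivalent of one row (illustrative only, not part of the patch):

    #include <stdint.h>
    #include <stdlib.h>

    static int sad8_row_c(const uint8_t *pix1, const uint8_t *pix2)
    {
        int s = 0;
        for (int j = 0; j < 8; j++)  /* lanes 8-15 are masked off by permclear */
            s += abs(pix1[j] - pix2[j]);
        return s;
    }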
@@ -327,23 +309,18 @@ static int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2,
 static int pix_norm1_altivec(uint8_t *pix, int line_size)
 {
-    int i;
-    int s;
+    int i, s = 0;
     const vector unsigned int zero =
         (const vector unsigned int) vec_splat_u32(0);
     vector unsigned char perm = vec_lvsl(0, pix);
-    vector unsigned char pixv;
-    vector unsigned int sv;
+    vector unsigned int sv = (vector unsigned int) vec_splat_u32(0);
     vector signed int sum;
-    sv = (vector unsigned int) vec_splat_u32(0);
-    s = 0;
     for (i = 0; i < 16; i++) {
         /* Read the potentially unaligned pixels. */
         vector unsigned char pixl = vec_ld(0,  pix);
         vector unsigned char pixr = vec_ld(15, pix);
-        pixv = vec_perm(pixl, pixr, perm);
+        vector unsigned char pixv = vec_perm(pixl, pixr, perm);
         /* Square the values, and add them to our sum. */
         sv = vec_msum(pixv, pixv, sv);
@@ -363,8 +340,7 @@ static int pix_norm1_altivec(uint8_t *pix, int line_size)
 static int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2,
                         int line_size, int h)
 {
-    int i;
-    int s;
+    int i, s;
     const vector unsigned int zero =
         (const vector unsigned int) vec_splat_u32(0);
     const vector unsigned char permclear =
@@ -372,12 +348,9 @@ static int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2,
         { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 };
     vector unsigned char perm1 = vec_lvsl(0, pix1);
     vector unsigned char perm2 = vec_lvsl(0, pix2);
-    vector unsigned char t1, t2, t3, t4, t5;
-    vector unsigned int sum;
+    vector unsigned int sum = (vector unsigned int) vec_splat_u32(0);
     vector signed int sumsqr;
-    sum = (vector unsigned int) vec_splat_u32(0);
     for (i = 0; i < h; i++) {
         /* Read potentially unaligned pixels into t1 and t2.
          * Since we're reading 16 pixels, and actually only want 8,
@@ -386,16 +359,18 @@ static int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2,
         vector unsigned char pix1r = vec_ld(7, pix1);
         vector unsigned char pix2l = vec_ld(0, pix2);
         vector unsigned char pix2r = vec_ld(7, pix2);
-        t1 = vec_and(vec_perm(pix1l, pix1r, perm1), permclear);
-        t2 = vec_and(vec_perm(pix2l, pix2r, perm2), permclear);
+        vector unsigned char t1 = vec_and(vec_perm(pix1l, pix1r, perm1),
+                                          permclear);
+        vector unsigned char t2 = vec_and(vec_perm(pix2l, pix2r, perm2),
+                                          permclear);
         /* Since we want to use unsigned chars, we can take advantage
          * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */
         /* Calculate abs differences vector. */
-        t3 = vec_max(t1, t2);
-        t4 = vec_min(t1, t2);
-        t5 = vec_sub(t3, t4);
+        vector unsigned char t3 = vec_max(t1, t2);
+        vector unsigned char t4 = vec_min(t1, t2);
+        vector unsigned char t5 = vec_sub(t3, t4);
         /* Square the values and add them to our sum. */
         sum = vec_msum(t5, t5, sum);
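As the comment says, for unsigned values |a - b|^2 equals (a - b)^2, so the unsigned difference obtained from vec_max/vec_min/vec_sub can be squared directly; vec_msum then multiplies the byte lanes pairwise and accumulates into four 32-bit sums. A scalar sketch of one 8-pixel row (hypothetical helper):

    #include <stdint.h>

    static int sse8_row_c(const uint8_t *pix1, const uint8_t *pix2)
    {
        int sum = 0;
        for (int j = 0; j < 8; j++) {
            int d = pix1[j] > pix2[j] ? pix1[j] - pix2[j]
                                      : pix2[j] - pix1[j];  /* max - min */
            sum += d * d;
        }
        return sum;
    }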
@@ -417,31 +392,27 @@ static int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2,
 static int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2,
                          int line_size, int h)
 {
-    int i;
-    int s;
+    int i, s;
     const vector unsigned int zero =
         (const vector unsigned int) vec_splat_u32(0);
     vector unsigned char perm = vec_lvsl(0, pix2);
-    vector unsigned char t1, t2, t3, t4, t5;
-    vector unsigned int sum;
+    vector unsigned int sum = (vector unsigned int) vec_splat_u32(0);
     vector signed int sumsqr;
-    sum = (vector unsigned int) vec_splat_u32(0);
     for (i = 0; i < h; i++) {
         /* Read potentially unaligned pixels into t1 and t2. */
         vector unsigned char pix2l = vec_ld(0,  pix2);
         vector unsigned char pix2r = vec_ld(15, pix2);
-        t1 = vec_ld(0, pix1);
-        t2 = vec_perm(pix2l, pix2r, perm);
+        vector unsigned char t1 = vec_ld(0, pix1);
+        vector unsigned char t2 = vec_perm(pix2l, pix2r, perm);
         /* Since we want to use unsigned chars, we can take advantage
          * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */
         /* Calculate abs differences vector. */
-        t3 = vec_max(t1, t2);
-        t4 = vec_min(t1, t2);
-        t5 = vec_sub(t3, t4);
+        vector unsigned char t3 = vec_max(t1, t2);
+        vector unsigned char t4 = vec_min(t1, t2);
+        vector unsigned char t5 = vec_sub(t3, t4);
         /* Square the values and add them to our sum. */
         sum = vec_msum(t5, t5, sum);
@@ -460,23 +431,18 @@ static int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2,
 static int pix_sum_altivec(uint8_t *pix, int line_size)
 {
+    int i, s;
     const vector unsigned int zero =
         (const vector unsigned int) vec_splat_u32(0);
     vector unsigned char perm = vec_lvsl(0, pix);
-    vector unsigned char t1;
-    vector unsigned int sad;
+    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
     vector signed int sumdiffs;
-    int i;
-    int s;
-    sad = (vector unsigned int) vec_splat_u32(0);
     for (i = 0; i < 16; i++) {
         /* Read the potentially unaligned 16 pixels into t1. */
         vector unsigned char pixl = vec_ld(0,  pix);
         vector unsigned char pixr = vec_ld(15, pix);
-        t1 = vec_perm(pixl, pixr, perm);
+        vector unsigned char t1 = vec_perm(pixl, pixr, perm);
         /* Add each 4 pixel group together and put 4 results into sad. */
         sad = vec_sum4s(t1, sad);
@@ -497,10 +463,8 @@ static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels,
 {
     int i;
     vector unsigned char perm = vec_lvsl(0, pixels);
-    vector unsigned char bytes;
     const vector unsigned char zero =
         (const vector unsigned char) vec_splat_u8(0);
-    vector signed short shorts;
     for (i = 0; i < 8; i++) {
         /* Read potentially unaligned pixels.
@@ -508,10 +472,11 @@ static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels,
          * but we simply ignore the extras. */
         vector unsigned char pixl = vec_ld(0, pixels);
         vector unsigned char pixr = vec_ld(7, pixels);
-        bytes = vec_perm(pixl, pixr, perm);
+        vector unsigned char bytes = vec_perm(pixl, pixr, perm);
         // Convert the bytes into shorts.
-        shorts = (vector signed short) vec_mergeh(zero, bytes);
+        vector signed short shorts = (vector signed short) vec_mergeh(zero,
+                                                                      bytes);
         // Save the data to the block, we assume the block is 16-byte aligned.
         vec_st(shorts, i * 16, (vector signed short *) block);
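The byte-to-short conversion relies on AltiVec being big-endian here: vec_mergeh interleaves a zero byte ahead of each of the first eight pixel bytes, which is exactly a 16-bit zero extension. A minimal sketch (hypothetical helper; the patch inlines this):

    #include <altivec.h>

    static inline vector signed short widen_lo8(vector unsigned char bytes)
    {
        const vector unsigned char zero = vec_splat_u8(0);
        /* {0,b0, 0,b1, ...} read as shorts is {b0, b1, ...} on big-endian. */
        return (vector signed short) vec_mergeh(zero, bytes);
    }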
@@ -526,7 +491,6 @@ static void diff_pixels_altivec(int16_t *restrict block, const uint8_t *s1,
     int i;
     vector unsigned char perm1 = vec_lvsl(0, s1);
     vector unsigned char perm2 = vec_lvsl(0, s2);
-    vector unsigned char bytes, pixl, pixr;
     const vector unsigned char zero =
         (const vector unsigned char) vec_splat_u8(0);
     vector signed short shorts1, shorts2;
@@ -535,9 +499,9 @@ static void diff_pixels_altivec(int16_t *restrict block, const uint8_t *s1,
         /* Read potentially unaligned pixels.
          * We're reading 16 pixels, and actually only want 8,
          * but we simply ignore the extras. */
-        pixl  = vec_ld(0,  s1);
-        pixr  = vec_ld(15, s1);
-        bytes = vec_perm(pixl, pixr, perm1);
+        vector unsigned char pixl  = vec_ld(0,  s1);
+        vector unsigned char pixr  = vec_ld(15, s1);
+        vector unsigned char bytes = vec_perm(pixl, pixr, perm1);
         // Convert the bytes into shorts.
         shorts1 = (vector signed short) vec_mergeh(zero, bytes);
@@ -653,29 +617,31 @@ static int hadamard8_diff8x8_altivec(/* MpegEncContext */ void *s, uint8_t *dst,
 #define ONEITERBUTTERFLY(i, res)                                           \
     {                                                                      \
-        register vector unsigned char src1, src2, srcO;                    \
-        register vector unsigned char dst1, dst2, dstO;                    \
-        register vector signed short srcV, dstV;                           \
-        register vector signed short but0, but1, but2, op1, op2, op3;      \
-        src1 = vec_ld(stride * i, src);                                    \
-        src2 = vec_ld(stride * i + 15, src);                               \
-        srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src));            \
-        dst1 = vec_ld(stride * i, dst);                                    \
-        dst2 = vec_ld(stride * i + 15, dst);                               \
-        dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst));            \
+        register vector unsigned char src1 = vec_ld(stride * i, src);      \
+        register vector unsigned char src2 = vec_ld(stride * i + 15, src); \
+        register vector unsigned char srcO =                               \
+            vec_perm(src1, src2, vec_lvsl(stride * i, src));               \
+        register vector unsigned char dst1 = vec_ld(stride * i, dst);      \
+        register vector unsigned char dst2 = vec_ld(stride * i + 15, dst); \
+        register vector unsigned char dstO =                               \
+            vec_perm(dst1, dst2, vec_lvsl(stride * i, dst));               \
                                                                            \
         /* Promote the unsigned chars to signed shorts. */                 \
         /* We're in the 8x8 function, we only care for the first 8. */     \
-        srcV = (vector signed short) vec_mergeh((vector signed char) vzero, \
-                                                (vector signed char) srcO); \
-        dstV = (vector signed short) vec_mergeh((vector signed char) vzero, \
-                                                (vector signed char) dstO); \
+        register vector signed short srcV =                                \
+            (vector signed short) vec_mergeh((vector signed char) vzero,   \
                                             (vector signed char) srcO);   \
+        register vector signed short dstV =                                \
+            (vector signed short) vec_mergeh((vector signed char) vzero,   \
+                                             (vector signed char) dstO);   \
                                                                            \
         /* subtractions inside the first butterfly */                      \
-        but0 = vec_sub(srcV, dstV);                                        \
-        op1  = vec_perm(but0, but0, perm1);                                \
-        but1 = vec_mladd(but0, vprod1, op1);                               \
-        op2  = vec_perm(but1, but1, perm2);                                \
-        but2 = vec_mladd(but1, vprod2, op2);                               \
-        op3  = vec_perm(but2, but2, perm3);                                \
+        register vector signed short but0 = vec_sub(srcV, dstV);           \
+        register vector signed short op1  = vec_perm(but0, but0, perm1);   \
+        register vector signed short but1 = vec_mladd(but0, vprod1, op1);  \
+        register vector signed short op2  = vec_perm(but1, but1, perm2);   \
+        register vector signed short but2 = vec_mladd(but1, vprod2, op2);  \
+        register vector signed short op3  = vec_perm(but2, but2, perm3);   \
         res = vec_mladd(but2, vprod3, op3);                                \
     }
     ONEITERBUTTERFLY(0, temp0);
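ONEITERBUTTERFLY's vec_perm/vec_mladd pairs implement Hadamard butterfly stages: assuming perm1/perm2/perm3 swap lanes at distance 1, 2 and 4 respectively and the matching vprodN alternates +1/-1 (their definitions lie outside this hunk), vec_mladd(x, vprod, vec_perm(x, x, perm)) turns each lane pair (a, b) into (a + b, a - b). Scalar shape of one stage (illustrative):

    /* One butterfly stage over 8 shorts, pairing lanes at distance d. */
    static void butterfly_stage(short v[8], int d)  /* d = 1, 2, 4 */
    {
        for (int k = 0; k < 8; k++)
            if (!(k & d)) {          /* k is the low lane of its pair */
                short a = v[k], b = v[k + d];
                v[k]     = a + b;    /* vprod lane +1: a * 1 + b */
                v[k + d] = a - b;    /* vprod lane -1: b * -1 + a */
            }
    }

Applying the three stages in sequence yields an 8-point Hadamard transform of the src/dst difference, up to lane ordering.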
@@ -801,62 +767,60 @@ static int hadamard8_diff16x8_altivec(/* MpegEncContext */ void *s, uint8_t *dst
 #define ONEITERBUTTERFLY(i, res1, res2)                                  \
     {                                                                    \
-        register vector unsigned char                                    \
-            src1 __asm__ ("v22"),                                        \
-            src2 __asm__ ("v23"),                                        \
-            dst1 __asm__ ("v24"),                                        \
-            dst2 __asm__ ("v25"),                                        \
-            srcO __asm__ ("v22"),                                        \
-            dstO __asm__ ("v23");                                        \
+        register vector unsigned char src1 __asm__ ("v22") =             \
+            vec_ld(stride * i, src);                                     \
+        register vector unsigned char src2 __asm__ ("v23") =             \
+            vec_ld(stride * i + 16, src);                                \
+        register vector unsigned char srcO __asm__ ("v22") =             \
+            vec_perm(src1, src2, vec_lvsl(stride * i, src));             \
+        register vector unsigned char dst1 __asm__ ("v24") =             \
+            vec_ld(stride * i, dst);                                     \
+        register vector unsigned char dst2 __asm__ ("v25") =             \
+            vec_ld(stride * i + 16, dst);                                \
+        register vector unsigned char dstO __asm__ ("v23") =             \
+            vec_perm(dst1, dst2, vec_lvsl(stride * i, dst));             \
                                                                          \
-        register vector signed short                                     \
-            srcV  __asm__ ("v24"),                                       \
-            dstV  __asm__ ("v25"),                                       \
-            srcW  __asm__ ("v26"),                                       \
-            dstW  __asm__ ("v27"),                                       \
-            but0  __asm__ ("v28"),                                       \
-            but0S __asm__ ("v29"),                                       \
-            op1   __asm__ ("v30"),                                       \
-            but1  __asm__ ("v22"),                                       \
-            op1S  __asm__ ("v23"),                                       \
-            but1S __asm__ ("v24"),                                       \
-            op2   __asm__ ("v25"),                                       \
-            but2  __asm__ ("v26"),                                       \
-            op2S  __asm__ ("v27"),                                       \
-            but2S __asm__ ("v28"),                                       \
-            op3   __asm__ ("v29"),                                       \
-            op3S  __asm__ ("v30");                                       \
-                                                                         \
-        src1 = vec_ld(stride * i, src);                                  \
-        src2 = vec_ld(stride * i + 16, src);                             \
-        srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src));          \
-        dst1 = vec_ld(stride * i, dst);                                  \
-        dst2 = vec_ld(stride * i + 16, dst);                             \
-        dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst));          \
         /* Promote the unsigned chars to signed shorts. */               \
-        srcV = (vector signed short) vec_mergeh((vector signed char) vzero, \
-                                                (vector signed char) srcO); \
-        dstV = (vector signed short) vec_mergeh((vector signed char) vzero, \
-                                                (vector signed char) dstO); \
-        srcW = (vector signed short) vec_mergel((vector signed char) vzero, \
-                                                (vector signed char) srcO); \
-        dstW = (vector signed short) vec_mergel((vector signed char) vzero, \
-                                                (vector signed char) dstO); \
+        register vector signed short srcV __asm__ ("v24") =              \
+            (vector signed short) vec_mergeh((vector signed char) vzero, \
+                                             (vector signed char) srcO); \
+        register vector signed short dstV __asm__ ("v25") =              \
+            (vector signed short) vec_mergeh((vector signed char) vzero, \
+                                             (vector signed char) dstO); \
+        register vector signed short srcW __asm__ ("v26") =              \
+            (vector signed short) vec_mergel((vector signed char) vzero, \
+                                             (vector signed char) srcO); \
+        register vector signed short dstW __asm__ ("v27") =              \
+            (vector signed short) vec_mergel((vector signed char) vzero, \
+                                             (vector signed char) dstO); \
                                                                          \
         /* subtractions inside the first butterfly */                    \
-        but0  = vec_sub(srcV, dstV);                                     \
-        but0S = vec_sub(srcW, dstW);                                     \
-        op1   = vec_perm(but0, but0, perm1);                             \
-        but1  = vec_mladd(but0, vprod1, op1);                            \
-        op1S  = vec_perm(but0S, but0S, perm1);                           \
-        but1S = vec_mladd(but0S, vprod1, op1S);                          \
-        op2   = vec_perm(but1, but1, perm2);                             \
-        but2  = vec_mladd(but1, vprod2, op2);                            \
-        op2S  = vec_perm(but1S, but1S, perm2);                           \
-        but2S = vec_mladd(but1S, vprod2, op2S);                          \
-        op3   = vec_perm(but2, but2, perm3);                             \
-        res1  = vec_mladd(but2, vprod3, op3);                            \
-        op3S  = vec_perm(but2S, but2S, perm3);                           \
-        res2  = vec_mladd(but2S, vprod3, op3S);                          \
+        register vector signed short but0 __asm__ ("v28") =              \
+            vec_sub(srcV, dstV);                                         \
+        register vector signed short but0S __asm__ ("v29") =             \
+            vec_sub(srcW, dstW);                                         \
+        register vector signed short op1 __asm__ ("v30") =               \
+            vec_perm(but0, but0, perm1);                                 \
+        register vector signed short but1 __asm__ ("v22") =              \
+            vec_mladd(but0, vprod1, op1);                                \
+        register vector signed short op1S __asm__ ("v23") =              \
+            vec_perm(but0S, but0S, perm1);                               \
+        register vector signed short but1S __asm__ ("v24") =             \
+            vec_mladd(but0S, vprod1, op1S);                              \
+        register vector signed short op2 __asm__ ("v25") =               \
+            vec_perm(but1, but1, perm2);                                 \
+        register vector signed short but2 __asm__ ("v26") =              \
+            vec_mladd(but1, vprod2, op2);                                \
+        register vector signed short op2S __asm__ ("v27") =              \
+            vec_perm(but1S, but1S, perm2);                               \
+        register vector signed short but2S __asm__ ("v28") =             \
+            vec_mladd(but1S, vprod2, op2S);                              \
+        register vector signed short op3 __asm__ ("v29") =               \
+            vec_perm(but2, but2, perm3);                                 \
+        register vector signed short op3S __asm__ ("v30") =              \
+            vec_perm(but2S, but2S, perm3);                               \
+        res1 = vec_mladd(but2, vprod3, op3);                             \
+        res2 = vec_mladd(but2S, vprod3, op3S);                           \
     }
     ONEITERBUTTERFLY(0, temp0, temp0S);
     ONEITERBUTTERFLY(1, temp1, temp1S);
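The __asm__ ("vNN") annotations are GCC explicit-register variables: each value is pinned to a named AltiVec register, and the macro deliberately recycles v22-v30 as earlier temporaries die. The rewrite keeps the same pinning while moving initialization into the declarations. The syntax in isolation (illustrative only):

    /* Pin a variable to AltiVec register v22 (GCC extension). */
    register vector signed short tmp __asm__ ("v22") = vec_splat_s16(0);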
@@ -870,11 +834,6 @@ static int hadamard8_diff16x8_altivec(/* MpegEncContext */ void *s, uint8_t *dst
 #undef ONEITERBUTTERFLY
     {
         register vector signed int vsum;
-        register vector signed short line0S, line1S, line2S, line3S, line4S,
-                                     line5S, line6S, line7S, line0BS, line2BS,
-                                     line1BS, line3BS, line4BS, line6BS, line5BS,
-                                     line7BS, line0CS, line4CS, line1CS, line5CS,
-                                     line2CS, line6CS, line3CS, line7CS;
         register vector signed short line0 = vec_add(temp0, temp1);
         register vector signed short line1 = vec_sub(temp0, temp1);
@@ -903,6 +862,33 @@ static int hadamard8_diff16x8_altivec(/* MpegEncContext */ void *s, uint8_t *dst
         register vector signed short line3C = vec_add(line3B, line7B);
         register vector signed short line7C = vec_sub(line3B, line7B);
+        register vector signed short line0S = vec_add(temp0S, temp1S);
+        register vector signed short line1S = vec_sub(temp0S, temp1S);
+        register vector signed short line2S = vec_add(temp2S, temp3S);
+        register vector signed short line3S = vec_sub(temp2S, temp3S);
+        register vector signed short line4S = vec_add(temp4S, temp5S);
+        register vector signed short line5S = vec_sub(temp4S, temp5S);
+        register vector signed short line6S = vec_add(temp6S, temp7S);
+        register vector signed short line7S = vec_sub(temp6S, temp7S);
+        register vector signed short line0BS = vec_add(line0S, line2S);
+        register vector signed short line2BS = vec_sub(line0S, line2S);
+        register vector signed short line1BS = vec_add(line1S, line3S);
+        register vector signed short line3BS = vec_sub(line1S, line3S);
+        register vector signed short line4BS = vec_add(line4S, line6S);
+        register vector signed short line6BS = vec_sub(line4S, line6S);
+        register vector signed short line5BS = vec_add(line5S, line7S);
+        register vector signed short line7BS = vec_sub(line5S, line7S);
+        register vector signed short line0CS = vec_add(line0BS, line4BS);
+        register vector signed short line4CS = vec_sub(line0BS, line4BS);
+        register vector signed short line1CS = vec_add(line1BS, line5BS);
+        register vector signed short line5CS = vec_sub(line1BS, line5BS);
+        register vector signed short line2CS = vec_add(line2BS, line6BS);
+        register vector signed short line6CS = vec_sub(line2BS, line6BS);
+        register vector signed short line3CS = vec_add(line3BS, line7BS);
+        register vector signed short line7CS = vec_sub(line3BS, line7BS);
         vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
         vsum = vec_sum4s(vec_abs(line1C), vsum);
         vsum = vec_sum4s(vec_abs(line2C), vsum);
@@ -912,33 +898,6 @@ static int hadamard8_diff16x8_altivec(/* MpegEncContext */ void *s, uint8_t *dst
         vsum = vec_sum4s(vec_abs(line6C), vsum);
         vsum = vec_sum4s(vec_abs(line7C), vsum);
-        line0S = vec_add(temp0S, temp1S);
-        line1S = vec_sub(temp0S, temp1S);
-        line2S = vec_add(temp2S, temp3S);
-        line3S = vec_sub(temp2S, temp3S);
-        line4S = vec_add(temp4S, temp5S);
-        line5S = vec_sub(temp4S, temp5S);
-        line6S = vec_add(temp6S, temp7S);
-        line7S = vec_sub(temp6S, temp7S);
-        line0BS = vec_add(line0S, line2S);
-        line2BS = vec_sub(line0S, line2S);
-        line1BS = vec_add(line1S, line3S);
-        line3BS = vec_sub(line1S, line3S);
-        line4BS = vec_add(line4S, line6S);
-        line6BS = vec_sub(line4S, line6S);
-        line5BS = vec_add(line5S, line7S);
-        line7BS = vec_sub(line5S, line7S);
-        line0CS = vec_add(line0BS, line4BS);
-        line4CS = vec_sub(line0BS, line4BS);
-        line1CS = vec_add(line1BS, line5BS);
-        line5CS = vec_sub(line1BS, line5BS);
-        line2CS = vec_add(line2BS, line6BS);
-        line6CS = vec_sub(line2BS, line6BS);
-        line3CS = vec_add(line3BS, line7BS);
-        line7CS = vec_sub(line3BS, line7BS);
         vsum = vec_sum4s(vec_abs(line0CS), vsum);
         vsum = vec_sum4s(vec_abs(line1CS), vsum);
         vsum = vec_sum4s(vec_abs(line2CS), vsum);
@@ -957,8 +916,8 @@ static int hadamard8_diff16x8_altivec(/* MpegEncContext */ void *s, uint8_t *dst
 static int hadamard8_diff16_altivec(/* MpegEncContext */ void *s, uint8_t *dst,
                                     uint8_t *src, int stride, int h)
 {
-    int score;
-    score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
+    int score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
     if (h == 16) {
         dst += 8 * stride;
         src += 8 * stride;