sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
uint32 array index -> segv), so let's just use a nice plain
unobfuscated version, which also happens to be faster for me.

Originally committed as revision 1470 to svn://svn.ffmpeg.org/ffmpeg/trunk
pull/126/head
Falk Hüffner 22 years ago
parent 1d0eab1dcc
commit 6b02692791
  1. 56
      libavcodec/dsputil.c

@@ -191,46 +191,13 @@ static int sse8_c(void *v, UINT8 * pix1, UINT8 * pix2, int line_size)
 return s;
 }
-static int sse16_c(void *v, UINT8 * pix1, UINT8 * pix2, int line_size)
+static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
 {
-int s, i, j;
-UINT32 *sq = squareTbl + 256;
+int s, i;
+uint32_t *sq = squareTbl + 256;
 s = 0;
 for (i = 0; i < 16; i++) {
-for (j = 0; j < 16; j += 8) {
-#if 1
-#if LONG_MAX > 2147483647
-uint64_t x,y;
-x=*(uint64_t*)pix1;
-y=*(uint64_t*)pix2;
-s += sq[(x&0xff) - (y&0xff)];
-s += sq[((x>>8)&0xff) - ((y>>8)&0xff)];
-s += sq[((x>>16)&0xff) - ((y>>16)&0xff)];
-s += sq[((x>>24)&0xff) - ((y>>24)&0xff)];
-s += sq[((x>>32)&0xff) - ((y>>32)&0xff)];
-s += sq[((x>>40)&0xff) - ((y>>40)&0xff)];
-s += sq[((x>>48)&0xff) - ((y>>48)&0xff)];
-s += sq[((x>>56)&0xff) - ((y>>56)&0xff)];
-#else
-uint32_t x,y;
-x=*(uint32_t*)pix1;
-y=*(uint32_t*)pix2;
-s += sq[(x&0xff) - (y&0xff)];
-s += sq[((x>>8)&0xff) - ((y>>8)&0xff)];
-s += sq[((x>>16)&0xff) - ((y>>16)&0xff)];
-s += sq[((x>>24)&0xff) - ((y>>24)&0xff)];
-x=*(uint32_t*)(pix1+4);
-y=*(uint32_t*)(pix2+4);
-s += sq[(x&0xff) - (y&0xff)];
-s += sq[((x>>8)&0xff) - ((y>>8)&0xff)];
-s += sq[((x>>16)&0xff) - ((y>>16)&0xff)];
-s += sq[((x>>24)&0xff) - ((y>>24)&0xff)];
-#endif
-#else
 s += sq[pix1[ 0] - pix2[ 0]];
 s += sq[pix1[ 1] - pix2[ 1]];
 s += sq[pix1[ 2] - pix2[ 2]];
@@ -239,12 +206,17 @@ static int sse16_c(void *v, UINT8 * pix1, UINT8 * pix2, int line_size)
 s += sq[pix1[ 5] - pix2[ 5]];
 s += sq[pix1[ 6] - pix2[ 6]];
 s += sq[pix1[ 7] - pix2[ 7]];
-#endif
-pix1 += 8;
-pix2 += 8;
-}
-pix1 += line_size - 16;
-pix2 += line_size - 16;
+s += sq[pix1[ 8] - pix2[ 8]];
+s += sq[pix1[ 9] - pix2[ 9]];
+s += sq[pix1[10] - pix2[10]];
+s += sq[pix1[11] - pix2[11]];
+s += sq[pix1[12] - pix2[12]];
+s += sq[pix1[13] - pix2[13]];
+s += sq[pix1[14] - pix2[14]];
+s += sq[pix1[15] - pix2[15]];
+pix1 += line_size;
+pix2 += line_size;
 }
 return s;
 }

Loading…
Cancel
Save