@@ -349,110 +349,6 @@ static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
}
#endif //HAVE_ALTIVEC

#if 0
/**
 * Experimental implementation of the filter (Algorithm 1) described in a paper
 * by Ramkishor & Karandikar.
 * Values are correctly clipped (MMX2); values wrap around (C).
 * Conclusion: it is fast, but introduces ugly horizontal patterns
 * if there is a continuous gradient.
 * Example on a linear gradient (lines 3..6 of one pixel column):
 *      input:  0  8 16 24
 *      x   = l5 - l4 = 8
 *      x/2 = 4
 *      x/8 = 1
 *      output: 1 12 12 23
 */
static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP)
{
#if HAVE_MMX2 || HAVE_AMD3DNOW
    src+= stride*3;
// FIXME rounding
    __asm__ volatile(
        "pxor %%mm7, %%mm7                      \n\t" // 0
        "movq "MANGLE(b80)", %%mm6              \n\t" // MIN_SIGNED_BYTE
        "leal (%0, %1), %%"REG_a"               \n\t"
        "leal (%%"REG_a", %1, 4), %%"REG_c"     \n\t"
//      0       1       2       3       4       5       6       7       8       9
//      %0      eax     eax+%1  eax+2%1 %0+4%1  ecx     ecx+%1  ecx+2%1 %0+8%1  ecx+4%1
        "movq "MANGLE(pQPb)", %%mm0             \n\t" // QP,..., QP
"movq %%mm0, %%mm1 \n\t" // QP,..., QP
|
|
|
|
|
"paddusb "MANGLE(b02)", %%mm0 \n\t" |
|
|
|
|
"psrlw $2, %%mm0 \n\t" |
|
|
|
|
"pand "MANGLE(b3F)", %%mm0 \n\t" // QP/4,..., QP/4
|
|
|
|
|
"paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ...
|
|
|
|
|
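        // Note: MMX has no per-byte shift, so the QP/4 above is done as a word
        // shift (psrlw $2) plus "pand b3F" (0x3F in every byte) to clear the
        // two bits shifted in from the neighbouring byte; b02 pre-rounds.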
"movq (%0, %1, 4), %%mm2 \n\t" // line 4
|
|
|
|
|
"movq (%%"REG_c"), %%mm3 \n\t" // line 5
|
|
|
|
|
"movq %%mm2, %%mm4 \n\t" // line 4
|
|
|
|
|
"pcmpeqb %%mm5, %%mm5 \n\t" // -1
|
|
|
|
|
"pxor %%mm2, %%mm5 \n\t" // -line 4 - 1
|
|
|
|
|
PAVGB(%%mm3, %%mm5) |
|
|
|
|
"paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2
|
|
|
|
|
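        // The average of l5 and (255 - l4) is (l5 - l4)/2 + 128; adding 0x80
        // (b80) cancels that bias modulo 256, leaving the signed (l5-l4)/2
        // without needing a signed-average instruction.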
"psubusb %%mm3, %%mm4 \n\t" |
|
|
|
|
"psubusb %%mm2, %%mm3 \n\t" |
|
|
|
|
"por %%mm3, %%mm4 \n\t" // |l4 - l5|
|
|
|
|
|
"psubusb %%mm0, %%mm4 \n\t" |
|
|
|
|
"pcmpeqb %%mm7, %%mm4 \n\t" |
|
|
|
|
"pand %%mm4, %%mm5 \n\t" // d/2
|
|
|
|
|
|
|
|
|
|
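        // |l4 - l5| comes from OR-ing the two saturated differences (one of
        // them is always 0).  psubusb by the threshold and pcmpeqb against 0
        // then yield an all-ones byte mask where |l4 - l5| <= QP*1.25, which
        // gates the d/2 correction.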
// "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80
|
|
|
|
|
"paddb %%mm5, %%mm2 \n\t" |
|
|
|
|
// "psubb %%mm6, %%mm2 \n\t"
|
|
|
|
|
"movq %%mm2, (%0,%1, 4) \n\t" |
|
|
|
|
|
|
|
|
|
"movq (%%"REG_c"), %%mm2 \n\t" |
|
|
|
|
// "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80
|
|
|
|
|
"psubb %%mm5, %%mm2 \n\t" |
|
|
|
|
// "psubb %%mm6, %%mm2 \n\t"
|
|
|
|
|
"movq %%mm2, (%%"REG_c") \n\t" |
|
|
|
|
|
|
|
|
|
"paddb %%mm6, %%mm5 \n\t" |
|
|
|
|
"psrlw $2, %%mm5 \n\t" |
|
|
|
|
"pand "MANGLE(b3F)", %%mm5 \n\t" |
|
|
|
|
"psubb "MANGLE(b20)", %%mm5 \n\t" // (l5-l4)/8
|
|
|
|
|
|
|
|
|
|
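        // d/8 from d/2: bias by 0x80 so the bytes are unsigned, emulate a
        // per-byte >>2 with psrlw + pand as above, then subtract 0x20 (the
        // shifted-down bias, b20) to get a signed (l5-l4)/8 again.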
"movq (%%"REG_a", %1, 2), %%mm2 \n\t" |
|
|
|
|
"paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80
|
|
|
|
|
"paddsb %%mm5, %%mm2 \n\t" |
|
|
|
|
"psubb %%mm6, %%mm2 \n\t" |
|
|
|
|
"movq %%mm2, (%%"REG_a", %1, 2) \n\t" |
|
|
|
|
|
|
|
|
|
"movq (%%"REG_c", %1), %%mm2 \n\t" |
|
|
|
|
"paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80
|
|
|
|
|
"psubsb %%mm5, %%mm2 \n\t" |
|
|
|
|
"psubb %%mm6, %%mm2 \n\t" |
|
|
|
|
"movq %%mm2, (%%"REG_c", %1) \n\t" |
|
|
|
|
|
|
|
|
|
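        // Lines 3 and 6 are updated with saturated signed adds around the 0x80
        // bias (paddsb/psubsb), so they clip; the line 4/5 updates above use
        // plain paddb/psubb, with their bias corrections left commented out.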

        :
        : "r" (src), "r" ((x86_reg)stride)
        : "%"REG_a, "%"REG_c
    );
#else //HAVE_MMX2 || HAVE_AMD3DNOW
    const int l1= stride;
    const int l2= stride + l1;
    const int l3= stride + l2;
    const int l4= stride + l3;
    const int l5= stride + l4;
    const int l6= stride + l5;
//    const int l7= stride + l6;
//    const int l8= stride + l7;
//    const int l9= stride + l8;
    int x;
    const int QP15= QP + (QP>>2);
    src+= stride*3;
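    // Plain C fallback: the same +-v/2, +-v/8 update as the MMX2 path, but the
    // uint8_t arithmetic below wraps around instead of clipping (see the
    // comment above the function).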
    for(x=0; x<BLOCK_SIZE; x++){
        const int v = (src[x+l5] - src[x+l4]);
        if(FFABS(v) < QP15){
            src[x+l3] +=v>>3;
            src[x+l4] +=v>>1;
            src[x+l5] -=v>>1;
            src[x+l6] -=v>>3;
        }
    }
#endif //HAVE_MMX2 || HAVE_AMD3DNOW
}
#endif //0

/**
 * Experimental Filter 1
 * will not damage linear gradients