|
|
|
@ -630,9 +630,9 @@ static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, i |
|
|
|
|
dst += 4-8*dstStride;\
|
|
|
|
|
}\
|
|
|
|
|
}\
|
|
|
|
|
static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
|
|
|
|
|
int h=8;\
|
|
|
|
|
int w=4;\
|
|
|
|
|
static inline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
|
|
|
|
|
int h = size;\
|
|
|
|
|
int w = (size+8)>>2;\
|
|
|
|
|
src -= 2*srcStride+2;\
|
|
|
|
|
while(w--){\
|
|
|
|
|
asm volatile(\
|
|
|
|
@ -652,23 +652,40 @@ static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, |
|
|
|
|
"punpcklbw %%mm7, %%mm2 \n\t"\
|
|
|
|
|
"punpcklbw %%mm7, %%mm3 \n\t"\
|
|
|
|
|
"punpcklbw %%mm7, %%mm4 \n\t"\
|
|
|
|
|
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*4)\
|
|
|
|
|
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*4)\
|
|
|
|
|
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*4)\
|
|
|
|
|
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*4)\
|
|
|
|
|
QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*8*4)\
|
|
|
|
|
QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*8*4)\
|
|
|
|
|
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*8*4)\
|
|
|
|
|
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*8*4)\
|
|
|
|
|
\
|
|
|
|
|
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48)\
|
|
|
|
|
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48)\
|
|
|
|
|
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*48)\
|
|
|
|
|
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\
|
|
|
|
|
QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\
|
|
|
|
|
QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\
|
|
|
|
|
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\
|
|
|
|
|
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\
|
|
|
|
|
: "+a"(src)\
|
|
|
|
|
: "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\
|
|
|
|
|
: "memory"\
|
|
|
|
|
);\
|
|
|
|
|
if(size==16){\
|
|
|
|
|
asm volatile(\
|
|
|
|
|
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 8*48)\
|
|
|
|
|
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 9*48)\
|
|
|
|
|
QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\
|
|
|
|
|
QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\
|
|
|
|
|
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\
|
|
|
|
|
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\
|
|
|
|
|
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\
|
|
|
|
|
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\
|
|
|
|
|
: "+a"(src)\
|
|
|
|
|
: "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\
|
|
|
|
|
: "memory"\
|
|
|
|
|
);\
|
|
|
|
|
}\
|
|
|
|
|
tmp += 4;\
|
|
|
|
|
src += 4 - 13*srcStride;\
|
|
|
|
|
src += 4 - (size+5)*srcStride;\
|
|
|
|
|
}\
|
|
|
|
|
tmp -= 4*4;\
|
|
|
|
|
tmp -= size+8;\
|
|
|
|
|
w = size>>4;\
|
|
|
|
|
do{\
|
|
|
|
|
h = size;\
|
|
|
|
|
asm volatile(\
|
|
|
|
|
"movq %4, %%mm6 \n\t"\
|
|
|
|
|
"1: \n\t"\
|
|
|
|
@ -702,7 +719,7 @@ static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, |
|
|
|
|
"psraw $6, %%mm3 \n\t"\
|
|
|
|
|
"packuswb %%mm3, %%mm0 \n\t"\
|
|
|
|
|
OP(%%mm0, (%1),%%mm7, q)\
|
|
|
|
|
"add $32, %0 \n\t"\
|
|
|
|
|
"add $48, %0 \n\t"\
|
|
|
|
|
"add %3, %1 \n\t"\
|
|
|
|
|
"decl %2 \n\t"\
|
|
|
|
|
" jnz 1b \n\t"\
|
|
|
|
@ -710,6 +727,9 @@ static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, |
|
|
|
|
: "S"((long)dstStride), "m"(ff_pw_32)\
|
|
|
|
|
: "memory"\
|
|
|
|
|
);\
|
|
|
|
|
tmp += 8 - size*24;\
|
|
|
|
|
dst += 8 - size*dstStride;\
|
|
|
|
|
}while(w--);\
|
|
|
|
|
}\
|
|
|
|
|
static void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
|
|
|
|
|
OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
|
|
|
|
@ -729,13 +749,12 @@ static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, |
|
|
|
|
OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
|
|
|
|
|
}\
|
|
|
|
|
\
|
|
|
|
|
static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
|
|
|
|
|
OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\
|
|
|
|
|
}\
|
|
|
|
|
\
|
|
|
|
|
static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
|
|
|
|
|
OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride);\
|
|
|
|
|
OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst+8, tmp , src+8, dstStride, tmpStride, srcStride);\
|
|
|
|
|
src += 8*srcStride;\
|
|
|
|
|
dst += 8*dstStride;\
|
|
|
|
|
OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride);\
|
|
|
|
|
OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst+8, tmp , src+8, dstStride, tmpStride, srcStride);\
|
|
|
|
|
OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\
|
|
|
|
|
}\
|
|
|
|
|
|
|
|
|
|
#define H264_MC(OPNAME, SIZE, MMX) \ |
|
|
|
@ -816,13 +835,13 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t * |
|
|
|
|
}\
|
|
|
|
|
\
|
|
|
|
|
static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
|
|
|
|
uint64_t temp[SIZE*(SIZE+8)/4];\
|
|
|
|
|
uint64_t temp[SIZE*(SIZE<8?12:24)/4];\
|
|
|
|
|
int16_t * const tmp= (int16_t*)temp;\
|
|
|
|
|
OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, tmp, src, stride, SIZE, stride);\
|
|
|
|
|
}\
|
|
|
|
|
\
|
|
|
|
|
static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
|
|
|
|
uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4];\
|
|
|
|
|
uint64_t temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/4];\
|
|
|
|
|
uint8_t * const halfH= (uint8_t*)temp;\
|
|
|
|
|
uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\
|
|
|
|
|
int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\
|
|
|
|
@ -832,7 +851,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t * |
|
|
|
|
}\
|
|
|
|
|
\
|
|
|
|
|
static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
|
|
|
|
uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4];\
|
|
|
|
|
uint64_t temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/4];\
|
|
|
|
|
uint8_t * const halfH= (uint8_t*)temp;\
|
|
|
|
|
uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\
|
|
|
|
|
int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\
|
|
|
|
@ -842,7 +861,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t * |
|
|
|
|
}\
|
|
|
|
|
\
|
|
|
|
|
static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
|
|
|
|
uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4];\
|
|
|
|
|
uint64_t temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/4];\
|
|
|
|
|
uint8_t * const halfV= (uint8_t*)temp;\
|
|
|
|
|
uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\
|
|
|
|
|
int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\
|
|
|
|
@ -852,7 +871,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t * |
|
|
|
|
}\
|
|
|
|
|
\
|
|
|
|
|
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
|
|
|
|
uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4];\
|
|
|
|
|
uint64_t temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/4];\
|
|
|
|
|
uint8_t * const halfV= (uint8_t*)temp;\
|
|
|
|
|
uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\
|
|
|
|
|
int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\
|
|
|
|
|