From 6264b6227c779af9d2520722f6acb45a2c51cdfd Mon Sep 17 00:00:00 2001 From: Tucker DiNapoli Date: Wed, 22 Apr 2015 16:27:26 -0400 Subject: [PATCH] postproc: Replaced inline asm for prefetching with prefetch functions Prefetching functions are defined in postprocess_template using the RENAME macro so that prefetching is used when available. For x86 targets inline asm is used and the functions are non-empty only for CPUs where prefetching is available. For non-x86 targets the GCC builtin prefetch is used if it is available, otherwise no prefetching is done. Signed-off-by: Michael Niedermayer --- libpostproc/postprocess.c | 31 ------- libpostproc/postprocess_template.c | 126 ++++++++++++++++------------- 2 files changed, 71 insertions(+), 86 deletions(-) diff --git a/libpostproc/postprocess.c b/libpostproc/postprocess.c index 9d8978204e..af70bb3eeb 100644 --- a/libpostproc/postprocess.c +++ b/libpostproc/postprocess.c @@ -168,37 +168,6 @@ static const char * const replaceTable[]= NULL //End Marker }; - -#if ARCH_X86 && HAVE_INLINE_ASM -static inline void prefetchnta(const void *p) -{ - __asm__ volatile( "prefetchnta (%0)\n\t" - : : "r" (p) - ); -} - -static inline void prefetcht0(const void *p) -{ - __asm__ volatile( "prefetcht0 (%0)\n\t" - : : "r" (p) - ); -} - -static inline void prefetcht1(const void *p) -{ - __asm__ volatile( "prefetcht1 (%0)\n\t" - : : "r" (p) - ); -} - -static inline void prefetcht2(const void *p) -{ - __asm__ volatile( "prefetcht2 (%0)\n\t" - : : "r" (p) - ); -} -#endif - /* The horizontal functions exist only in C because the MMX * code is faster with vertical filters and transposing. 
*/ diff --git a/libpostproc/postprocess_template.c b/libpostproc/postprocess_template.c index 16e441afe9..e153b13408 100644 --- a/libpostproc/postprocess_template.c +++ b/libpostproc/postprocess_template.c @@ -3242,6 +3242,69 @@ static inline void RENAME(duplicate)(uint8_t src[], int stride) #endif } +#if ARCH_X86 && TEMPLATE_PP_MMXEXT +static inline void RENAME(prefetchnta)(const void *p) +{ + __asm__ volatile( "prefetchnta (%0)\n\t" + : : "r" (p) + ); +} + +static inline void RENAME(prefetcht0)(const void *p) +{ + __asm__ volatile( "prefetcht0 (%0)\n\t" + : : "r" (p) + ); +} + +static inline void RENAME(prefetcht1)(const void *p) +{ + __asm__ volatile( "prefetcht1 (%0)\n\t" + : : "r" (p) + ); +} + +static inline void RENAME(prefetcht2)(const void *p) +{ + __asm__ volatile( "prefetcht2 (%0)\n\t" + : : "r" (p) + ); +} +#elif !ARCH_X86 && AV_GCC_VERSION_AT_LEAST(3,2) +static inline void RENAME(prefetchnta)(const void *p) +{ + __builtin_prefetch(p,0,0); +} +static inline void RENAME(prefetcht0)(const void *p) +{ + __builtin_prefetch(p,0,1); +} +static inline void RENAME(prefetcht1)(const void *p) +{ + __builtin_prefetch(p,0,2); +} +static inline void RENAME(prefetcht2)(const void *p) +{ + __builtin_prefetch(p,0,3); +} +#else +static inline void RENAME(prefetchnta)(const void *p) +{ + return; +} +static inline void RENAME(prefetcht0)(const void *p) +{ + return; +} +static inline void RENAME(prefetcht1)(const void *p) +{ + return; +} +static inline void RENAME(prefetcht2)(const void *p) +{ + return; +} +#endif /** * Filter array of bytes (Y or U or V values) */ @@ -3368,34 +3431,10 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[ // finish 1 block before the next otherwise we might have a problem // with the L1 Cache of the P4 ... 
or only a few blocks at a time or something for(x=0; x>2)&6) + 5)*srcStride + 32); - prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); - prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); - prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); -*/ - - __asm__( - "mov %4, %%"REG_a" \n\t" - "shr $2, %%"REG_a" \n\t" - "and $6, %%"REG_a" \n\t" - "add %5, %%"REG_a" \n\t" - "mov %%"REG_a", %%"REG_d" \n\t" - "imul %1, %%"REG_a" \n\t" - "imul %3, %%"REG_d" \n\t" - "prefetchnta 32(%%"REG_a", %0) \n\t" - "prefetcht0 32(%%"REG_d", %2) \n\t" - "add %1, %%"REG_a" \n\t" - "add %3, %%"REG_d" \n\t" - "prefetchnta 32(%%"REG_a", %0) \n\t" - "prefetcht0 32(%%"REG_d", %2) \n\t" - :: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride), - "g" ((x86_reg)x), "g" ((x86_reg)copyAhead) - : "%"REG_a, "%"REG_d - ); -#endif + RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead)*srcStride + 32); + RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead+1)*srcStride + 32); + RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead)*dstStride + 32); + RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead+1)*dstStride + 32); RENAME(blockCopy)(dstBlock + dstStride*8, dstStride, srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset); @@ -3474,33 +3513,10 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[ uint8_t *dstBlockStart = dstBlock; const uint8_t *srcBlockStart = srcBlock; for(; x < endx; x+=BLOCK_SIZE){ -#if TEMPLATE_PP_MMXEXT && HAVE_6REGS -/* - prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); - prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); - prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); - prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); -*/ - - __asm__( - "mov %4, %%"REG_a" \n\t" - "shr $2, %%"REG_a" \n\t" - "and $6, %%"REG_a" \n\t" - "add %5, %%"REG_a" \n\t" - "mov %%"REG_a", %%"REG_d" \n\t" - "imul %1, %%"REG_a" \n\t" - "imul %3, %%"REG_d" \n\t" - "prefetchnta 
32(%%"REG_a", %0) \n\t" - "prefetcht0 32(%%"REG_d", %2) \n\t" - "add %1, %%"REG_a" \n\t" - "add %3, %%"REG_d" \n\t" - "prefetchnta 32(%%"REG_a", %0) \n\t" - "prefetcht0 32(%%"REG_d", %2) \n\t" - :: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride), - "g" ((x86_reg)x), "g" ((x86_reg)copyAhead) - : "%"REG_a, "%"REG_d - ); -#endif + RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead)*srcStride + 32); + RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead+1)*srcStride + 32); + RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead)*dstStride + 32); + RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead+1)*dstStride + 32); RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride, srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset);