|
|
|
@ -268,8 +268,7 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, |
|
|
|
|
#if ARCH_X86_64 |
|
|
|
|
int i, j, k; |
|
|
|
|
int cpu_flags = av_get_cpu_flags(); |
|
|
|
|
// avx2 hscale filter processes 16 pixel blocks.
|
|
|
|
|
if (!filter || dstW % 16 != 0) |
|
|
|
|
if (!filter) |
|
|
|
|
return 0; |
|
|
|
|
if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_SLOW_GATHER)) { |
|
|
|
|
if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { |
|
|
|
@ -281,9 +280,11 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, |
|
|
|
|
} |
|
|
|
|
// Do not swap filterPos for pixels which won't be processed by
|
|
|
|
|
// the main loop.
|
|
|
|
|
for (i = 0; i + 8 <= dstW; i += 8) { |
|
|
|
|
for (i = 0; i + 16 <= dstW; i += 16) { |
|
|
|
|
FFSWAP(int, filterPos[i + 2], filterPos[i + 4]); |
|
|
|
|
FFSWAP(int, filterPos[i + 3], filterPos[i + 5]); |
|
|
|
|
FFSWAP(int, filterPos[i + 10], filterPos[i + 12]); |
|
|
|
|
FFSWAP(int, filterPos[i + 11], filterPos[i + 13]); |
|
|
|
|
} |
|
|
|
|
if (filterSize > 4) { |
|
|
|
|
// 16 pixels are processed at a time.
|
|
|
|
@ -297,6 +298,18 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
// 4 pixels are processed at a time in the tail.
|
|
|
|
|
for (; i < dstW; i += 4) { |
|
|
|
|
// 4 filter coeffs are processed at a time.
|
|
|
|
|
int rem = dstW - i >= 4 ? 4 : dstW - i; |
|
|
|
|
for (k = 0; k + 4 <= filterSize; k += 4) { |
|
|
|
|
for (j = 0; j < rem; ++j) { |
|
|
|
|
int from = (i + j) * filterSize + k; |
|
|
|
|
int to = i * filterSize + j * 4 + k * 4; |
|
|
|
|
memcpy(&filter[to], &filterCopy[from], 4 * sizeof(int16_t)); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
av_free(filterCopy); |
|
|
|
|
} |
|
|
|
|