mirror of https://github.com/FFmpeg/FFmpeg.git
- ff_pix_abs16_neon - ff_pix_abs16_xy2_neon In direct micro benchmarks of these ff functions verses their C implementations, these functions performed as follows on AWS Graviton 3. ff_pix_abs16_neon: pix_abs_0_0_c: 141.1 pix_abs_0_0_neon: 19.6 ff_pix_abs16_xy2_neon: pix_abs_0_3_c: 269.1 pix_abs_0_3_neon: 39.3 Tested with: ./tests/checkasm/checkasm --test=motion --bench --disable-linux-perf Signed-off-by: Jonathan Swinney <jswinney@amazon.com> Signed-off-by: Martin Storsjö <martin@martin.st>release/5.1
parent
20e2aa940c
commit
c471cc7474
10 changed files with 407 additions and 1 deletions
@ -0,0 +1,39 @@ |
|||||||
|
/*
|
||||||
|
* This file is part of FFmpeg. |
||||||
|
* |
||||||
|
* FFmpeg is free software; you can redistribute it and/or |
||||||
|
* modify it under the terms of the GNU Lesser General Public |
||||||
|
* License as published by the Free Software Foundation; either |
||||||
|
* version 2.1 of the License, or (at your option) any later version. |
||||||
|
* |
||||||
|
* FFmpeg is distributed in the hope that it will be useful, |
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||||
|
* Lesser General Public License for more details. |
||||||
|
* |
||||||
|
* You should have received a copy of the GNU Lesser General Public |
||||||
|
* License along with FFmpeg; if not, write to the Free Software |
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||||
|
*/ |
||||||
|
|
||||||
|
#include <stdint.h> |
||||||
|
|
||||||
|
#include "config.h" |
||||||
|
#include "libavutil/attributes.h" |
||||||
|
#include "libavutil/aarch64/cpu.h" |
||||||
|
#include "libavcodec/mpegvideo.h" |
||||||
|
|
||||||
|
int ff_pix_abs16_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2, |
||||||
|
ptrdiff_t stride, int h); |
||||||
|
int ff_pix_abs16_xy2_neon(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2, |
||||||
|
ptrdiff_t stride, int h); |
||||||
|
|
||||||
|
av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx) |
||||||
|
{ |
||||||
|
int cpu_flags = av_get_cpu_flags(); |
||||||
|
|
||||||
|
if (have_neon(cpu_flags)) { |
||||||
|
c->pix_abs[0][0] = ff_pix_abs16_neon; |
||||||
|
c->pix_abs[0][3] = ff_pix_abs16_xy2_neon; |
||||||
|
} |
||||||
|
} |
@ -0,0 +1,205 @@ |
|||||||
|
/* |
||||||
|
* Copyright (c) 2022 Jonathan Swinney <jswinney@amazon.com>
|
||||||
|
* |
||||||
|
* This file is part of FFmpeg. |
||||||
|
* |
||||||
|
* FFmpeg is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU Lesser General Public |
||||||
|
* License as published by the Free Software Foundation; either
|
||||||
|
* version 2.1 of the License, or (at your option) any later version. |
||||||
|
* |
||||||
|
* FFmpeg is distributed in the hope that it will be useful, |
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||||
|
* Lesser General Public License for more details. |
||||||
|
* |
||||||
|
* You should have received a copy of the GNU Lesser General Public |
||||||
|
* License along with FFmpeg; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||||
|
*/ |
||||||
|
|
||||||
|
#include "libavutil/aarch64/asm.S" |
||||||
|
|
||||||
|
function ff_pix_abs16_neon, export=1 |
||||||
|
// x0 unused |
||||||
|
// x1 uint8_t *pix1 |
||||||
|
// x2 uint8_t *pix2 |
||||||
|
// x3 ptrdiff_t stride |
||||||
|
// w4 int h |
||||||
|
cmp w4, #4 // if h < 4, jump to completion section |
||||||
|
movi v18.4S, #0 // clear result accumulator |
||||||
|
b.lt 2f |
||||||
|
1: |
||||||
|
ld1 {v0.16b}, [x1], x3 // load pix1 |
||||||
|
ld1 {v4.16b}, [x2], x3 // load pix2 |
||||||
|
ld1 {v1.16b}, [x1], x3 // load pix1 |
||||||
|
ld1 {v5.16b}, [x2], x3 // load pix2 |
||||||
|
uabdl v16.8h, v0.8b, v4.8b // absolute difference accumulate |
||||||
|
uabdl2 v17.8h, v0.16b, v4.16b |
||||||
|
ld1 {v2.16b}, [x1], x3 // load pix1 |
||||||
|
ld1 {v6.16b}, [x2], x3 // load pix2 |
||||||
|
uabal v16.8h, v1.8b, v5.8b // absolute difference accumulate |
||||||
|
uabal2 v17.8h, v1.16b, v5.16b |
||||||
|
ld1 {v3.16b}, [x1], x3 |
||||||
|
ld1 {v7.16b}, [x2], x3 |
||||||
|
uabal v16.8h, v2.8b, v6.8b |
||||||
|
uabal2 v17.8h, v2.16b, v6.16b |
||||||
|
sub w4, w4, #4 // h -= 4 |
||||||
|
uabal v16.8h, v3.8b, v7.8b |
||||||
|
uabal2 v17.8h, v3.16b, v7.16b |
||||||
|
cmp w4, #4 // if h >= 4, loop |
||||||
|
add v16.8h, v16.8h, v17.8h |
||||||
|
uaddlv s16, v16.8h // add up everything in v16 accumulator |
||||||
|
add d18, d16, d18 // add to the end result register |
||||||
|
|
||||||
|
b.ge 1b |
||||||
|
cbnz w4, 2f // if iterations remain, jump to completion section |
||||||
|
|
||||||
|
fmov w0, s18 // copy result to general purpose register |
||||||
|
ret |
||||||
|
|
||||||
|
2: |
||||||
|
ld1 {v0.16b}, [x1], x3 // load pix1 |
||||||
|
ld1 {v4.16b}, [x2], x3 // load pix2 |
||||||
|
uabdl v16.8h, v0.8b, v4.8b // absolute difference accumulate |
||||||
|
uabal2 v16.8h, v0.16b, v4.16b |
||||||
|
subs w4, w4, #1 // h -= 1 |
||||||
|
addv h16, v16.8h // add up v16 |
||||||
|
add d18, d16, d18 // add to result |
||||||
|
b.ne 2b |
||||||
|
|
||||||
|
fmov w0, s18 // copy result to general purpose register |
||||||
|
ret |
||||||
|
endfunc |
||||||
|
|
||||||
|
function ff_pix_abs16_xy2_neon, export=1 |
||||||
|
// x0 unused |
||||||
|
// x1 uint8_t *pix1 |
||||||
|
// x2 uint8_t *pix2 |
||||||
|
// x3 ptrdiff_t stride |
||||||
|
// w4 int h |
||||||
|
|
||||||
|
add x5, x2, x3 // use x5 to hold uint8_t *pix3 |
||||||
|
movi v0.2d, #0 // initialize the result register |
||||||
|
|
||||||
|
// Load initial pix2 values for either the unrolled version or completion version. |
||||||
|
ldur q4, [x2, #1] // load pix2+1 |
||||||
|
ldr q3, [x2] // load pix2 |
||||||
|
uaddl v2.8h, v4.8b, v3.8b // pix2 + pix2+1 0..7 |
||||||
|
uaddl2 v3.8h, v4.16b, v3.16b // pix2 + pix2+1 8..15 |
||||||
|
cmp w4, #4 // if h < 4 jump to the completion version |
||||||
|
b.lt 2f |
||||||
|
1: |
||||||
|
// This is an unrolled implementation. It completes 4 iterations of the C for each branch. |
||||||
|
// In each iteration, pix2[i+1] == pix3[i]. This means we need only three loads per iteration, |
||||||
|
// plus two at the beginning to start. |
||||||
|
ldur q5, [x5, #1] // load pix3+1 |
||||||
|
ld1 {v4.16b}, [x5], x3 // load pix3 |
||||||
|
ld1 {v1.16b}, [x1], x3 // load pix1 |
||||||
|
|
||||||
|
ldur q7, [x5, #1] // load pix3+1 |
||||||
|
ld1 {v6.16b}, [x5], x3 // load pix3 |
||||||
|
ld1 {v16.16b}, [x1], x3 // load pix1 |
||||||
|
|
||||||
|
ldur q19, [x5, #1] // load pix3+1 |
||||||
|
ld1 {v18.16b}, [x5], x3 // load pix3 |
||||||
|
ld1 {v17.16b}, [x1], x3 // load pix1 |
||||||
|
|
||||||
|
ldur q22, [x5, #1] // load pix3+1 |
||||||
|
ld1 {v21.16b}, [x5], x3 // load pix3 |
||||||
|
ld1 {v20.16b}, [x1], x3 // load pix1 |
||||||
|
|
||||||
|
// These blocks compute the average: avg(pix2[n], pix2[n+1], pix3[n], pix3[n+1]) |
||||||
|
uaddl v30.8h, v4.8b, v5.8b // pix3 + pix3+1 0..7 |
||||||
|
uaddl2 v31.8h, v4.16b, v5.16b // pix3 + pix3+1 8..15 |
||||||
|
add v23.8h, v2.8h, v30.8h // add up 0..7, using pix2 + pix2+1 values from previous iteration |
||||||
|
add v24.8h, v3.8h, v31.8h // add up 8..15, using pix2 + pix2+1 values from previous iteration |
||||||
|
rshrn v23.8b, v23.8h, #2 // shift right 2 0..7 (rounding shift right) |
||||||
|
rshrn2 v23.16b, v24.8h, #2 // shift right 2 8..15 |
||||||
|
|
||||||
|
uaddl v2.8h, v6.8b, v7.8b // pix3 + pix3+1 0..7 |
||||||
|
uaddl2 v3.8h, v6.16b, v7.16b // pix3 + pix3+1 8..15 |
||||||
|
add v26.8h, v30.8h, v2.8h // add up 0..7, using pix2 + pix2+1 values from pix3 above |
||||||
|
add v27.8h, v31.8h, v3.8h // add up 8..15, using pix2 + pix2+1 values from pix3 above |
||||||
|
rshrn v26.8b, v26.8h, #2 // shift right 2 0..7 (rounding shift right) |
||||||
|
rshrn2 v26.16b, v27.8h, #2 // shift right 2 8..15 |
||||||
|
|
||||||
|
uaddl v4.8h, v18.8b, v19.8b // pix3 + pix3+1 0..7 |
||||||
|
uaddl2 v5.8h, v18.16b, v19.16b // pix3 + pix3+1 8..15 |
||||||
|
add v28.8h, v2.8h, v4.8h // add up 0..7, using pix2 + pix2+1 values from pix3 above |
||||||
|
add v29.8h, v3.8h, v5.8h // add up 8..15, using pix2 + pix2+1 values from pix3 above |
||||||
|
rshrn v28.8b, v28.8h, #2 // shift right 2 0..7 (rounding shift right) |
||||||
|
rshrn2 v28.16b, v29.8h, #2 // shift right 2 8..15 |
||||||
|
|
||||||
|
uaddl v2.8h, v21.8b, v22.8b // pix3 + pix3+1 0..7 |
||||||
|
uaddl2 v3.8h, v21.16b, v22.16b // pix3 + pix3+1 8..15 |
||||||
|
add v30.8h, v4.8h, v2.8h // add up 0..7, using pix2 + pix2+1 values from pix3 above |
||||||
|
add v31.8h, v5.8h, v3.8h // add up 8..15, using pix2 + pix2+1 values from pix3 above |
||||||
|
rshrn v30.8b, v30.8h, #2 // shift right 2 0..7 (rounding shift right) |
||||||
|
rshrn2 v30.16b, v31.8h, #2 // shift right 2 8..15 |
||||||
|
|
||||||
|
// Averages are now stored in these registers: |
||||||
|
// v23, v16, v28, v30 |
||||||
|
// pix1 values in these registers: |
||||||
|
// v1, v16, v17, v20 |
||||||
|
// available: |
||||||
|
// v4, v5, v7, v18, v19, v24, v25, v27, v29, v31 |
||||||
|
|
||||||
|
sub w4, w4, #4 // h -= 4 |
||||||
|
|
||||||
|
// Using absolute-difference instructions instead of absolute-difference-accumulate allows |
||||||
|
// us to keep the results in 16b vectors instead of widening values with twice the instructions. |
||||||
|
// This approach also has fewer data dependencies, allowing better instruction level parallelism. |
||||||
|
uabd v4.16b, v1.16b, v23.16b // absolute difference 0..15, i=0 |
||||||
|
uabd v5.16b, v16.16b, v26.16b // absolute difference 0..15, i=1 |
||||||
|
uabd v6.16b, v17.16b, v28.16b // absolute difference 0..15, i=2 |
||||||
|
uabd v7.16b, v20.16b, v30.16b // absolute difference 0..15, i=3 |
||||||
|
|
||||||
|
cmp w4, #4 // loop if h >= 4 |
||||||
|
|
||||||
|
// Now add up all the values in each vector, v4-v7 with widening adds |
||||||
|
uaddl v19.8h, v4.8b, v5.8b |
||||||
|
uaddl2 v18.8h, v4.16b, v5.16b |
||||||
|
uaddl v4.8h, v6.8b, v7.8b |
||||||
|
uaddl2 v5.8h, v6.16b, v7.16b |
||||||
|
add v4.8h, v4.8h, v5.8h |
||||||
|
add v4.8h, v4.8h, v18.8h |
||||||
|
add v4.8h, v4.8h, v19.8h |
||||||
|
uaddlv s4, v4.8h // finish adding up accumulated values |
||||||
|
add d0, d0, d4 // add the value to the top level accumulator |
||||||
|
|
||||||
|
b.ge 1b |
||||||
|
cbnz w4, 2f // if iterations remain jump to completion section |
||||||
|
|
||||||
|
fmov w0, s0 // copy result to general purpose register |
||||||
|
ret |
||||||
|
2: |
||||||
|
// v2 and v3 are set either at the end of this loop or at from the unrolled version |
||||||
|
// which branches here to complete iterations when h % 4 != 0. |
||||||
|
ldur q5, [x5, #1] // load pix3+1 |
||||||
|
ld1 {v4.16b}, [x5], x3 // load pix3 |
||||||
|
ld1 {v1.16b}, [x1], x3 // load pix1 |
||||||
|
subs w4, w4, #1 // decrement h |
||||||
|
|
||||||
|
uaddl v18.8h, v4.8b, v5.8b // pix3 + pix3+1 0..7 |
||||||
|
uaddl2 v19.8h, v4.16b, v5.16b // pix3 + pix3+1 8..15 |
||||||
|
add v16.8h, v2.8h, v18.8h // add up 0..7, using pix2 + pix2+1 values from previous iteration |
||||||
|
add v17.8h, v3.8h, v19.8h // add up 8..15, using pix2 + pix2+1 values from previous iteration |
||||||
|
// divide by 4 to compute the average of values summed above |
||||||
|
urshr v16.8h, v16.8h, #2 // shift right by 2 0..7 (rounding shift right) |
||||||
|
urshr v17.8h, v17.8h, #2 // shift right by 2 8..15 |
||||||
|
|
||||||
|
uxtl2 v8.8h, v1.16b // 8->16 bits pix1 8..15 |
||||||
|
uxtl v1.8h, v1.8b // 8->16 bits pix1 0..7 |
||||||
|
|
||||||
|
uabd v6.8h, v1.8h, v16.8h // absolute difference 0..7 |
||||||
|
uaba v6.8h, v8.8h, v17.8h // absolute difference accumulate 8..15 |
||||||
|
mov v2.16b, v18.16b // pix3 -> pix2 |
||||||
|
mov v3.16b, v19.16b // pix3+1 -> pix2+1 |
||||||
|
uaddlv s6, v6.8h // add up accumulator in v6 |
||||||
|
add d0, d0, d6 // add to the final result |
||||||
|
|
||||||
|
b.ne 2b // loop if h > 0 |
||||||
|
fmov w0, s0 // copy result to general purpose register |
||||||
|
ret |
||||||
|
endfunc |
@ -0,0 +1,151 @@ |
|||||||
|
/*
|
||||||
|
* |
||||||
|
* This file is part of FFmpeg. |
||||||
|
* |
||||||
|
* FFmpeg is free software; you can redistribute it and/or modify |
||||||
|
* it under the terms of the GNU General Public License as published by |
||||||
|
* the Free Software Foundation; either version 2 of the License, or |
||||||
|
* (at your option) any later version. |
||||||
|
* |
||||||
|
* FFmpeg is distributed in the hope that it will be useful, |
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||||||
|
* GNU General Public License for more details. |
||||||
|
* |
||||||
|
* You should have received a copy of the GNU General Public License along |
||||||
|
* with FFmpeg; if not, write to the Free Software Foundation, Inc., |
||||||
|
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
||||||
|
*/ |
||||||
|
|
||||||
|
#include <string.h> |
||||||
|
|
||||||
|
#include "libavutil/common.h" |
||||||
|
#include "libavutil/intreadwrite.h" |
||||||
|
#include "libavutil/mem_internal.h" |
||||||
|
|
||||||
|
#include "libavcodec/me_cmp.h" |
||||||
|
|
||||||
|
#include "checkasm.h" |
||||||
|
|
||||||
|
static void fill_random(uint8_t *tab, int size) |
||||||
|
{ |
||||||
|
int i; |
||||||
|
for (i = 0; i < size; i++) { |
||||||
|
tab[i] = rnd() % 256; |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
static void test_motion(const char *name, me_cmp_func test_func) |
||||||
|
{ |
||||||
|
/* test configurarion */ |
||||||
|
#define ITERATIONS 16 |
||||||
|
#define WIDTH 64 |
||||||
|
#define HEIGHT 64 |
||||||
|
|
||||||
|
/* motion estimation can look up to 17 bytes ahead */ |
||||||
|
static const int look_ahead = 17; |
||||||
|
|
||||||
|
int i, x, y, d1, d2; |
||||||
|
uint8_t *ptr; |
||||||
|
|
||||||
|
LOCAL_ALIGNED_8(uint8_t, img1, [WIDTH * HEIGHT]); |
||||||
|
LOCAL_ALIGNED_8(uint8_t, img2, [WIDTH * HEIGHT]); |
||||||
|
|
||||||
|
declare_func_emms(AV_CPU_FLAG_MMX, int, struct MpegEncContext *c, |
||||||
|
uint8_t *blk1 /* align width (8 or 16) */, |
||||||
|
uint8_t *blk2 /* align 1 */, ptrdiff_t stride, |
||||||
|
int h); |
||||||
|
|
||||||
|
if (test_func == NULL) { |
||||||
|
return; |
||||||
|
} |
||||||
|
|
||||||
|
/* test correctness */ |
||||||
|
fill_random(img1, WIDTH * HEIGHT); |
||||||
|
fill_random(img2, WIDTH * HEIGHT); |
||||||
|
|
||||||
|
if (check_func(test_func, "%s", name)) { |
||||||
|
for (i = 0; i < ITERATIONS; i++) { |
||||||
|
x = rnd() % (WIDTH - look_ahead); |
||||||
|
y = rnd() % (HEIGHT - look_ahead); |
||||||
|
|
||||||
|
ptr = img2 + y * WIDTH + x; |
||||||
|
d2 = call_ref(NULL, img1, ptr, WIDTH, 8); |
||||||
|
d1 = call_new(NULL, img1, ptr, WIDTH, 8); |
||||||
|
|
||||||
|
if (d1 != d2) { |
||||||
|
fail(); |
||||||
|
printf("func: %s, x=%d y=%d, error: asm=%d c=%d\n", name, x, y, d1, d2); |
||||||
|
break; |
||||||
|
} |
||||||
|
} |
||||||
|
// benchmark with the final value of ptr
|
||||||
|
bench_new(NULL, img1, ptr, WIDTH, 8); |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
#define ME_CMP_1D_ARRAYS(XX) \ |
||||||
|
XX(sad) \
|
||||||
|
XX(sse) \
|
||||||
|
XX(hadamard8_diff) \
|
||||||
|
XX(vsad) \
|
||||||
|
XX(vsse) \
|
||||||
|
XX(nsse) \
|
||||||
|
XX(me_pre_cmp) \
|
||||||
|
XX(me_cmp) \
|
||||||
|
XX(me_sub_cmp) \
|
||||||
|
XX(mb_cmp) \
|
||||||
|
XX(ildct_cmp) \
|
||||||
|
XX(frame_skip_cmp) \
|
||||||
|
XX(median_sad) |
||||||
|
|
||||||
|
// tests for functions not yet implemented
|
||||||
|
#if 0 |
||||||
|
XX(dct_sad) \
|
||||||
|
XX(quant_psnr) \
|
||||||
|
XX(bit) \
|
||||||
|
XX(rd) \
|
||||||
|
XX(w53) \
|
||||||
|
XX(w97) \
|
||||||
|
XX(dct_max) \
|
||||||
|
XX(dct264_sad) \
|
||||||
|
|
||||||
|
#endif |
||||||
|
|
||||||
|
static void check_motion(void) |
||||||
|
{ |
||||||
|
char buf[64]; |
||||||
|
AVCodecContext *av_ctx; |
||||||
|
MECmpContext me_ctx; |
||||||
|
|
||||||
|
memset(&me_ctx, 0, sizeof(me_ctx)); |
||||||
|
|
||||||
|
/* allocate AVCodecContext */ |
||||||
|
av_ctx = avcodec_alloc_context3(NULL); |
||||||
|
av_ctx->flags |= AV_CODEC_FLAG_BITEXACT; |
||||||
|
|
||||||
|
ff_me_cmp_init(&me_ctx, av_ctx); |
||||||
|
|
||||||
|
for (int i = 0; i < FF_ARRAY_ELEMS(me_ctx.pix_abs); i++) { |
||||||
|
for (int j = 0; j < FF_ARRAY_ELEMS(me_ctx.pix_abs[0]); j++) { |
||||||
|
snprintf(buf, sizeof(buf), "pix_abs_%d_%d", i, j); |
||||||
|
test_motion(buf, me_ctx.pix_abs[i][j]); |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
#define XX(me_cmp_array) \ |
||||||
|
for (int i = 0; i < FF_ARRAY_ELEMS(me_ctx.me_cmp_array); i++) { \
|
||||||
|
snprintf(buf, sizeof(buf), #me_cmp_array "_%d", i); \
|
||||||
|
test_motion(buf, me_ctx.me_cmp_array[i]); \
|
||||||
|
} |
||||||
|
ME_CMP_1D_ARRAYS(XX) |
||||||
|
#undef XX |
||||||
|
|
||||||
|
avcodec_free_context(&av_ctx); |
||||||
|
} |
||||||
|
|
||||||
|
void checkasm_check_motion(void) |
||||||
|
{ |
||||||
|
check_motion(); |
||||||
|
report("motion"); |
||||||
|
} |
Loading…
Reference in new issue