aarch64: vp8: Port vp8_luma_dc_wht and vp8_idct_dc_add4uv from arm version

Cortex A53    A72    A73
vp8_luma_dc_wht_c:        115.7   75.7   90.7
vp8_luma_dc_wht_neon:      60.7   41.2   45.7
vp8_idct_dc_add4uv_c:     376.1  262.9  282.5
vp8_idct_dc_add4uv_neon:   52.0   29.0   37.0

Signed-off-by: Martin Storsjö <martin@martin.st>
pull/310/head
Martin Storsjö 6 years ago
parent c513fcd7d2
commit 52c9b0a6c0
  1. 3
      libavcodec/aarch64/vp8dsp_init_aarch64.c
  2. 109
      libavcodec/aarch64/vp8dsp_neon.S

@ -28,6 +28,7 @@ void ff_vp8_luma_dc_wht_neon(int16_t block[4][4][16], int16_t dc[16]);
void ff_vp8_idct_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
void ff_vp8_idct_dc_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
void ff_vp8_idct_dc_add4y_neon(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
void ff_vp8_idct_dc_add4uv_neon(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
VP8_LF(neon);
@ -55,10 +56,12 @@ av_cold void ff_vp8dsp_init_aarch64(VP8DSPContext *dsp)
{
if (!have_neon(av_get_cpu_flags()))
return;
dsp->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_neon;
dsp->vp8_idct_add = ff_vp8_idct_add_neon;
dsp->vp8_idct_dc_add = ff_vp8_idct_dc_add_neon;
dsp->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_neon;
dsp->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_neon;
dsp->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_neon;
dsp->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_neon;

@ -4,6 +4,7 @@
* Copyright (c) 2010 Rob Clark <rob@ti.com>
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
* Copyright (c) 2018 Magnus Röös <mla2.roos@gmail.com>
* Copyright (c) 2019 Martin Storsjo <martin@martin.st>
*
* This file is part of Libav.
*
@ -25,6 +26,62 @@
#include "libavutil/aarch64/asm.S"
#include "neon.S"
// void ff_vp8_luma_dc_wht_neon(int16_t block[4][4][16], int16_t dc[16])
//
// Runs two add/sub butterfly passes over the 16 int16 values at dc (with a
// 4x4 transpose between the passes), adds 3, arithmetic-shifts right by 3,
// and writes each of the 16 results to the first halfword of 16 consecutive
// 32-byte (16 x int16) blocks starting at block. dc is overwritten with
// zeros once it has been loaded.
//
// In:   x0 = block (written, advanced), x1 = dc (read, zeroed, advanced)
// Uses: x3, v0-v7, v16, v30 (all caller-saved under AAPCS64); no stack use
function ff_vp8_luma_dc_wht_neon, export=1
// v0 = dc[0..3], v1 = dc[4..7], v2 = dc[8..11], v3 = dc[12..15]
ld1 {v0.4h - v3.4h}, [x1]
movi v30.8h, #0
// First pass: lane-wise butterflies across the four registers, i.e. lane i
// combines dc[i], dc[4+i], dc[8+i], dc[12+i]. The two zeroing stores to dc
// are interleaved with the arithmetic.
add v4.4h, v0.4h, v3.4h
add v6.4h, v1.4h, v2.4h
// clear dc[0..7] (16 bytes), post-increment x1 by 16
st1 {v30.8h}, [x1], #16
sub v7.4h, v1.4h, v2.4h
sub v5.4h, v0.4h, v3.4h
// clear dc[8..15]
st1 {v30.8h}, [x1]
add v0.4h, v4.4h, v6.4h
add v1.4h, v5.4h, v7.4h
sub v2.4h, v4.4h, v6.4h
sub v3.4h, v5.4h, v7.4h
// Rounding bias for the final >> 3; applied to v0 only (see below).
movi v16.4h, #3
// Macro from neon.S (not shown here); presumably transposes the 4x4
// halfword matrix held in v0-v3 using v4-v7 as scratch -- confirm there.
transpose_4x4H v0, v1, v2, v3, v4, v5, v6, v7
// v0 enters every second-pass output with a positive sign, so biasing v0
// alone adds 3 to all 16 results.
add v0.4h, v0.4h, v16.4h
// Second pass: same butterfly structure as the first pass.
add v4.4h, v0.4h, v3.4h
add v6.4h, v1.4h, v2.4h
sub v7.4h, v1.4h, v2.4h
sub v5.4h, v0.4h, v3.4h
add v0.4h, v4.4h, v6.4h
add v1.4h, v5.4h, v7.4h
sub v2.4h, v4.4h, v6.4h
sub v3.4h, v5.4h, v7.4h
// Arithmetic (sign-preserving) shift: result = (value + 3) >> 3
sshr v0.4h, v0.4h, #3
sshr v1.4h, v1.4h, #3
sshr v2.4h, v2.4h, #3
sshr v3.4h, v3.4h, #3
// Output stride: 32 bytes = one int16_t[16] sub-block of block[4][4][16].
mov x3, #32
// Scatter one halfword per sub-block: the k-th store (k = 4*lane + reg)
// writes v<reg>.h[lane] to byte offset 32*k from the original x0.
st1 {v0.h}[0], [x0], x3
st1 {v1.h}[0], [x0], x3
st1 {v2.h}[0], [x0], x3
st1 {v3.h}[0], [x0], x3
st1 {v0.h}[1], [x0], x3
st1 {v1.h}[1], [x0], x3
st1 {v2.h}[1], [x0], x3
st1 {v3.h}[1], [x0], x3
st1 {v0.h}[2], [x0], x3
st1 {v1.h}[2], [x0], x3
st1 {v2.h}[2], [x0], x3
st1 {v3.h}[2], [x0], x3
st1 {v0.h}[3], [x0], x3
st1 {v1.h}[3], [x0], x3
st1 {v2.h}[3], [x0], x3
st1 {v3.h}[3], [x0], x3
ret
endfunc
function ff_vp8_idct_add_neon, export=1
ld1 {v0.8b - v3.8b}, [x1]
mov w4, #20091
@ -102,6 +159,58 @@ function ff_vp8_idct_add_neon, export=1
ret
endfunc
// void ff_vp8_idct_dc_add4uv_neon(uint8_t *dst, int16_t block[4][16],
//                                 ptrdiff_t stride)
//
// Reads the first int16 of each of the four 32-byte blocks at block (and
// zeroes it in place), rounds each to (dc + 4) >> 3, and adds the results to
// an 8-byte-wide, 8-row pixel area at dst with unsigned-byte saturation:
//   rows 0-3: bytes 0-3 get dc of block 0, bytes 4-7 get dc of block 1
//   rows 4-7: bytes 0-3 get dc of block 2, bytes 4-7 get dc of block 3
//
// In:   x0 = dst (read, advanced), x1 = block (advanced), x2 = stride
// Uses: x3, v0-v7, v16-v27 (all caller-saved under AAPCS64); no stack use
function ff_vp8_idct_dc_add4uv_neon, export=1
// v0 supplies the zero halfword used to clear each consumed dc value.
movi v0.4h, #0
// Distance between successive blocks: 16 x int16 = 32 bytes.
mov x3, #32
// For each block: broadcast its first halfword to 4 lanes, then store zero
// over it and step x1 to the next block.
ld1r {v16.4h}, [x1]
st1 {v0.h}[0], [x1], x3
ld1r {v17.4h}, [x1]
st1 {v0.h}[0], [x1], x3
ld1r {v18.4h}, [x1]
st1 {v0.h}[0], [x1], x3
ld1r {v19.4h}, [x1]
st1 {v0.h}[0], [x1], x3
// v16 = { dc0 x4, dc1 x4 }, v18 = { dc2 x4, dc3 x4 }
ins v16.d[1], v17.d[0]
ins v18.d[1], v19.d[0]
// x0 is advanced by the row loads below; x3 keeps the original dst as the
// store pointer.
mov x3, x0
// From here on, row loads, widening adds, narrowing and stores are
// interleaved; v0/v2/v4/v6 are reused as 16-bit sums once their pixel row
// has been consumed, so instruction order matters.
srshr v16.8h, v16.8h, #3 // dc = (dc + 4) >> 3 (srshr is a rounding shift)
ld1 {v0.8b}, [x0], x2
srshr v18.8h, v18.8h, #3
ld1 {v1.8b}, [x0], x2
// uaddw: zero-extend the 8 pixel bytes to 16 bits and add the dc vector.
// Rows 0-3 use v16 (dc0|dc1).
uaddw v20.8h, v16.8h, v0.8b
ld1 {v2.8b}, [x0], x2
uaddw v0.8h, v16.8h, v1.8b
ld1 {v3.8b}, [x0], x2
uaddw v22.8h, v16.8h, v2.8b
ld1 {v4.8b}, [x0], x2
uaddw v2.8h, v16.8h, v3.8b
ld1 {v5.8b}, [x0], x2
// Rows 4-7 use v18 (dc2|dc3).
uaddw v24.8h, v18.8h, v4.8b
ld1 {v6.8b}, [x0], x2
uaddw v4.8h, v18.8h, v5.8b
ld1 {v7.8b}, [x0], x2
uaddw v26.8h, v18.8h, v6.8b
// sqxtun: narrow signed 16-bit sums to unsigned bytes, saturating to 0..255.
sqxtun v20.8b, v20.8h
uaddw v6.8h, v18.8h, v7.8b
sqxtun v21.8b, v0.8h
sqxtun v22.8b, v22.8h
// Write the 8 result rows back through x3, in the same order as loaded.
st1 {v20.8b}, [x3], x2
sqxtun v23.8b, v2.8h
st1 {v21.8b}, [x3], x2
sqxtun v24.8b, v24.8h
st1 {v22.8b}, [x3], x2
sqxtun v25.8b, v4.8h
st1 {v23.8b}, [x3], x2
sqxtun v26.8b, v26.8h
st1 {v24.8b}, [x3], x2
sqxtun v27.8b, v6.8h
st1 {v25.8b}, [x3], x2
st1 {v26.8b}, [x3], x2
st1 {v27.8b}, [x3], x2
ret
endfunc
function ff_vp8_idct_dc_add4y_neon, export=1
movi v0.16b, #0
mov x3, #32

Loading…
Cancel
Save