avfilter/vf_bwdif: Add neon for filter_line

Exports C filter_line needed for tail fixup of neon code
Adds neon for filter_line

Signed-off-by: John Cox <jc@kynesim.co.uk>
Signed-off-by: Martin Storsjö <martin@martin.st>
pull/389/head
John Cox 2 years ago committed by Martin Storsjö
parent 8130df83e0
commit 94cb94a2c0
  1. 21
      libavfilter/aarch64/vf_bwdif_init_aarch64.c
  2. 203
      libavfilter/aarch64/vf_bwdif_neon.S
  3. 5
      libavfilter/bwdif.h
  4. 10
      libavfilter/vf_bwdif.c

@ -31,6 +31,26 @@ void ff_bwdif_filter_edge_neon(void *dst1, void *prev1, void *cur1, void *next1,
void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs, void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs,
int prefs3, int mrefs3, int parity, int clip_max); int prefs3, int mrefs3, int parity, int clip_max);
void ff_bwdif_filter_line_neon(void *dst1, void *prev1, void *cur1, void *next1,
int w, int prefs, int mrefs, int prefs2, int mrefs2,
int prefs3, int mrefs3, int prefs4, int mrefs4,
int parity, int clip_max);
static void filter_line_helper(void *dst1, void *prev1, void *cur1, void *next1,
int w, int prefs, int mrefs, int prefs2, int mrefs2,
int prefs3, int mrefs3, int prefs4, int mrefs4,
int parity, int clip_max)
{
const int w0 = clip_max != 255 ? 0 : w & ~15;
ff_bwdif_filter_line_neon(dst1, prev1, cur1, next1,
w0, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max);
if (w0 < w)
ff_bwdif_filter_line_c((char *)dst1 + w0, (char *)prev1 + w0, (char *)cur1 + w0, (char *)next1 + w0,
w - w0, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max);
}
static void filter_edge_helper(void *dst1, void *prev1, void *cur1, void *next1, static void filter_edge_helper(void *dst1, void *prev1, void *cur1, void *next1,
int w, int prefs, int mrefs, int prefs2, int mrefs2, int w, int prefs, int mrefs, int prefs2, int mrefs2,
@ -71,6 +91,7 @@ ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth)
return; return;
s->filter_intra = filter_intra_helper; s->filter_intra = filter_intra_helper;
s->filter_line = filter_line_helper;
s->filter_edge = filter_edge_helper; s->filter_edge = filter_edge_helper;
} }

@ -154,6 +154,209 @@ const coeffs, align=4 // align 4 means align on 2^4 boundry
.hword 5077, 981 // sp[0] = v0.h[6] .hword 5077, 981 // sp[0] = v0.h[6]
endconst endconst
// ===========================================================================
//
// void filter_line(
// void *dst1, // x0
// void *prev1, // x1
// void *cur1, // x2
// void *next1, // x3
// int w, // w4
// int prefs, // w5
// int mrefs, // w6
// int prefs2, // w7
// int mrefs2, // [sp, #0]
// int prefs3, // [sp, #SP_INT]
// int mrefs3, // [sp, #SP_INT*2]
// int prefs4, // [sp, #SP_INT*3]
// int mrefs4, // [sp, #SP_INT*4]
// int parity, // [sp, #SP_INT*5]
// int clip_max) // [sp, #SP_INT*6]
function ff_bwdif_filter_line_neon, export=1
// Sanity check w
cmp w4, #0
ble 99f
// Rearrange regs to be the same as line3 for ease of debug!
mov w10, w4 // w10 = loop count
mov w9, w6 // w9 = mref
mov w12, w7 // w12 = pref2
mov w11, w5 // w11 = pref
ldr w8, [sp, #0] // w8 = mref2
ldr w7, [sp, #SP_INT*2] // w7 = mref3
ldr w6, [sp, #SP_INT*4] // w6 = mref4
ldr w13, [sp, #SP_INT] // w13 = pref3
ldr w14, [sp, #SP_INT*3] // w14 = pref4
mov x4, x3
mov x3, x2
mov x2, x1
LDR_COEFFS v0, x17
// #define prev2 cur
// const uint8_t * restrict next2 = parity ? prev : next;
ldr w17, [sp, #SP_INT*5] // parity
cmp w17, #0
csel x17, x2, x4, ne
PUSH_VREGS
// for (x = 0; x < w; x++) {
// int diff0, diff2;
// int d0, d2;
// int temporal_diff0, temporal_diff2;
//
// int i1, i2;
// int j1, j2;
// int p6, p5, p4, p3, p2, p1, c0, m1, m2, m3, m4;
10:
// c0 = prev2[0] + next2[0]; // c0 = v20, v21
// d0 = c0 >> 1; // d0 = v10
// temporal_diff0 = FFABS(prev2[0] - next2[0]); // td0 = v11
ldr q31, [x3]
ldr q21, [x17]
uhadd v10.16b, v31.16b, v21.16b
uabd v11.16b, v31.16b, v21.16b
uaddl v20.8h, v21.8b, v31.8b
uaddl2 v21.8h, v21.16b, v31.16b
ldr q31, [x3, w6, sxtw]
ldr q23, [x17, w6, sxtw]
// i1 = coef_hf[0] * c0; // i1 = v2-v5
UMULL4K v2, v3, v4, v5, v20, v21, v0.h[2]
ldr q30, [x3, w14, sxtw]
ldr q25, [x17, w14, sxtw]
// m4 = prev2[mrefs4] + next2[mrefs4]; // m4 = v22,v23
uaddl v22.8h, v23.8b, v31.8b
uaddl2 v23.8h, v23.16b, v31.16b
// p4 = prev2[prefs4] + next2[prefs4]; // p4 = v24,v25, (p4 >> 1) = v12
uhadd v12.16b, v25.16b, v30.16b
uaddl v24.8h, v25.8b, v30.8b
uaddl2 v25.8h, v25.16b, v30.16b
// m3 = cur[mrefs3]; // m3 = v20
ldr q20, [x3, w7, sxtw]
// p3 = cur[prefs3]; // p3 = v21
ldr q21, [x3, w13, sxtw]
// i1 += coef_hf[2] * (m4 + p4); // (-m4:v22,v23) (-p4:v24,v25)
add v22.8h, v22.8h, v24.8h
add v23.8h, v23.8h, v25.8h
UMLAL4K v2, v3, v4, v5, v22, v23, v0.h[4]
ldr q29, [x3, w8, sxtw]
ldr q23, [x17, w8, sxtw]
// i1 -= coef_lf[1] * 4 * (m3 + p3); // -
uaddl v30.8h, v20.8b, v21.8b
uaddl2 v31.8h, v20.16b, v21.16b
UMLSL4K v2, v3, v4, v5, v30, v31, v0.h[1]
ldr q31, [x3, w12, sxtw]
ldr q27, [x17, w12, sxtw]
// m2 = prev2[mrefs2] + next2[mrefs2]; // m2 = v22,v23, (m2 >> 1) = v13
uhadd v13.16b, v23.16b, v29.16b
uaddl v22.8h, v23.8b, v29.8b
uaddl2 v23.8h, v23.16b, v29.16b
// m1 = cur[mrefs]; // m1 = v24
ldr q24, [x3, w9, sxtw]
// p2 = prev2[prefs2] + next2[prefs2]; // p2 = v26, v27
// temporal_diff2 = FFABS(prev2[prefs2] - next2[prefs2]); // td2 = v14
// d2 = p2 >> 1; // d2 = v15
uabd v14.16b, v31.16b, v27.16b
uhadd v15.16b, v31.16b, v27.16b
uaddl v26.8h, v27.8b, v31.8b
uaddl2 v27.8h, v27.16b, v31.16b
// i1 -= coef_hf[1] * (m2 + p2); // (-m2:v22,v23*) (-p2:v26*,v27*)
add v22.8h, v22.8h, v26.8h
add v23.8h, v23.8h, v27.8h
UMLSL4K v2, v3, v4, v5, v22, v23, v0.h[3]
// p1 = cur[prefs]; // p1 = v22
ldr q22, [x3, w11, sxtw]
// i2 = (coef_sp[0] * (m1 + p1) - coef_sp[1] * (m3 + p3)) >> 13; // (-m3:v20*) i2=v17
uaddl v18.8h, v22.8b, v24.8b
uaddl2 v19.8h, v22.16b, v24.16b
UMULL4K v28, v29, v30, v31, v18, v19, v0.h[6]
uaddl v18.8h, v20.8b, v21.8b
uaddl2 v19.8h, v20.16b, v21.16b
UMLSL4K v28, v29, v30, v31, v18, v19, v0.h[7]
SQSHRUNN v17, v28, v29, v30, v31, 13
// i1 += coef_lf[0] * 4 * (m1 + p1); // p1 = v22, m1 = v24
uaddl v26.8h, v24.8b, v22.8b
uaddl2 v27.8h, v24.16b, v22.16b
UMLAL4K v2, v3, v4, v5, v26, v27, v0.h[0]
ldr q31, [x2, w9, sxtw]
ldr q29, [x4, w9, sxtw]
ldr q30, [x2, w11, sxtw]
ldr q28, [x4, w11, sxtw]
// i1 >>= 15; // i1 = v2, -v3, -v4*, -v5*
SQSHRUNN v2, v2, v3, v4, v5, 15
// {
// int t1 =(FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1;
// int t2 =(FFABS(next[mrefs] - m1) + FFABS(next[prefs] - p1)) >> 1;
uabd v30.16b, v22.16b, v30.16b
uabd v31.16b, v24.16b, v31.16b
uabd v28.16b, v22.16b, v28.16b
uabd v29.16b, v24.16b, v29.16b
uhadd v31.16b, v31.16b, v30.16b
uhadd v29.16b, v29.16b, v28.16b
// diff0 = FFMAX3(temporal_diff0 >> 1, t1, t2); // diff0=v18
ushr v18.16b, v11.16b, #1
umax v18.16b, v18.16b, v31.16b
umax v18.16b, v18.16b, v29.16b
// diff0 = v18, (m2 >> 1) = v13, m1 = v24, d0 = v10, p1 = v22, d2 = v15
SPAT_CHECK v18, v13, v24, v10, v22, v15, v31, v30, v29, v28
// i1 = v2, i2 = v17, m1 = v24, d0 = v10, p1 = v22, td2 = v11, diff2 = v18
INTERPOL v2, v2, v17, v24, v10, v22, v11, v18, v31, v30, v29
// dst[0] = av_clip_uint8(interpol);
str q2, [x0], #16
// }
//
// dst++;
// cur++;
// prev++;
// prev2++;
// next++;
// }
subs w10, w10, #16
add x2, x2, #16
add x3, x3, #16
add x4, x4, #16
add x17, x17, #16
bgt 10b
POP_VREGS
99:
ret
endfunc
// ============================================================================ // ============================================================================
// //
// void ff_bwdif_filter_edge_neon( // void ff_bwdif_filter_edge_neon(

@ -48,4 +48,9 @@ void ff_bwdif_filter_edge_c(void *dst1, void *prev1, void *cur1, void *next1,
void ff_bwdif_filter_intra_c(void *dst1, void *cur1, int w, int prefs, int mrefs, void ff_bwdif_filter_intra_c(void *dst1, void *cur1, int w, int prefs, int mrefs,
int prefs3, int mrefs3, int parity, int clip_max); int prefs3, int mrefs3, int parity, int clip_max);
void ff_bwdif_filter_line_c(void *dst1, void *prev1, void *cur1, void *next1,
int w, int prefs, int mrefs, int prefs2, int mrefs2,
int prefs3, int mrefs3, int prefs4, int mrefs4,
int parity, int clip_max);
#endif /* AVFILTER_BWDIF_H */ #endif /* AVFILTER_BWDIF_H */

@ -132,10 +132,10 @@ void ff_bwdif_filter_intra_c(void *dst1, void *cur1, int w, int prefs, int mrefs
FILTER_INTRA() FILTER_INTRA()
} }
static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1, void ff_bwdif_filter_line_c(void *dst1, void *prev1, void *cur1, void *next1,
int w, int prefs, int mrefs, int prefs2, int mrefs2, int w, int prefs, int mrefs, int prefs2, int mrefs2,
int prefs3, int mrefs3, int prefs4, int mrefs4, int prefs3, int mrefs3, int prefs4, int mrefs4,
int parity, int clip_max) int parity, int clip_max)
{ {
uint8_t *dst = dst1; uint8_t *dst = dst1;
uint8_t *prev = prev1; uint8_t *prev = prev1;
@ -363,7 +363,7 @@ av_cold void ff_bwdif_init_filter_line(BWDIFContext *s, int bit_depth)
s->filter_edge = filter_edge_16bit; s->filter_edge = filter_edge_16bit;
} else { } else {
s->filter_intra = ff_bwdif_filter_intra_c; s->filter_intra = ff_bwdif_filter_intra_c;
s->filter_line = filter_line_c; s->filter_line = ff_bwdif_filter_line_c;
s->filter_edge = ff_bwdif_filter_edge_c; s->filter_edge = ff_bwdif_filter_edge_c;
} }

Loading…
Cancel
Save