swscale/aarch64/rgb2rgb: add deinterleaveBytes neon implementation

A55               A76
deinterleave_bytes_c:             70342.0           34497.5
deinterleave_bytes_neon:          21594.5 ( 3.26x)   5535.2 ( 6.23x)
deinterleave_bytes_aligned_c:     71340.8           34651.2
deinterleave_bytes_aligned_neon:   8616.8 ( 8.28x)   3996.2 ( 8.67x)
release/7.1
Ramiro Polla 5 months ago
parent c08bb33e41
commit d8848325a6
  1. 4
      libswscale/aarch64/rgb2rgb.c
  2. 59
      libswscale/aarch64/rgb2rgb_neon.S

@ -30,6 +30,9 @@
void ff_interleave_bytes_neon(const uint8_t *src1, const uint8_t *src2, void ff_interleave_bytes_neon(const uint8_t *src1, const uint8_t *src2,
uint8_t *dest, int width, int height, uint8_t *dest, int width, int height,
int src1Stride, int src2Stride, int dstStride); int src1Stride, int src2Stride, int dstStride);
/* NEON routine that splits each row of interleaved byte pairs in src into two
 * planes: the first byte of every pair goes to dst1, the second to dst2.
 * width is the number of byte pairs per row and height the number of rows.
 * Implemented in libswscale/aarch64/rgb2rgb_neon.S and installed as
 * deinterleaveBytes when NEON is available. */
void ff_deinterleave_bytes_neon(const uint8_t *src, uint8_t *dst1, uint8_t *dst2,
int width, int height, int srcStride,
int dst1Stride, int dst2Stride);
av_cold void rgb2rgb_init_aarch64(void) av_cold void rgb2rgb_init_aarch64(void)
{ {
@ -37,5 +40,6 @@ av_cold void rgb2rgb_init_aarch64(void)
if (have_neon(cpu_flags)) { if (have_neon(cpu_flags)) {
interleaveBytes = ff_interleave_bytes_neon; interleaveBytes = ff_interleave_bytes_neon;
deinterleaveBytes = ff_deinterleave_bytes_neon;
} }
} }

@ -77,3 +77,62 @@ function ff_interleave_bytes_neon, export=1
0: 0:
ret ret
endfunc endfunc
// void ff_deinterleave_bytes_neon(const uint8_t *src, uint8_t *dst1, uint8_t *dst2,
//                                 int width, int height, int srcStride,
//                                 int dst1Stride, int dst2Stride);
//
// For each of `height` rows, reads `width` interleaved byte pairs from src and
// stores the first byte of every pair to dst1 and the second byte to dst2.
//
// In:    x0 = src, x1 = dst1, x2 = dst2, w3 = width (pairs per row),
//        w4 = height (rows), w5 = srcStride, w6 = dst1Stride, w7 = dst2Stride
// Out:   nothing
// Clobb: x0-x2, w4-w10, v0, v1, flags (no stack use, leaf function)
//
// A non-positive height or width returns immediately without touching memory.
// The row loop below exits only when the row counter reaches exactly zero, so
// without this guard height == 0 would wrap the counter and run far past the
// buffers.
function ff_deinterleave_bytes_neon, export=1
        cmp             w4, #0
        b.le            0f                              // no rows requested
        cmp             w3, #0
        b.le            0f                              // no pairs per row

        // Each row pass advances src by 2*width and dst1/dst2 by width through
        // post-increment addressing, so turn the strides into the remaining
        // distance from the end of one row to the start of the next.
        sub             w5, w5, w3, lsl #1
        sub             w6, w6, w3
        sub             w7, w7, w3

1:      // ---- start of a row ----
        ands            w8, w3, #0xfffffff0             // w8 = width & ~15
        b.eq            3f                              // fewer than 16 pairs: tail only

2:      // 16 pairs per iteration; ld2 de-interleaves into v0 (first bytes) and
        // v1 (second bytes). The stores do not touch the flags, so b.gt still
        // sees the result of the subs.
        ld2             {v0.16b, v1.16b}, [x0], #32
        subs            w8, w8, #16
        st1             {v0.16b}, [x1], #16
        st1             {v1.16b}, [x2], #16
        b.gt            2b

        tst             w3, #15
        b.eq            9f                              // width is a multiple of 16: row done

3:      // 8 remaining pairs
        tst             w3, #8
        b.eq            4f
        ld2             {v0.8b, v1.8b}, [x0], #16
        st1             {v0.8b}, [x1], #8
        st1             {v1.8b}, [x2], #8

4:      // 4 remaining pairs: load 8 bytes and treat them as 16-bit lanes.
        tst             w3, #4
        b.eq            5f
        ld1             {v0.8b}, [x0], #8
        shrn            v1.8b, v0.8h, #8                // high byte of each lane; must run before xtn overwrites v0
        xtn             v0.8b, v0.8h                    // low byte of each lane
        st1             {v0.s}[0], [x1], #4
        st1             {v1.s}[0], [x2], #4

5:      // 1..3 remaining pairs, one halfword at a time
        ands            w8, w3, #3
        b.eq            9f
6:
        ldrh            w9, [x0], #2                    // low byte = first byte in memory (little-endian access)
        subs            w8, w8, #1
        ubfx            w10, w9, #8, #8                 // w10 = second byte of the pair
        strb            w9, [x1], #1
        strb            w10, [x2], #1
        b.gt            6b

9:      // ---- end of a row ----
        subs            w4, w4, #1
        b.eq            0f
        add             x0, x0, w5, sxtw                // sign-extend: the adjusted strides may be negative
        add             x1, x1, w6, sxtw
        add             x2, x2, w7, sxtw
        b               1b
0:
        ret
endfunc

Loading…
Cancel
Save