sws/input: R-V V 32-bit RGB to halved UV

T-Head C908:
abgr_to_uv_half_8_c:            2.2
abgr_to_uv_half_8_rvv_i32:      3.5
abgr_to_uv_half_128_c:         44.0
abgr_to_uv_half_128_rvv_i32:   13.0
abgr_to_uv_half_1080_c:       245.0
abgr_to_uv_half_1080_rvv_i32: 107.2
abgr_to_uv_half_1920_c:       406.2
abgr_to_uv_half_1920_rvv_i32: 188.7
bgra_to_uv_half_8_c:            2.2
bgra_to_uv_half_8_rvv_i32:      3.5
bgra_to_uv_half_128_c:         26.5
bgra_to_uv_half_128_rvv_i32:   13.0
bgra_to_uv_half_1080_c:       219.7
bgra_to_uv_half_1080_rvv_i32: 107.0
bgra_to_uv_half_1920_c:       406.7
bgra_to_uv_half_1920_rvv_i32: 188.7

SpacemiT X60:
abgr_to_uv_half_8_c:           2.2
abgr_to_uv_half_8_rvv_i32:     3.0
abgr_to_uv_half_128_c:        28.2
abgr_to_uv_half_128_rvv_i32:   5.7
abgr_to_uv_half_1080_c:      235.5
abgr_to_uv_half_1080_rvv_i32: 47.7
abgr_to_uv_half_1920_c:      418.2
abgr_to_uv_half_1920_rvv_i32: 84.0
bgra_to_uv_half_8_c:           2.0
bgra_to_uv_half_8_rvv_i32:     3.0
bgra_to_uv_half_128_c:        23.7
bgra_to_uv_half_128_rvv_i32:   5.7
bgra_to_uv_half_1080_c:      195.5
bgra_to_uv_half_1080_rvv_i32: 47.7
bgra_to_uv_half_1920_c:      346.5
bgra_to_uv_half_1920_rvv_i32: 84.0
release/7.1
Rémi Denis-Courmont 5 months ago
parent e2f069905e
commit 7a3369398f
  1. 61
      libswscale/riscv/input_rvv.S
  2. 16
      libswscale/riscv/swscale.c

@ -242,6 +242,67 @@ func ff_\chr0\()ToUV_rvv, zve32x
ret
endfunc
/*
 * Entry point for the chr1 channel order of the half-width UV conversion.
 *
 * Loads the four 32-bit coefficients whose role depends on the R/B order
 * from the table addressed by a6. Compared with the chr0 entry point that
 * follows, the R and B table slots are exchanged between t1/t3 (U) and
 * t4/t6 (V). It then branches forward to local label 1 inside that
 * function, so both entry points share the G coefficient loads and the
 * vector loop. This stub has no ret of its own.
 *
 * NOTE(review): a6 is presumably the rgb2yuv coefficient table handed to
 * the chrToYV12 callback - confirm against the C prototype.
 */
func ff_\chr1\()ToUV_half_rvv, zve32x
lw t1, 20(a6) # BU
lw t4, 32(a6) # BV
lw t3, 12(a6) # RU
lw t6, 24(a6) # RV
/* Continue in the shared body (label 1 of the chr0 entry point). */
j 1f
endfunc
/*
 * Entry point for the chr0 channel order of the half-width UV conversion,
 * plus the body shared with the chr1 entry point (which jumps to label 1).
 *
 * Each output sample is built from two adjacent 32-bit source words: the
 * per-component byte values of the pair are summed, multiplied by the U and
 * V coefficients, biased, shifted right and stored as 16-bit values.
 *
 * Registers as used below:
 *   a0  U output pointer, advanced by 2 bytes per sample
 *   a1  V output pointer, advanced by 2 bytes per sample
 *   a3  source pointer, advanced by 8 bytes per sample
 *   a5  number of output samples still to produce
 *   a6  coefficient table on entry, then overwritten with the 0xff mask
 *   a7  overwritten with the bias/rounding constant
 *   t0  vector length granted for the current iteration
 *   t1, t2, t3  U coefficients for the v8, v16 and v24 sums
 *   t4, t5, t6  V coefficients for the v8, v16 and v24 sums
 *
 * NOTE(review): a0/a1/a3/a5/a6 presumably map to dstU/dstV/src/width/rgb2yuv
 * of the chrToYV12 callback - confirm against the C prototype.
 */
func ff_\chr0\()ToUV_half_rvv, zve32x
lw t1, 12(a6) # RU
lw t4, 24(a6) # RV
lw t3, 20(a6) # BU
lw t6, 32(a6) # BV
1:
/* Shared by both entry points: the G coefficients do not depend on order. */
lw t2, 16(a6) # GU
lw t5, 28(a6) # GV
/* The table pointer is no longer needed; a6 now holds the byte mask. */
li a6, 0xff
/* Bias plus rounding term added to every U and V sum before the shift. */
li a7, (256 << 15) + (1 << (15 - 6))
2:
/* One iteration handles t0 output samples, i.e. t0 pairs of 32-bit words. */
vsetvli t0, a5, e32, m4, ta, ma
/* Two-field segment load: first word of each pair in v0, second in v4. */
vlseg2e32.v v0, (a3)
sub a5, a5, t0
/*
 * First component: the top byte when the macro argument high is set,
 * otherwise the low byte. v8 comes from the first word, v12 from the second.
 */
.if \high
vsrl.vi v8, v0, 24
vsrl.vi v12, v4, 24
.else
vand.vx v8, v0, a6
vand.vx v12, v4, a6
.endif
/* a3 += 8 * t0: two 32-bit words were consumed per output sample. */
sh3add a3, t0, a3
/* Component scaled by GU/GV: bits 8..15, or 16..23 when high is set. */
vsrl.vi v16, v0, 8 * (1 + \high)
vsrl.vi v20, v4, 8 * (1 + \high)
/* Remaining component: bits 16..23, or 8..15 when high is set. */
vsrl.vi v24, v0, 8 * (2 - \high)
vsrl.vi v28, v4, 8 * (2 - \high)
/* Keep only the low 8 bits of each shifted word. */
vand.vx v16, v16, a6
vand.vx v20, v20, a6
vand.vx v24, v24, a6
vand.vx v28, v28, a6
/* Sum each component over the pair; the sums are not halved here. */
vadd.vv v8, v8, v12
vadd.vv v16, v16, v20
vadd.vv v24, v24, v28
/* v0 = U accumulator, v4 = V accumulator; the loaded words are dead now. */
vmul.vx v0, v8, t1
vmul.vx v4, v8, t4
vmacc.vx v0, t2, v16
vmacc.vx v4, t5, v16
vmacc.vx v0, t3, v24
vmacc.vx v4, t6, v24
vadd.vx v0, v0, a7
vadd.vx v4, v4, a7
/* Same vl, 16-bit elements: narrow both 32-bit accumulators to 16 bits. */
vsetvli zero, zero, e16, m2, ta, ma
/* Arithmetic shift right by 10 while narrowing; V lands in v2. */
vnsra.wi v0, v0, 15 - 5
vnsra.wi v2, v4, 15 - 5
/* Store t0 U samples and t0 V samples, advancing each pointer by 2 * t0. */
vse16.v v0, (a0)
sh1add a0, t0, a0
vse16.v v2, (a1)
sh1add a1, t0, a1
/* Loop until every requested sample has been produced. */
bnez a5, 2b
ret
endfunc
.endm
rgba_input rgba32, bgra32, 0

@ -47,13 +47,17 @@ av_cold void ff_sws_init_swscale_riscv(SwsContext *c)
switch (c->srcFormat) {
case AV_PIX_FMT_ABGR:
c->lumToYV12 = ff_abgr32ToY_rvv;
if (!c->chrSrcHSubSample)
if (c->chrSrcHSubSample)
c->chrToYV12 = ff_abgr32ToUV_half_rvv;
else
c->chrToYV12 = ff_abgr32ToUV_rvv;
break;
case AV_PIX_FMT_ARGB:
c->lumToYV12 = ff_argb32ToY_rvv;
if (!c->chrSrcHSubSample)
if (c->chrSrcHSubSample)
c->chrToYV12 = ff_argb32ToUV_half_rvv;
else
c->chrToYV12 = ff_argb32ToUV_rvv;
break;
@ -67,7 +71,9 @@ av_cold void ff_sws_init_swscale_riscv(SwsContext *c)
case AV_PIX_FMT_BGRA:
c->lumToYV12 = ff_bgra32ToY_rvv;
if (!c->chrSrcHSubSample)
if (c->chrSrcHSubSample)
c->chrToYV12 = ff_bgra32ToUV_half_rvv;
else
c->chrToYV12 = ff_bgra32ToUV_rvv;
break;
@ -81,7 +87,9 @@ av_cold void ff_sws_init_swscale_riscv(SwsContext *c)
case AV_PIX_FMT_RGBA:
c->lumToYV12 = ff_rgba32ToY_rvv;
if (!c->chrSrcHSubSample)
if (c->chrSrcHSubSample)
c->chrToYV12 = ff_rgba32ToUV_half_rvv;
else
c->chrToYV12 = ff_rgba32ToUV_rvv;
break;
}

Loading…
Cancel
Save