FFmpeg/libavcodec/riscv/vp7dsp_rvv.S

/*
 * Copyright (c) 2024 Rémi Denis-Courmont.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/riscv/asm.S"

#if __riscv_xlen >= 64
/*
 * ff_vp7_luma_dc_wht_rvv
 *
 * Runs the shared two-pass 4x4 transform (local label 1 below) on the 16
 * 16-bit values at a1 and scatters the 16 results as 16-bit values starting
 * at a0: element j of result row i is written to a0 + i * 128 + j * 32
 * (byte offsets).
 *
 * In:    a0 = output base address
 *        a1 = input, 16 contiguous 16-bit values; zeroed on return
 *        (presumably the block[4][4][16] and dc[16] arguments of the VP7
 *        DSP prototype -- TODO confirm against the init code)
 * Temps: a2, a7, t0-t6, v0-v11, vl/vtype, vxrm; 32 bytes of stack scratch
 *        are allocated and released inside the shared routine.
 *
 * NOTE(review): sh1add is a Zba instruction but only zve32x is requested on
 * the func line; confirm Zba is enabled some other way for this file.
 */
func ff_vp7_luma_dc_wht_rvv, zve32x
        li          a2, 4 * 16 * 2      # byte distance between output rows
        li          a7, 16 * 2          # byte distance between elements of a row
        jal         t0, 1f              # v4-v7 = result rows, t1-t3 = a0 + 1, 2, 3 times a2
        vsse16.v    v4, (a0), a7        # strided stores: one 16-bit value every 32 bytes
        vsse16.v    v5, (t1), a7
        vsse16.v    v6, (t2), a7
        vsse16.v    v7, (t3), a7
        ret
/*
 * Shared transform core, entered with "jal t0, <label 1>" and left with
 * "jr t0" (t0 is the link register; ra is not written). Also used by
 * ff_vp7_idct_add_rvv.
 *
 * In:   a0 = output base, a1 = 16 input coefficients (16-bit),
 *       a2 = byte distance between output rows
 * Out:  v4-v7 = the four 16-bit result rows (vl = 4, e16/mf2 still set)
 *       t1 = a0 + a2, t2 = a0 + 2 * a2, t3 = a0 + 3 * a2
 *       a2 = 3 * (original a2); the 32 bytes at a1 are cleared
 *
 * The multipliers 12540, 30274 and 23170 numerically equal
 * 2^15 * sin(pi/8), 2^15 * cos(pi/8) and 2^15 * cos(pi/4), rounded.
 * Scalar address arithmetic is interleaved with the vector instructions.
 */
1:
        csrwi       vxrm, 0             # rounding mode 0 (round-to-nearest-up), used by vnclip below
        li          t4, 12540
        vsetivli    zero, 4, e16, mf2, ta, ma
        vlseg4e16.v v0, (a1)            # de-interleave: v0..v3 lane i = in[4 * i + 0..3]
        li          t6, 30274
        vwmul.vx    v8, v1, t4          # v8  = v1 * 12540 (32-bit products from here on)
        li          t5, 23170
        vwmul.vx    v9, v3, t6          # v9  = v3 * 30274
        addi        t1, sp, -12 * 2     # t1..t3 = scratch + 8, + 16, + 24 once sp is lowered
        vwmul.vx    v10, v1, t6         # v10 = v1 * 30274
        addi        t2, sp, -8 * 2
        vwmul.vx    v11, v3, t4         # v11 = v3 * 12540
        addi        t3, sp, -4 * 2
        vwadd.vv    v4, v0, v2          # v4 = v0 + v2, widened to 32 bits
        addi        sp, sp, -16 * 2     # allocate 32 bytes of scratch for the transpose
        vwsub.vv    v5, v0, v2          # v5 = v0 - v2, widened to 32 bits
        vsetvli     zero, zero, e32, m1, ta, ma
        vadd.vv     v7, v10, v11        # d = v1 * 30274 + v3 * 12540
        vmul.vx     v4, v4, t5          # a = (v0 + v2) * 23170
        vsub.vv     v6, v8, v9          # c = v1 * 12540 - v3 * 30274
        vmul.vx     v5, v5, t5          # b = (v0 - v2) * 23170
        vadd.vv     v0, v4, v7          # a + d
        vsub.vv     v3, v4, v7          # a - d
        vadd.vv     v1, v5, v6          # b + c
        vsub.vv     v2, v5, v6          # b - c
        vsetvli     zero, zero, e16, mf2, ta, ma
        vnsra.wi    v4, v0, 14          # first pass narrows with a plain arithmetic shift
        vnsra.wi    v7, v3, 14
        vnsra.wi    v5, v1, 14
        vnsra.wi    v6, v2, 14
        vsseg4e16.v v4, (sp)            # interleaved store of v4..v7: scratch[4 * i + k] = v(4+k)[i]
        vle16.v     v0, (sp)            # contiguous reloads give the transposed rows
        vle16.v     v1, (t1)
        vle16.v     v2, (t2)
        vle16.v     v3, (t3)
        /* Second pass: same butterfly on the transposed data. */
        vwmul.vx    v8, v1, t4
        vwmul.vx    v9, v3, t6
        add         t1, a2, a0          # t1 = a0 + a2 (output row 1)
        vwmul.vx    v10, v1, t6
        sh1add      t2, a2, a0          # t2 = a0 + 2 * a2 (output row 2)
        vwmul.vx    v11, v3, t4
        sh1add      a2, a2, a2 # a2 *= 3
        vwadd.vv    v4, v0, v2
        add         t3, a2, a0          # t3 = a0 + 3 * original a2 (output row 3)
        vwsub.vv    v5, v0, v2
        vsetvli     zero, zero, e32, m1, ta, ma
        vmul.vx     v4, v4, t5
        sd          zero,   (a1)        # clear the 32 input bytes, 8 at a time
        vadd.vv     v7, v10, v11
        sd          zero,  8(a1)
        vmul.vx     v5, v5, t5
        sd          zero, 16(a1)
        vsub.vv     v6, v8, v9
        sd          zero, 24(a1)
        vadd.vv     v0, v4, v7
        addi        sp, sp, 16 * 2      # release the scratch area
        vsub.vv     v3, v4, v7
        vadd.vv     v1, v5, v6
        vsub.vv     v2, v5, v6
        vsetvli     zero, zero, e16, mf2, ta, ma
        vnclip.wi   v4, v0, 18          # narrowing shift by 18, rounded per vxrm, signed-saturated
        vnclip.wi   v5, v1, 18
        vnclip.wi   v6, v2, 18
        vnclip.wi   v7, v3, 18
        jr          t0                  # back to whichever function did the jal
endfunc

/*
 * ff_vp7_idct_add_rvv
 *
 * Runs the shared transform core (numeric label 1 in the preceding
 * function) and adds the four result rows to four rows of 4 unsigned bytes
 * at a0, a0 + a2, a0 + 2 * a2 and a0 + 3 * a2, clamping each sum to
 * [0, 255] before storing it back.
 *
 * In:    a0 = 8-bit rows, read and written in place
 *        a1 = 16 coefficients (16-bit), zeroed by the shared core
 *        a2 = byte stride between rows (consumed by the shared core)
 *        (presumably dst, block and stride of the VP7 DSP prototype --
 *        TODO confirm against the init code)
 * Temps: a2, t0-t6, v0-v15, vl/vtype, vxrm, 32 bytes of stack scratch
 *
 * NOTE(review): "1b" binds to the nearest preceding label 1, which lives in
 * ff_vp7_luma_dc_wht_rvv; adding another label 1 in between would silently
 * retarget this call.
 */
func ff_vp7_idct_add_rvv, zve32x
        jal         t0, 1b              # v4-v7 = 16-bit result rows, t1-t3 = addresses of rows 1-3
        csrwi       vxrm, 2             # rounding mode 2 (round-down); NOTE(review): the vnclipu below shifts by 0, so rounding should not matter -- confirm why it is set
        vsetvli     zero, zero, e8, mf4, ta, ma
        vle8.v      v12, (a0)           # row 0 bytes
        vle8.v      v13, (t1)           # row 1 bytes
        vwaddu.wv   v4, v4, v12         # 16-bit row += zero-extended bytes
        vle8.v      v14, (t2)           # row 2 bytes
        vwaddu.wv   v5, v5, v13
        vle8.v      v15, (t3)           # row 3 bytes
        vwaddu.wv   v6, v6, v14
        vwaddu.wv   v7, v7, v15
        vsetvli     zero, zero, e16, mf2, ta, ma
        vmax.vx     v4, v4, zero        # clamp negative sums to 0
        vmax.vx     v5, v5, zero
        vmax.vx     v6, v6, zero
        vmax.vx     v7, v7, zero
        vsetvli     zero, zero, e8, mf4, ta, ma
        vnclipu.wi  v0, v4, 0           # unsigned saturating narrow to 8 bits (caps at 255)
        vnclipu.wi  v1, v5, 0
        vse8.v      v0, (a0)            # write each row back in place
        vnclipu.wi  v2, v6, 0
        vse8.v      v1, (t1)
        vnclipu.wi  v3, v7, 0
        vse8.v      v2, (t2)
        vse8.v      v3, (t3)
        ret
endfunc
#endif

/*
 * ff_vp7_idct_dc_add4y_rvv and ff_vp7_idct_dc_add4uv_rvv (one per .irp pass)
 *
 * Computes four DC terms at once from the 16-bit values x found at a1,
 * a1 + 32, a1 + 64 and a1 + 96:
 *     dc = (((x * 23170) >> 14) * 23170 + 0x20000 - (128 << 18)) >> 18
 * and leaves them as 16-bit elements in v8 before tail-calling the matching
 * ff_vp78_idct_dc_add4 routine, which is not defined in this file.
 * The - (128 << 18) term biases each result by -128; presumably the callee
 * expects that bias and performs the actual add to the pixels -- TODO
 * confirm against the callee.
 *
 * In:    a1 = coefficient base; no other argument register is written here
 *        before the tail call
 * Temps: t0-t2, v0, v8, vl/vtype (the tail pseudo-instruction may also use
 *        t1 for the jump address)
 */
.irp type, y, uv
func ff_vp7_idct_dc_add4\type\()_rvv, zve32x
        li       t0, 32                 # byte stride between the four inputs
        vsetivli zero, 4, e16, mf2, ta, ma
        li       t1, 23170
        vlse16.v v8, (a1), t0 # block[0..3][0]
        vwmul.vx v0, v8, t1             # 32-bit products x * 23170
        li       t2, 0x20000 - (128 << 18) # half of 1 << 18 for rounding, minus the -128 bias
        vsetvli  zero, zero, e32, m1, ta, ma
        vsra.vi  v0, v0, 14             # arithmetic shift, no rounding
        vmul.vx  v0, v0, t1             # second multiply by 23170
        vadd.vx  v0, v0, t2             # add rounding term and bias before the final shift
        vsetvli  zero, zero, e16, mf2, ta, ma
        vnsra.wi v8, v0, 18   # 4x DC
        tail     ff_vp78_idct_dc_add4\type\()_rvv
endfunc
.endr
lavc/vp7dsp: add R-V V vp7_luma_dc_wht This works out a bit more favourably than VP8's due to: - additional multiplications that can be vectored, - hardware-supported fixed-point rounding mode. vp7_luma_dc_wht_c: 3.2 vp7_luma_dc_wht_rvv_i64: 2.0 7 months ago			`/*`
			`* Copyright (c) 2024 Rémi Denis-Courmont.`
			`*`
			`* This file is part of FFmpeg.`
			`*`
			`* FFmpeg is free software; you can redistribute it and/or`
			`* modify it under the terms of the GNU Lesser General Public`
			`* License as published by the Free Software Foundation; either`
			`* version 2.1 of the License, or (at your option) any later version.`
			`*`
			`* FFmpeg is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`* Lesser General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU Lesser General Public`
			`* License along with FFmpeg; if not, write to the Free Software`
			`* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
			`*/`

			`#include "libavutil/riscv/asm.S"`

			`#if __riscv_xlen >= 64`
			`func ff_vp7_luma_dc_wht_rvv, zve32x`
lavc/vp7dsp: revector ff_vp7_dc_wht_rvv This prepares for some code reuse. 7 months ago			`li a2, 4 * 16 * 2`
			`li a7, 16 * 2`
			`jal t0, 1f`
			`vsse16.v v4, (a0), a7`
			`vsse16.v v5, (t1), a7`
			`vsse16.v v6, (t2), a7`
			`vsse16.v v7, (t3), a7`
			`ret`
			`1:`
lavc/vp7dsp: add R-V V vp7_luma_dc_wht This works out a bit more favourably than VP8's due to: - additional multiplications that can be vectored, - hardware-supported fixed-point rounding mode. vp7_luma_dc_wht_c: 3.2 vp7_luma_dc_wht_rvv_i64: 2.0 7 months ago			`csrwi vxrm, 0`
			`li t4, 12540`
			`vsetivli zero, 4, e16, mf2, ta, ma`
			`vlseg4e16.v v0, (a1)`
			`li t6, 30274`
			`vwmul.vx v8, v1, t4`
			`li t5, 23170`
			`vwmul.vx v9, v3, t6`
			`addi t1, sp, -12 * 2`
			`vwmul.vx v10, v1, t6`
			`addi t2, sp, -8 * 2`
			`vwmul.vx v11, v3, t4`
			`addi t3, sp, -4 * 2`
			`vwadd.vv v4, v0, v2`
			`addi sp, sp, -16 * 2`
			`vwsub.vv v5, v0, v2`
			`vsetvli zero, zero, e32, m1, ta, ma`
			`vadd.vv v7, v10, v11`
			`vmul.vx v4, v4, t5`
			`vsub.vv v6, v8, v9`
			`vmul.vx v5, v5, t5`
			`vadd.vv v0, v4, v7`
			`vsub.vv v3, v4, v7`
			`vadd.vv v1, v5, v6`
			`vsub.vv v2, v5, v6`
			`vsetvli zero, zero, e16, mf2, ta, ma`
			`vnsra.wi v4, v0, 14`
			`vnsra.wi v7, v3, 14`
			`vnsra.wi v5, v1, 14`
			`vnsra.wi v6, v2, 14`
			`vsseg4e16.v v4, (sp)`
			`vle16.v v0, (sp)`
			`vle16.v v1, (t1)`
			`vle16.v v2, (t2)`
			`vle16.v v3, (t3)`
			`vwmul.vx v8, v1, t4`
			`vwmul.vx v9, v3, t6`
lavc/vp7dsp: revector ff_vp7_dc_wht_rvv This prepares for some code reuse. 7 months ago			`add t1, a2, a0`
lavc/vp7dsp: add R-V V vp7_luma_dc_wht This works out a bit more favourably than VP8's due to: - additional multiplications that can be vectored, - hardware-supported fixed-point rounding mode. vp7_luma_dc_wht_c: 3.2 vp7_luma_dc_wht_rvv_i64: 2.0 7 months ago			`vwmul.vx v10, v1, t6`
lavc/vp7dsp: revector ff_vp7_dc_wht_rvv This prepares for some code reuse. 7 months ago			`sh1add t2, a2, a0`
lavc/vp7dsp: add R-V V vp7_luma_dc_wht This works out a bit more favourably than VP8's due to: - additional multiplications that can be vectored, - hardware-supported fixed-point rounding mode. vp7_luma_dc_wht_c: 3.2 vp7_luma_dc_wht_rvv_i64: 2.0 7 months ago			`vwmul.vx v11, v3, t4`
lavc/vp7dsp: revector ff_vp7_dc_wht_rvv This prepares for some code reuse. 7 months ago			`sh1add a2, a2, a2 # a2 *= 3`
lavc/vp7dsp: add R-V V vp7_luma_dc_wht This works out a bit more favourably than VP8's due to: - additional multiplications that can be vectored, - hardware-supported fixed-point rounding mode. vp7_luma_dc_wht_c: 3.2 vp7_luma_dc_wht_rvv_i64: 2.0 7 months ago			`vwadd.vv v4, v0, v2`
lavc/vp7dsp: revector ff_vp7_dc_wht_rvv This prepares for some code reuse. 7 months ago			`add t3, a2, a0`
lavc/vp7dsp: add R-V V vp7_luma_dc_wht This works out a bit more favourably than VP8's due to: - additional multiplications that can be vectored, - hardware-supported fixed-point rounding mode. vp7_luma_dc_wht_c: 3.2 vp7_luma_dc_wht_rvv_i64: 2.0 7 months ago			`vwsub.vv v5, v0, v2`
			`vsetvli zero, zero, e32, m1, ta, ma`
			`vmul.vx v4, v4, t5`
			`sd zero, (a1)`
			`vadd.vv v7, v10, v11`
			`sd zero, 8(a1)`
			`vmul.vx v5, v5, t5`
			`sd zero, 16(a1)`
			`vsub.vv v6, v8, v9`
			`sd zero, 24(a1)`
			`vadd.vv v0, v4, v7`
			`addi sp, sp, 16 * 2`
			`vsub.vv v3, v4, v7`
			`vadd.vv v1, v5, v6`
			`vsub.vv v2, v5, v6`
			`vsetvli zero, zero, e16, mf2, ta, ma`
			`vnclip.wi v4, v0, 18`
			`vnclip.wi v5, v1, 18`
			`vnclip.wi v6, v2, 18`
			`vnclip.wi v7, v3, 18`
lavc/vp7dsp: revector ff_vp7_dc_wht_rvv This prepares for some code reuse. 7 months ago			`jr t0`
lavc/vp7dsp: add R-V V vp7_luma_dc_wht This works out a bit more favourably than VP8's due to: - additional multiplications that can be vectored, - hardware-supported fixed-point rounding mode. vp7_luma_dc_wht_c: 3.2 vp7_luma_dc_wht_rvv_i64: 2.0 7 months ago			`endfunc`
lavc/vp7dsp: R-V V vp7_idct_add Most of the code is shared with DC, thanks to minor earlier changes. vp7_idct_add_c: 5.2 vp7_idct_add_rvv_i32: 2.5 7 months ago
			`func ff_vp7_idct_add_rvv, zve32x`
			`jal t0, 1b`
			`csrwi vxrm, 2`
			`vsetvli zero, zero, e8, mf4, ta, ma`
			`vle8.v v12, (a0)`
			`vle8.v v13, (t1)`
			`vwaddu.wv v4, v4, v12`
			`vle8.v v14, (t2)`
			`vwaddu.wv v5, v5, v13`
			`vle8.v v15, (t3)`
			`vwaddu.wv v6, v6, v14`
			`vwaddu.wv v7, v7, v15`
			`vsetvli zero, zero, e16, mf2, ta, ma`
			`vmax.vx v4, v4, zero`
			`vmax.vx v5, v5, zero`
			`vmax.vx v6, v6, zero`
			`vmax.vx v7, v7, zero`
			`vsetvli zero, zero, e8, mf4, ta, ma`
			`vnclipu.wi v0, v4, 0`
			`vnclipu.wi v1, v5, 0`
			`vse8.v v0, (a0)`
			`vnclipu.wi v2, v6, 0`
			`vse8.v v1, (t1)`
			`vnclipu.wi v3, v7, 0`
			`vse8.v v2, (t2)`
			`vse8.v v3, (t3)`
			`ret`
			`endfunc`
lavc/vp7dsp: add R-V V vp7_luma_dc_wht This works out a bit more favourably than VP8's due to: - additional multiplications that can be vectored, - hardware-supported fixed-point rounding mode. vp7_luma_dc_wht_c: 3.2 vp7_luma_dc_wht_rvv_i64: 2.0 7 months ago			`#endif`
lavc/vp8dsp: add R-V V vp7_idct_dc_add4y As with idct_dc_add, most of the code is shared with, and replaces, the previous VP8 function. To improve performance, we break down the 16x4 matrix into 4 rows, rather than 4 squares. Thus strided loads and stores are avoided, and the 4 DC calculations are vectored. Unfortunately this requires a vector gather to splat the DC values, but overall this is still a win for performance: T-Head C908: vp7_idct_dc_add4y_c: 7.2 vp7_idct_dc_add4y_rvv_i32: 2.2 vp8_idct_dc_add4y_c: 6.2 vp8_idct_dc_add4y_rvv_i32: 2.2 (before) vp8_idct_dc_add4y_rvv_i32: 1.7 SpacemiT X60: vp7_idct_dc_add4y_c: 6.2 vp7_idct_dc_add4y_rvv_i32: 2.0 vp8_idct_dc_add4y_c: 5.5 vp8_idct_dc_add4y_rvv_i32: 2.5 (before) vp8_idct_dc_add4y_rvv_i32: 1.7 I also tried to provision the DC values using indexed loads. It ends up slower overall, especially for VP7, as we then have to compute 16 DC's instead of just 4. 7 months ago
lavc/vp7dsp: add R-V V vp7_idct_dc_add4uv This is almost the same story as vp7_idct_add4y. We just have to use strided loads of 2 64-bit elements to account for the different data layout in memory. T-Head C908: vp7_idct_dc_add4uv_c: 7.5 vp7_idct_dc_add4uv_rvv_i64: 2.0 vp8_idct_dc_add4uv_c: 6.2 vp8_idct_dc_add4uv_rvv_i32: 2.2 (before) vp8_idct_dc_add4uv_rvv_i64: 2.0 SpacemiT X60: vp7_idct_dc_add4uv_c: 6.7 vp7_idct_dc_add4uv_rvv_i64: 2.2 vp8_idct_dc_add4uv_c: 5.7 vp8_idct_dc_add4uv_rvv_i32: 2.5 (before) vp8_idct_dc_add4uv_rvv_i64: 2.0 7 months ago			`.irp type, y, uv`
			`func ff_vp7_idct_dc_add4\type\()_rvv, zve32x`
lavc/vp8dsp: add R-V V vp7_idct_dc_add4y As with idct_dc_add, most of the code is shared with, and replaces, the previous VP8 function. To improve performance, we break down the 16x4 matrix into 4 rows, rather than 4 squares. Thus strided loads and stores are avoided, and the 4 DC calculations are vectored. Unfortunately this requires a vector gather to splat the DC values, but overall this is still a win for performance: T-Head C908: vp7_idct_dc_add4y_c: 7.2 vp7_idct_dc_add4y_rvv_i32: 2.2 vp8_idct_dc_add4y_c: 6.2 vp8_idct_dc_add4y_rvv_i32: 2.2 (before) vp8_idct_dc_add4y_rvv_i32: 1.7 SpacemiT X60: vp7_idct_dc_add4y_c: 6.2 vp7_idct_dc_add4y_rvv_i32: 2.0 vp8_idct_dc_add4y_c: 5.5 vp8_idct_dc_add4y_rvv_i32: 2.5 (before) vp8_idct_dc_add4y_rvv_i32: 1.7 I also tried to provision the DC values using indexed loads. It ends up slower overall, especially for VP7, as we then have to compute 16 DC's instead of just 4. 7 months ago			`li t0, 32`
			`vsetivli zero, 4, e16, mf2, ta, ma`
			`li t1, 23170`
			`vlse16.v v8, (a1), t0 # block[0..3][0]`
			`vwmul.vx v0, v8, t1`
lavc/vp8dsp: rework R-V V idct_dc_add4y DCT-related FFmpeg functions often add an unsigned 8-bit sample to a signed 16-bit coefficient, then clip the result back to an unsigned 8-bit value. RISC-V has no signed 16-bit to unsigned 8-bit clip, so instead our most common sequence is: VWADDU.WV set SEW to 16 bits VMAX.VV zero # clip negative values to 0 set SEW to 8 bits VNCLIPU.WI # clip values over 255 to 255 and narrow Here we use a different sequence which does not require toggling the vector type. This assumes that the wide addend vector is biased by -128: VWADDU.WV VNCLIP.WI # clip values to signed 8-bit and narrow VXOR.VX 0x80 # flip sign bit (convert signed to unsigned) Also the VMAX is effectively replaced by a VXOR of half-width. In this function, this comes for free as we anyway add a constant to the wide vector in the prologue. On C908, this has no observable effects. On X60, this improves microbenchmarks by about 20%. 7 months ago			`li t2, 0x20000 - (128 << 18)`
lavc/vp8dsp: add R-V V vp7_idct_dc_add4y As with idct_dc_add, most of the code is shared with, and replaces, the previous VP8 function. To improve performance, we break down the 16x4 matrix into 4 rows, rather than 4 squares. Thus strided loads and stores are avoided, and the 4 DC calculations are vectored. Unfortunately this requires a vector gather to splat the DC values, but overall this is still a win for performance: T-Head C908: vp7_idct_dc_add4y_c: 7.2 vp7_idct_dc_add4y_rvv_i32: 2.2 vp8_idct_dc_add4y_c: 6.2 vp8_idct_dc_add4y_rvv_i32: 2.2 (before) vp8_idct_dc_add4y_rvv_i32: 1.7 SpacemiT X60: vp7_idct_dc_add4y_c: 6.2 vp7_idct_dc_add4y_rvv_i32: 2.0 vp8_idct_dc_add4y_c: 5.5 vp8_idct_dc_add4y_rvv_i32: 2.5 (before) vp8_idct_dc_add4y_rvv_i32: 1.7 I also tried to provision the DC values using indexed loads. It ends up slower overall, especially for VP7, as we then have to compute 16 DC's instead of just 4. 7 months ago			`vsetvli zero, zero, e32, m1, ta, ma`
			`vsra.vi v0, v0, 14`
			`vmul.vx v0, v0, t1`
			`vadd.vx v0, v0, t2`
			`vsetvli zero, zero, e16, mf2, ta, ma`
			`vnsra.wi v8, v0, 18 # 4x DC`
lavc/vp7dsp: add R-V V vp7_idct_dc_add4uv This is almost the same story as vp7_idct_add4y. We just have to use strided loads of 2 64-bit elements to account for the different data layout in memory. T-Head C908: vp7_idct_dc_add4uv_c: 7.5 vp7_idct_dc_add4uv_rvv_i64: 2.0 vp8_idct_dc_add4uv_c: 6.2 vp8_idct_dc_add4uv_rvv_i32: 2.2 (before) vp8_idct_dc_add4uv_rvv_i64: 2.0 SpacemiT X60: vp7_idct_dc_add4uv_c: 6.7 vp7_idct_dc_add4uv_rvv_i64: 2.2 vp8_idct_dc_add4uv_c: 5.7 vp8_idct_dc_add4uv_rvv_i32: 2.5 (before) vp8_idct_dc_add4uv_rvv_i64: 2.0 7 months ago			`tail ff_vp78_idct_dc_add4\type\()_rvv`
lavc/vp8dsp: add R-V V vp7_idct_dc_add4y As with idct_dc_add, most of the code is shared with, and replaces, the previous VP8 function. To improve performance, we break down the 16x4 matrix into 4 rows, rather than 4 squares. Thus strided loads and stores are avoided, and the 4 DC calculations are vectored. Unfortunately this requires a vector gather to splat the DC values, but overall this is still a win for performance: T-Head C908: vp7_idct_dc_add4y_c: 7.2 vp7_idct_dc_add4y_rvv_i32: 2.2 vp8_idct_dc_add4y_c: 6.2 vp8_idct_dc_add4y_rvv_i32: 2.2 (before) vp8_idct_dc_add4y_rvv_i32: 1.7 SpacemiT X60: vp7_idct_dc_add4y_c: 6.2 vp7_idct_dc_add4y_rvv_i32: 2.0 vp8_idct_dc_add4y_c: 5.5 vp8_idct_dc_add4y_rvv_i32: 2.5 (before) vp8_idct_dc_add4y_rvv_i32: 1.7 I also tried to provision the DC values using indexed loads. It ends up slower overall, especially for VP7, as we then have to compute 16 DC's instead of just 4. 7 months ago			`endfunc`
lavc/vp7dsp: add R-V V vp7_idct_dc_add4uv This is almost the same story as vp7_idct_add4y. We just have to use strided loads of 2 64-bit elements to account for the different data layout in memory. T-Head C908: vp7_idct_dc_add4uv_c: 7.5 vp7_idct_dc_add4uv_rvv_i64: 2.0 vp8_idct_dc_add4uv_c: 6.2 vp8_idct_dc_add4uv_rvv_i32: 2.2 (before) vp8_idct_dc_add4uv_rvv_i64: 2.0 SpacemiT X60: vp7_idct_dc_add4uv_c: 6.7 vp7_idct_dc_add4uv_rvv_i64: 2.2 vp8_idct_dc_add4uv_c: 5.7 vp8_idct_dc_add4uv_rvv_i32: 2.5 (before) vp8_idct_dc_add4uv_rvv_i64: 2.0 7 months ago			`.endr`