mirror of https://github.com/FFmpeg/FFmpeg.git
Since the horizontal and vertical filters are identical except for a transposition, this uses a common subprocedure with an ad-hoc ABI. To preserve return-address stack prediction, a link register has to be used (c.f. the "Control Transfer Instructions" from the RISC-V ISA Manual). The alternate/temporary link register T0 is used here, so that the normal RA is preserved (something Arm cannot do!). To load the strength value based on `qscale`, the shortest possible and PIC-compatible sequence is used: AUIPC; ADD; LBU. The classic LLA; ADD; LBU sequence would add one more instruction since LLA is a convenience alias for AUIPC; ADDI. To ensure that this trick works, relocation relaxation is disabled. To implement the two signed divisions by a power of two toward zero: (x / (1 << SHIFT)) the code relies on the small range of integers involved, computing: (x + (x >> (16 - SHIFT))) >> SHIFT rather than the more general: (x + ((x >> (16 - 1)) & ((1 << SHIFT) - 1))) >> SHIFT Thus one ANDI instruction is avoided. T-Head C908: h263dsp.h_loop_filter_c: 228.2 h263dsp.h_loop_filter_rvv_i32: 144.0 h263dsp.v_loop_filter_c: 242.7 h263dsp.v_loop_filter_rvv_i32: 114.0 (C is probably worse in real use due to less predictible branches.)release/7.1
parent
3d1597d3e2
commit
910d281b21
5 changed files with 147 additions and 1 deletions
@ -0,0 +1,41 @@ |
||||
/*
|
||||
* Copyright © 2022 Rémi Denis-Courmont. |
||||
* |
||||
* This file is part of FFmpeg. |
||||
* |
||||
* FFmpeg is free software; you can redistribute it and/or |
||||
* modify it under the terms of the GNU Lesser General Public |
||||
* License as published by the Free Software Foundation; either |
||||
* version 2.1 of the License, or (at your option) any later version. |
||||
* |
||||
* FFmpeg is distributed in the hope that it will be useful, |
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
* Lesser General Public License for more details. |
||||
* |
||||
* You should have received a copy of the GNU Lesser General Public |
||||
* License along with FFmpeg; if not, write to the Free Software |
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
*/ |
||||
|
||||
#include "config.h" |
||||
|
||||
#include "libavutil/attributes.h" |
||||
#include "libavutil/cpu.h" |
||||
#include "libavutil/riscv/cpu.h" |
||||
#include "libavcodec/h263dsp.h" |
||||
|
||||
void ff_h263_h_loop_filter_rvv(uint8_t *src, int stride, int q); |
||||
void ff_h263_v_loop_filter_rvv(uint8_t *src, int stride, int q); |
||||
|
||||
av_cold void ff_h263dsp_init_riscv(H263DSPContext *c) |
||||
{ |
||||
#if HAVE_RVV |
||||
int flags = av_get_cpu_flags(); |
||||
|
||||
if ((flags & AV_CPU_FLAG_RVV_I32) && ff_rv_vlen_least(128)) { |
||||
c->h263_h_loop_filter = ff_h263_h_loop_filter_rvv; |
||||
c->h263_v_loop_filter = ff_h263_v_loop_filter_rvv; |
||||
} |
||||
#endif |
||||
} |
@ -0,0 +1,100 @@ |
||||
/* |
||||
* Copyright © 2024 Rémi Denis-Courmont. |
||||
* |
||||
* This file is part of FFmpeg. |
||||
* |
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public |
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version. |
||||
* |
||||
* FFmpeg is distributed in the hope that it will be useful, |
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
* Lesser General Public License for more details. |
||||
* |
||||
* You should have received a copy of the GNU Lesser General Public |
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
*/ |
||||
|
||||
#include "libavutil/riscv/asm.S" |
||||
|
||||
.option push
|
||||
.option norelax
|
||||
func ff_h263_h_loop_filter_rvv, zve32x |
||||
addi a0, a0, -2 |
||||
vsetivli zero, 8, e8, mf2, ta, ma |
||||
vlsseg4e8.v v8, (a0), a1 |
||||
jal t0, 1f |
||||
vssseg4e8.v v8, (a0), a1 |
||||
ret |
||||
1: |
||||
csrwi vxrm, 0 |
||||
2: auipc t1, %pcrel_hi(ff_h263_loop_filter_strength) |
||||
vwsubu.vv v14, v10, v9 # p2 - p1 |
||||
add t1, t1, a2 |
||||
vwsubu.vv v12, v8, v11 # p0 - p3 |
||||
vsetvli zero, zero, e16, m1, ta, mu |
||||
vsll.vi v14, v14, 2 |
||||
lbu t1, %pcrel_lo(2b)(t1) # strength |
||||
vadd.vv v16, v12, v14 |
||||
# Divide by 8 toward 0. v16 is a signed 10-bit value at this point. |
||||
vsrl.vi v18, v16, 16 - 3 # v18 = (v16 < 0) ? 7 : 0 |
||||
slli t2, t1, 1 # 2 * strength |
||||
vadd.vv v16, v16, v18 |
||||
# v16 (d) is signed 7-bit, but later arithmetics require 9 bits. |
||||
vsra.vi v16, v16, 3 # d |
||||
vmv.v.x v20, t2 |
||||
vmslt.vi v0, v16, 0 |
||||
vneg.v v18, v16 |
||||
vneg.v v20, v20, v0.t # sign(d) * 2 * strength |
||||
vmax.vv v18, v16, v18 # |d| |
||||
vsub.vv v20, v20, v16 # d1 if strength <= |d| <= 2 * strength |
||||
vmsge.vx v0, v18, t2 |
||||
vsrl.vi v14, v12, 16 - 2 # v14 = (v12 < 0) ? 3 : 0 |
||||
vmerge.vxm v20, v20, zero, v0 # d1 if strength <= |d| |
||||
vadd.vv v12, v12, v14 |
||||
vmsge.vx v0, v18, t1 |
||||
vsra.vi v12, v12, 2 # (p0 - p3) / 4 |
||||
vmerge.vvm v16, v16, v20, v0 # d1 |
||||
vzext.vf2 v24, v8 # p0 as u16 (because vwrsubu.wv does not exist) |
||||
vneg.v v14, v16 |
||||
vzext.vf2 v26, v9 # p1 as u16 |
||||
vmax.vv v14, v16, v14 # |d1| |
||||
vzext.vf2 v28, v10 # p2 as u16 |
||||
vsra.vi v14, v14, 1 # ad1 |
||||
vadd.vv v26, v26, v16 # p1 + d1 |
||||
vneg.v v18, v14 # -ad1 |
||||
vmin.vv v12, v12, v14 |
||||
vsub.vv v28, v28, v16 # p2 - d1 |
||||
vmax.vv v12, v12, v18 # d2 |
||||
vmax.vx v26, v26, zero |
||||
vsub.vv v24, v24, v12 # p0 - d2 |
||||
vmax.vx v28, v28, zero |
||||
vsetvli zero, zero, e8, mf2, ta, ma |
||||
vwaddu.wv v30, v12, v11 # p3 + d2 |
||||
vncvt.x.x.w v8, v24 |
||||
vnclipu.wi v9, v26, 0 |
||||
vnclipu.wi v10, v28, 0 |
||||
vncvt.x.x.w v11, v30 |
||||
jr t0 |
||||
endfunc |
||||
.option pop
|
||||
|
||||
func ff_h263_v_loop_filter_rvv, zve32x |
||||
sub a4, a0, a1 |
||||
vsetivli zero, 8, e8, mf2, ta, ma |
||||
vle8.v v10, (a0) |
||||
sub a3, a4, a1 |
||||
vle8.v v9, (a4) |
||||
add a5, a0, a1 |
||||
vle8.v v8, (a3) |
||||
vle8.v v11, (a5) |
||||
jal t0, 1b |
||||
vse8.v v8, (a3) |
||||
vse8.v v9, (a4) |
||||
vse8.v v10, (a0) |
||||
vse8.v v11, (a5) |
||||
ret |
||||
endfunc |
Loading…
Reference in new issue