lavc/startcode: add R-V Zbb startcode_find_candidate

The main loop processes 8 bytes in 5 instructions.
For comparison, the optimal plain strnlen() requires 4 instructions per
byte (6.4x worse): LBU; ADDI; BEQZ; BNE. The current libavcodec C code
involves 5 instructions per byte (8x worse). Actual benchmarks may be
slightly less favourable due to latency from ORC.B to BNE.
release/7.1
Rémi Denis-Courmont 8 months ago
parent 8b8b555de0
commit 4ad5b9c8db
  1. 2
      libavcodec/h264dsp.c
  2. 2
      libavcodec/h264dsp.h
  3. 2
      libavcodec/riscv/Makefile
  4. 40
      libavcodec/riscv/h264dsp_init.c
  5. 83
      libavcodec/riscv/startcode_rvb.S
  6. 7
      libavcodec/riscv/vc1dsp_init.c

@ -158,6 +158,8 @@ av_cold void ff_h264dsp_init(H264DSPContext *c, const int bit_depth,
ff_h264dsp_init_arm(c, bit_depth, chroma_format_idc);
#elif ARCH_PPC
ff_h264dsp_init_ppc(c, bit_depth, chroma_format_idc);
#elif ARCH_RISCV
ff_h264dsp_init_riscv(c, bit_depth, chroma_format_idc);
#elif ARCH_X86
ff_h264dsp_init_x86(c, bit_depth, chroma_format_idc);
#elif ARCH_MIPS

@ -125,6 +125,8 @@ void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth,
const int chroma_format_idc);
void ff_h264dsp_init_ppc(H264DSPContext *c, const int bit_depth,
const int chroma_format_idc);
void ff_h264dsp_init_riscv(H264DSPContext *c, const int bit_depth,
const int chroma_format_idc);
void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
const int chroma_format_idc);
void ff_h264dsp_init_mips(H264DSPContext *c, const int bit_depth,

@ -28,6 +28,7 @@ OBJS-$(CONFIG_JPEG2000_DECODER) += riscv/jpeg2000dsp_init.o
RVV-OBJS-$(CONFIG_JPEG2000_DECODER) += riscv/jpeg2000dsp_rvv.o
OBJS-$(CONFIG_H264CHROMA) += riscv/h264_chroma_init_riscv.o
RVV-OBJS-$(CONFIG_H264CHROMA) += riscv/h264_mc_chroma.o
OBJS-$(CONFIG_H264DSP) += riscv/h264dsp_init.o
OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o
RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o
OBJS-$(CONFIG_IDCTDSP) += riscv/idctdsp_init.o
@ -51,6 +52,7 @@ OBJS-$(CONFIG_RV34DSP) += riscv/rv34dsp_init.o
RVV-OBJS-$(CONFIG_RV34DSP) += riscv/rv34dsp_rvv.o
OBJS-$(CONFIG_RV40_DECODER) += riscv/rv40dsp_init.o
RVV-OBJS-$(CONFIG_RV40_DECODER) += riscv/rv40dsp_rvv.o
RV-OBJS-$(CONFIG_STARTCODE) += riscv/startcode_rvb.o
OBJS-$(CONFIG_SVQ1_ENCODER) += riscv/svqenc_init.o
RVV-OBJS-$(CONFIG_SVQ1_ENCODER) += riscv/svqenc_rvv.o
OBJS-$(CONFIG_TAK_DECODER) += riscv/takdsp_init.o

@ -0,0 +1,40 @@
/*
* Copyright © 2024 Rémi Denis-Courmont.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavcodec/h264dsp.h"
extern int ff_startcode_find_candidate_rvb(const uint8_t *, int);
/**
 * Hook RISC-V specific routines into an H.264 DSP context.
 *
 * When the build has HAVE_RV enabled and the run-time CPU flags report
 * AV_CPU_FLAG_RVB_BASIC, the start code candidate search is replaced with
 * ff_startcode_find_candidate_rvb. Otherwise the context is left untouched.
 *
 * @param dsp               context whose function pointers may be overridden
 * @param bit_depth         not used by this function
 * @param chroma_format_idc not used by this function
 */
av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
                                   const int chroma_format_idc)
{
#if HAVE_RV
    const int cpu_flags = av_get_cpu_flags();

    if (cpu_flags & AV_CPU_FLAG_RVB_BASIC) {
        dsp->startcode_find_candidate = ff_startcode_find_candidate_rvb;
    }
#endif
}

@ -0,0 +1,83 @@
/*
* Copyright © 2024 Rémi Denis-Courmont.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "libavutil/riscv/asm.S"
// lx rd, addr: load one register-width (XLEN-bit) word from addr into rd.
// The load instruction is selected at preprocessing time from __riscv_xlen:
// lw when XLEN is 32, ld when XLEN is 64, and lq for any other value
// (presumably RV128 - TODO confirm that no other XLEN can reach this branch).
.macro lx rd, addr
#if (__riscv_xlen == 32)
lw \rd, \addr
#elif (__riscv_xlen == 64)
ld \rd, \addr
#else
lq \rd, \addr
#endif
.endm
/*
 * int ff_startcode_find_candidate_rvb(const uint8_t *, int)
 *
 * Searches a buffer for its first zero byte, testing one XLEN-bit word per
 * iteration with the Zbb ORC.B instruction.
 *
 * Inputs (assuming the standard RISC-V calling convention for the C
 * prototype above):
 *   a0 = address of the first byte of the buffer
 *   a1 = number of bytes in the buffer
 * Output:
 *   a0 = offset from the buffer start of the first zero byte detected, or
 *        the byte count if no zero byte was detected.
 *
 * Notes:
 * - Memory is always read as whole, naturally aligned words, so bytes that
 *   lie before the buffer start or after the buffer end, but inside the same
 *   aligned words, are loaded as well.
 * - Bytes before the buffer start are masked out of the head word. Bytes
 *   after the buffer end are NOT masked: with the MINU line left commented
 *   out, a zero byte located just past the end in the last word examined
 *   yields a return value larger than the byte count.
 * - The bit/byte arithmetic (low-order bits = lowest addresses) assumes a
 *   little-endian memory layout.
 */
func ff_startcode_find_candidate_rvb, zbb
// a1 = end pointer (one past the last byte)
add a1, a0, a1
// Potentially unaligned head
// t0 = start address rounded down to a word boundary
andi t0, a0, -(__riscv_xlen / 8)
// Empty buffer: label 2 returns a1 - a0, which is 0 here
beq a0, a1, 2f
// t1 = byte offset of the start address inside its aligned word
andi t1, a0, (__riscv_xlen / 8) - 1
lx t2, (t0)
// NOTE(review): t3 is set here and adjusted by the SUB below, but it is not
// read before "li t3, -1" overwrites it; the value looks unused.
li t3, __riscv_xlen
// ORC.B: every non-zero byte becomes 0xFF, every zero byte stays 0x00
orc.b t2, t2
// t1 = offset converted from bytes to bits
slli t1, t1, 3
// After inversion, 0xFF marks a zero byte and 0x00 a non-zero byte
not t2, t2
sub t3, t3, t1
// Shift right then left by t1 bits: clears the markers of the bytes that
// precede the buffer start
srl t2, t2, t1
// t0 = address of the next word
addi t0, t0, __riscv_xlen / 8
sll t2, t2, t1
// Any marker left means a zero byte at or after the buffer start
bnez t2, 4f
// Main loop (including potentially short tail)
// Nothing left to scan if the head word already reached the end pointer
bge t0, a1, 2f
// t3 = all ones, the ORC.B result of a word without any zero byte
li t3, -1
1:
lx t2, (t0)
// t0 is advanced before the test, hence the back-track at label 4
addi t0, t0, __riscv_xlen / 8
orc.b t2, t2
bne t2, t3, 3f // t2 != -1 iff (at least one) zero byte
// Continue while the next word starts before the end pointer
blt t0, a1, 1b
2: // No zero byte found
// Return end - start, i.e. the byte count
sub a0, a1, a0
ret
3: // Zero byte found in main loop
// Invert so that zero bytes are marked with 0xFF, as in the head path
not t2, t2
4: // Zero byte found in head
// t2 = bit index of the lowest marker
ctz t2, t2
addi t0, t0, -(__riscv_xlen / 8) // back-track
// Bit index -> byte index inside the word
srl t2, t2, 3
// t0 = address of the first zero byte detected
add t0, t0, t2
// Uncomment the following line for exact POSIX C strnlen() semantics.
//minu t0, t0, a1 // ignore zero byte in tail
// Return the offset of that byte from the buffer start
sub a0, t0, a0
ret
endfunc

@ -33,6 +33,7 @@ void ff_put_pixels16x16_rvi(uint8_t *dst, const uint8_t *src, ptrdiff_t line_siz
void ff_put_pixels8x8_rvi(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
void ff_avg_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
void ff_avg_pixels8x8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
int ff_startcode_find_candidate_rvb(const uint8_t *, int);
av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
{
@ -45,7 +46,9 @@ av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
dsp->put_vc1_mspel_pixels_tab[0][0] = ff_put_pixels16x16_rvi;
}
# endif
#if HAVE_RVV
if (flags & AV_CPU_FLAG_RVB_BASIC)
dsp->startcode_find_candidate = ff_startcode_find_candidate_rvb;
# if HAVE_RVV
if (flags & AV_CPU_FLAG_RVV_I32 && ff_rv_vlen_least(128)) {
dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv;
dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv;
@ -56,6 +59,6 @@ av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
dsp->avg_vc1_mspel_pixels_tab[1][0] = ff_avg_pixels8x8_rvv;
}
}
#endif
# endif
#endif
}

Loading…
Cancel
Save