mirror of https://github.com/FFmpeg/FFmpeg.git
Simply taking the Zbb REV8 instruction into use in a simple loop gives some significant savings:
    bswap_buf_c:     1081.0
    bswap_buf_rvb_b:  771.0
But we can also use the 64-bit REV8 as a pseudo-SIMD instruction with just one additional shift, and one fewer load, effectively doubling the bandwidth. Consequently, this patch is useful even if the compile-time target has Zbb enabled for C code:
    bswap_buf_c:     1081.0
    bswap_buf_rvb_b:  341.0 (this patch)
On the other hand, this approach fails miserably for bswap16_buf as the ratio of shifts and stores becomes unfavorable compared to naïve C:
    bswap16_buf_c:     1542.0
    bswap16_buf_rvb_b: 1803.7
Unrolling to process 128 bits (4 samples) at a time actually worsens performance ever so slightly:
    bswap_buf_c:     1081.0
    bswap_buf_rvb_b:  408.5
pull/388/head
parent
37d5ddc317
commit
f0ef11ea83
5 changed files with 112 additions and 1 deletions
@ -0,0 +1,38 @@ |
||||
/*
|
||||
* Copyright © 2022 Rémi Denis-Courmont. |
||||
* |
||||
* This file is part of FFmpeg. |
||||
* |
||||
* FFmpeg is free software; you can redistribute it and/or |
||||
* modify it under the terms of the GNU Lesser General Public |
||||
* License as published by the Free Software Foundation; either |
||||
* version 2.1 of the License, or (at your option) any later version. |
||||
* |
||||
* FFmpeg is distributed in the hope that it will be useful, |
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
* Lesser General Public License for more details. |
||||
* |
||||
* You should have received a copy of the GNU Lesser General Public |
||||
* License along with FFmpeg; if not, write to the Free Software |
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
*/ |
||||
|
||||
#include <stdint.h> |
||||
|
||||
#include "config.h" |
||||
#include "libavutil/attributes.h" |
||||
#include "libavutil/cpu.h" |
||||
#include "libavcodec/bswapdsp.h" |
||||
|
||||
/* Byte-swaps len 32-bit words from src into dst; implemented in RISC-V
 * assembly using the Zbb REV8 instruction. */
void ff_bswap32_buf_rvb(uint32_t *dst, const uint32_t *src, int len);
||||
|
||||
/**
 * Install the RISC-V specific byte-swapping routines into the DSP context.
 *
 * Only targets with 64-bit (or wider) registers are handled; on narrower
 * targets this function leaves the context untouched.
 *
 * @param c DSP context whose bswap_buf pointer may be overridden
 */
av_cold void ff_bswapdsp_init_riscv(BswapDSPContext *c)
{
#if (__riscv_xlen >= 64)
    const int flags = av_get_cpu_flags();

    /* Keep the existing implementation unless the CPU reports the basic
     * bit-manipulation extension at run time. */
    if (!(flags & AV_CPU_FLAG_RVB_BASIC))
        return;

    c->bswap_buf = ff_bswap32_buf_rvb;
#endif
}
@ -0,0 +1,68 @@ |
||||
/* |
||||
* Copyright © 2022 Rémi Denis-Courmont. |
||||
* |
||||
* This file is part of FFmpeg. |
||||
* |
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public |
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version. |
||||
* |
||||
* FFmpeg is distributed in the hope that it will be useful, |
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
* Lesser General Public License for more details. |
||||
* |
||||
* You should have received a copy of the GNU Lesser General Public |
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||
*/ |
||||
|
||||
#include "config.h" |
||||
#include "libavutil/riscv/asm.S" |
||||
|
||||
#if (__riscv_xlen >= 64) |
||||
/*
 * void ff_bswap32_buf_rvb(uint32_t *dst, const uint32_t *src, int len)
 *
 * Byte-swaps len 32-bit words from src into dst.
 *
 * Register usage (standard argument registers):
 *   a0 = dst (advanced as words are written)
 *   a1 = src (advanced as words are read)
 *   a2 = len on entry, later the end-of-output pointer (dst + 4 * len)
 *   a3 = end-of-output pointer for the 2-words-per-iteration loop
 *   t0-t2 = scratch
 *
 * rev8 reverses all XLEN/8 bytes of a register, so a zero-extended 32-bit
 * word ends up byte-swapped in the top 32 bits and has to be shifted back
 * down by XLEN - 32. For a 64-bit load of two consecutive words, the first
 * word comes out swapped in bits 63..32 and the second one in bits 31..0.
 */
func ff_bswap32_buf_rvb, zbb
        /*
         * Nothing to do for a zero (or negative) length. Without this check,
         * a source that is not 8-byte aligned would still go through the
         * alignment prologue below, touch one element, leave a2 = -1 and
         * make the main loop run with an end pointer it can never reach.
         */
        blez    a2, 5f
        andi    t0, a1, 4
        beqz    t0, 1f
        /* Align a1 (input) to 64-bit */
        lwu     t0, (a1)
        addi    a0, a0, 4
        rev8    t0, t0
        addi    a2, a2, -1
        srli    t0, t0, __riscv_xlen - 32
        addi    a1, a1, 4
        sw      t0, -4(a0)
1:
        /* a3 = remaining length rounded down to an even element count */
        andi    a3, a2, -2
        /*
         * a2 = a0 + 4 * a2, i.e. one past the last output word.
         * NOTE(review): sh2add is a Zba instruction, not Zbb; confirm that
         * the `zbb` argument passed to func also makes Zba available.
         */
        sh2add  a2, a2, a0
        beqz    a3, 3f
        /* a3 = a0 + 4 * a3, i.e. where the paired loop has to stop */
        sh2add  a3, a3, a0
2:      /* 2 elements (64 bits) at a time on a 64-bit boundary */
        ld      t0, (a1)
        addi    a0, a0, 8
        rev8    t0, t0
#if (__riscv_xlen == 64)
        /* upper half = swapped first word, lower half = swapped second word */
        srli    t2, t0, 32
        sw      t0, -4(a0)
#else
        /* wider registers: bring both swapped words down from the top bits */
        srli    t1, t0, __riscv_xlen - 64
        srli    t2, t0, __riscv_xlen - 32
        sw      t1, -4(a0)
#endif
        addi    a1, a1, 8
        sw      t2, -8(a0)
        bne     a0, a3, 2b
3:
        /* Done unless one odd element is left over */
        beq     a0, a2, 5f
4:      /* Process last element */
        lwu     t0, (a1)
        addi    a0, a0, 4
        rev8    t0, t0
        addi    a1, a1, 4
        srli    t0, t0, __riscv_xlen - 32
        sw      t0, -4(a0)
5:
        ret
endfunc
||||
#endif |
Loading…
Reference in new issue