mirror of https://github.com/FFmpeg/FFmpeg.git
Simply using the Zbb REV8 instruction in a simple loop gives some significant savings:

    bswap_buf_c:       1081.0
    bswap_buf_rvb_b:    771.0

But we can also use the 64-bit REV8 as a pseudo-SIMD instruction with just one additional shift, and one fewer load, effectively doubling the bandwidth. Consequently, this patch is useful even if the compile-time target has Zbb enabled for C code:

    bswap_buf_c:       1081.0
    bswap_buf_rvb_b:    341.0 (this patch)

On the other hand, this approach fails miserably for bswap16_buf, as the ratio of shifts and stores becomes unfavorable compared to naïve C:

    bswap16_buf_c:     1542.0
    bswap16_buf_rvb_b: 1803.7

Unrolling to process 128 bits (4 samples) at a time actually worsens performance ever so slightly:

    bswap_buf_c:       1081.0
    bswap_buf_rvb_b:    408.5

pull/388/head
parent
37d5ddc317
commit
f0ef11ea83
5 changed files with 112 additions and 1 deletions
@ -0,0 +1,38 @@ |
|||||||
|
/*
|
||||||
|
* Copyright © 2022 Rémi Denis-Courmont. |
||||||
|
* |
||||||
|
* This file is part of FFmpeg. |
||||||
|
* |
||||||
|
* FFmpeg is free software; you can redistribute it and/or |
||||||
|
* modify it under the terms of the GNU Lesser General Public |
||||||
|
* License as published by the Free Software Foundation; either |
||||||
|
* version 2.1 of the License, or (at your option) any later version. |
||||||
|
* |
||||||
|
* FFmpeg is distributed in the hope that it will be useful, |
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||||
|
* Lesser General Public License for more details. |
||||||
|
* |
||||||
|
* You should have received a copy of the GNU Lesser General Public |
||||||
|
* License along with FFmpeg; if not, write to the Free Software |
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||||
|
*/ |
||||||
|
|
||||||
|
#include <stdint.h> |
||||||
|
|
||||||
|
#include "config.h" |
||||||
|
#include "libavutil/attributes.h" |
||||||
|
#include "libavutil/cpu.h" |
||||||
|
#include "libavcodec/bswapdsp.h" |
||||||
|
|
||||||
|
void ff_bswap32_buf_rvb(uint32_t *dst, const uint32_t *src, int len); |
||||||
|
|
||||||
|
av_cold void ff_bswapdsp_init_riscv(BswapDSPContext *c) |
||||||
|
{ |
||||||
|
#if (__riscv_xlen >= 64) |
||||||
|
int cpu_flags = av_get_cpu_flags(); |
||||||
|
|
||||||
|
if (cpu_flags & AV_CPU_FLAG_RVB_BASIC) |
||||||
|
c->bswap_buf = ff_bswap32_buf_rvb; |
||||||
|
#endif |
||||||
|
} |
@ -0,0 +1,68 @@ |
|||||||
|
/* |
||||||
|
* Copyright © 2022 Rémi Denis-Courmont. |
||||||
|
* |
||||||
|
* This file is part of FFmpeg. |
||||||
|
* |
||||||
|
* FFmpeg is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU Lesser General Public |
||||||
|
* License as published by the Free Software Foundation; either
|
||||||
|
* version 2.1 of the License, or (at your option) any later version. |
||||||
|
* |
||||||
|
* FFmpeg is distributed in the hope that it will be useful, |
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||||
|
* Lesser General Public License for more details. |
||||||
|
* |
||||||
|
* You should have received a copy of the GNU Lesser General Public |
||||||
|
* License along with FFmpeg; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||||
|
*/ |
||||||
|
|
||||||
|
#include "config.h" |
||||||
|
#include "libavutil/riscv/asm.S" |
||||||
|
|
||||||
|
#if (__riscv_xlen >= 64) |
||||||
|
/*
 * void ff_bswap32_buf_rvb(uint32_t *dst, const uint32_t *src, int len);
 *
 * Byte-swaps len 32-bit words from src into dst.
 *
 * Register use:
 *   a0  dst cursor (advanced as words are written)
 *   a1  src cursor (advanced as words are read)
 *   a2  on entry: len; after label 1: dst end pointer (a0 + 4 * len)
 *   a3  dst pointer at which the two-words-per-iteration loop stops
 *   t0-t2  scratch
 *
 * Strategy:
 *   - if src is only 4-byte aligned, swap one word so that src becomes
 *     8-byte aligned;
 *   - swap two words per iteration with one 64-bit load and one REV8;
 *   - swap the single trailing word if the remaining count was odd.
 * dst is only ever written with 32-bit stores.
 *
 * A non-positive len returns immediately. Without that check, len == 0
 * together with a src that is not 8-byte aligned would run the alignment
 * step anyway, drive the count to -1 and enter the pair loop with an end
 * pointer that lies behind the cursor.
 *
 * NOTE(review): sh2add belongs to Zba, while only zbb is requested from the
 * func macro below -- confirm that Zba is enabled for this file by the
 * build or by the macro.
 */
func ff_bswap32_buf_rvb, zbb
        blez    a2, 5f                  /* len <= 0: nothing to do */
        andi    t0, a1, 4
        beqz    t0, 1f
        /* Align a1 (input) to 64-bit: handle one leading word */
        lwu     t0, (a1)
        addi    a0, a0, 4
        rev8    t0, t0                  /* swapped word lands in the top 32 bits */
        addi    a2, a2, -1
        srli    t0, t0, __riscv_xlen - 32 /* bring it back down to bits 31:0 */
        addi    a1, a1, 4
        sw      t0, -4(a0)
1:
        andi    a3, a2, -2              /* a3 = remaining count rounded down to even */
        sh2add  a2, a2, a0              /* a2 = dst end pointer */
        beqz    a3, 3f                  /* fewer than two words left */
        sh2add  a3, a3, a0              /* a3 = dst pointer where the pair loop ends */
2:      /* 2 elements (64 bits) at a time on a 64-bit boundary */
        ld      t0, (a1)
        addi    a0, a0, 8
        rev8    t0, t0                  /* reverses all bytes: the two words trade places */
#if (__riscv_xlen == 64)
        srli    t2, t0, 32              /* t2 = swapped first word */
        sw      t0, -4(a0)              /* low half = swapped second word */
#else
        srli    t1, t0, __riscv_xlen - 64
        srli    t2, t0, __riscv_xlen - 32
        sw      t1, -4(a0)
#endif
        addi    a1, a1, 8
        sw      t2, -8(a0)
        bne     a0, a3, 2b
3:
        beq     a0, a2, 5f              /* no odd trailing word */
4:      /* Process last element */
        lwu     t0, (a1)
        addi    a0, a0, 4
        rev8    t0, t0
        addi    a1, a1, 4
        srli    t0, t0, __riscv_xlen - 32
        sw      t0, -4(a0)
5:
        ret
endfunc
||||||
|
#endif |
Loading…
Reference in new issue