/*
 * Loongson LSX optimized add_residual functions for HEVC decoding
 *
 * Copyright (c) 2023 Loongson Technology Corporation Limited
 * Contributed by jinbo <jinbo@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "loongson_asm.S"

/*
 * void ff_hevc_add_residual4x4_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
 */
.macro ADD_RES_LSX_4x4_8 |
|
vldrepl.w vr0, a0, 0 |
|
add.d t0, a0, a2 |
|
vldrepl.w vr1, t0, 0 |
|
vld vr2, a1, 0 |
|
|
|
vilvl.w vr1, vr1, vr0 |
|
vsllwil.hu.bu vr1, vr1, 0 |
|
vadd.h vr1, vr1, vr2 |
|
vssrani.bu.h vr1, vr1, 0 |
|
|
|
vstelm.w vr1, a0, 0, 0 |
|
vstelm.w vr1, t0, 0, 1 |
|
.endm |
|
|
|
function ff_hevc_add_residual4x4_8_lsx |
|
ADD_RES_LSX_4x4_8 |
|
alsl.d a0, a2, a0, 1 |
|
addi.d a1, a1, 16 |
|
ADD_RES_LSX_4x4_8 |
|
endfunc |
|
|
|
/*
 * void ff_hevc_add_residual8x8_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
 */
.macro ADD_RES_LSX_8x8_8 |
|
vldrepl.d vr0, a0, 0 |
|
add.d t0, a0, a2 |
|
vldrepl.d vr1, t0, 0 |
|
add.d t1, t0, a2 |
|
vldrepl.d vr2, t1, 0 |
|
add.d t2, t1, a2 |
|
vldrepl.d vr3, t2, 0 |
|
|
|
vld vr4, a1, 0 |
|
addi.d t3, zero, 16 |
|
vldx vr5, a1, t3 |
|
addi.d t4, a1, 32 |
|
vld vr6, t4, 0 |
|
vldx vr7, t4, t3 |
|
|
|
vsllwil.hu.bu vr0, vr0, 0 |
|
vsllwil.hu.bu vr1, vr1, 0 |
|
vsllwil.hu.bu vr2, vr2, 0 |
|
vsllwil.hu.bu vr3, vr3, 0 |
|
vadd.h vr0, vr0, vr4 |
|
vadd.h vr1, vr1, vr5 |
|
vadd.h vr2, vr2, vr6 |
|
vadd.h vr3, vr3, vr7 |
|
vssrani.bu.h vr1, vr0, 0 |
|
vssrani.bu.h vr3, vr2, 0 |
|
|
|
vstelm.d vr1, a0, 0, 0 |
|
vstelm.d vr1, t0, 0, 1 |
|
vstelm.d vr3, t1, 0, 0 |
|
vstelm.d vr3, t2, 0, 1 |
|
.endm |
|
|
|
function ff_hevc_add_residual8x8_8_lsx |
|
ADD_RES_LSX_8x8_8 |
|
alsl.d a0, a2, a0, 2 |
|
addi.d a1, a1, 64 |
|
ADD_RES_LSX_8x8_8 |
|
endfunc |
|
|
|
/*
 * void ff_hevc_add_residual16x16_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
 */
function ff_hevc_add_residual16x16_8_lsx |
|
.rept 8 |
|
vld vr0, a0, 0 |
|
vldx vr2, a0, a2 |
|
|
|
vld vr4, a1, 0 |
|
addi.d t0, zero, 16 |
|
vldx vr5, a1, t0 |
|
addi.d t1, a1, 32 |
|
vld vr6, t1, 0 |
|
vldx vr7, t1, t0 |
|
|
|
vexth.hu.bu vr1, vr0 |
|
vsllwil.hu.bu vr0, vr0, 0 |
|
vexth.hu.bu vr3, vr2 |
|
vsllwil.hu.bu vr2, vr2, 0 |
|
vadd.h vr0, vr0, vr4 |
|
vadd.h vr1, vr1, vr5 |
|
vadd.h vr2, vr2, vr6 |
|
vadd.h vr3, vr3, vr7 |
|
|
|
vssrani.bu.h vr1, vr0, 0 |
|
vssrani.bu.h vr3, vr2, 0 |
|
|
|
vst vr1, a0, 0 |
|
vstx vr3, a0, a2 |
|
|
|
alsl.d a0, a2, a0, 1 |
|
addi.d a1, a1, 64 |
|
.endr |
|
endfunc |
|
|
|
/*
 * void ff_hevc_add_residual32x32_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride)
 */
function ff_hevc_add_residual32x32_8_lsx |
|
.rept 32 |
|
vld vr0, a0, 0 |
|
addi.w t0, zero, 16 |
|
vldx vr2, a0, t0 |
|
|
|
vld vr4, a1, 0 |
|
vldx vr5, a1, t0 |
|
addi.d t1, a1, 32 |
|
vld vr6, t1, 0 |
|
vldx vr7, t1, t0 |
|
|
|
vexth.hu.bu vr1, vr0 |
|
vsllwil.hu.bu vr0, vr0, 0 |
|
vexth.hu.bu vr3, vr2 |
|
vsllwil.hu.bu vr2, vr2, 0 |
|
vadd.h vr0, vr0, vr4 |
|
vadd.h vr1, vr1, vr5 |
|
vadd.h vr2, vr2, vr6 |
|
vadd.h vr3, vr3, vr7 |
|
|
|
vssrani.bu.h vr1, vr0, 0 |
|
vssrani.bu.h vr3, vr2, 0 |
|
|
|
vst vr1, a0, 0 |
|
vstx vr3, a0, t0 |
|
|
|
add.d a0, a0, a2 |
|
addi.d a1, a1, 64 |
|
.endr |
|
endfunc
|
|
|