Mirror of https://github.com/FFmpeg/FFmpeg.git — LoongArch HEVC motion-compensation assembly (4573 lines, 170 KiB).
/* |
|
* Copyright (c) 2023 Loongson Technology Corporation Limited |
|
* Contributed by jinbo <jinbo@loongson.cn> |
|
* |
|
* This file is part of FFmpeg. |
|
* |
|
* FFmpeg is free software; you can redistribute it and/or |
|
* modify it under the terms of the GNU Lesser General Public |
|
* License as published by the Free Software Foundation; either |
|
* version 2.1 of the License, or (at your option) any later version. |
|
* |
|
* FFmpeg is distributed in the hope that it will be useful, |
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
* Lesser General Public License for more details. |
|
* |
|
* You should have received a copy of the GNU Lesser General Public |
|
* License along with FFmpeg; if not, write to the Free Software |
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
*/ |
|
|
|
#include "loongson_asm.S" |
|
|
|
.extern ff_hevc_qpel_filters |
|
.extern ff_hevc_epel_filters |
|
|
|
/* LOAD_VAR: broadcast the uni_w weighted-prediction constants into SIMD regs.
 * Inputs:  a5 = denom, a6 = wx (weight), a7 = ox (offset added after shift)
 * Outputs: vr1/xr1 = wx, vr2/xr2 = rounding offset = 1 << (denom + 6 - 1),
 *          vr3/xr3 = shift = denom + 6, vr4/xr4 = ox
 * \bit == 128 fills LSX (vr) registers, otherwise LASX (xr) registers.
 * Clobbers: t1, t3, t4.
 */
.macro LOAD_VAR bit
    addi.w        t1,   a5,   6           //shift = denom + 6
    addi.w        t3,   zero, 1           //one
    sub.w         t4,   t1,   t3
    sll.w         t3,   t3,   t4          //offset = 1 << (shift - 1)
.if \bit == 128
    vreplgr2vr.w  vr1,  a6                //wx
    vreplgr2vr.w  vr2,  t3                //offset
    vreplgr2vr.w  vr3,  t1                //shift
    vreplgr2vr.w  vr4,  a7                //ox
.else
    xvreplgr2vr.w xr1,  a6
    xvreplgr2vr.w xr2,  t3
    xvreplgr2vr.w xr3,  t1
    xvreplgr2vr.w xr4,  a7
.endif
.endm
|
|
|
/* Weighted copy of one row of 8 (or 6) pixels, LSX.
 * Loads 8 bytes from \src0, widens u8->u32, applies
 * ((pix << 6) * wx + offset) >> shift + ox, saturate-packs back to u8
 * and stores 8 bytes (\w == 8) or 6 bytes (\w == 6) at \dst0.
 * Uses constants prepared by LOAD_VAR (vr1..vr4); clobbers vr0, vr5.
 */
.macro HEVC_PEL_UNI_W_PIXELS8_LSX src0, dst0, w
    vldrepl.d     vr0,  \src0, 0
    vsllwil.hu.bu vr0,  vr0,  0           //u8 -> u16, low 8 lanes
    vexth.wu.hu   vr5,  vr0               //u16 -> u32, high 4 lanes
    vsllwil.wu.hu vr0,  vr0,  0           //u16 -> u32, low 4 lanes
    vslli.w       vr0,  vr0,  6           //src << 6 (bit depth 8)
    vslli.w       vr5,  vr5,  6
    vmul.w        vr0,  vr0,  vr1         //* wx
    vmul.w        vr5,  vr5,  vr1
    vadd.w        vr0,  vr0,  vr2         //+ offset
    vadd.w        vr5,  vr5,  vr2
    vsra.w        vr0,  vr0,  vr3         //>> shift
    vsra.w        vr5,  vr5,  vr3
    vadd.w        vr0,  vr0,  vr4         //+ ox
    vadd.w        vr5,  vr5,  vr4
    vssrani.h.w   vr5,  vr0,  0           //sat-pack s32 -> s16
    vssrani.bu.h  vr5,  vr5,  0           //sat-pack s16 -> u8
.if \w == 6
    fst.s         f5,   \dst0, 0          //first 4 bytes
    vstelm.h      vr5,  \dst0, 4, 2       //remaining 2 bytes
.else
    fst.d         f5,   \dst0, 0
.endif
.endm
|
|
|
/* Weighted copy of TWO rows of 8 (or 6) pixels at once, LASX.
 * Row 0 from \src0, row 1 from \src0 + a3 (src stride); results go to
 * \dst0 and \dst0 + a1 (dst stride).  Same arithmetic as the LSX
 * single-row variant, done on one 256-bit vector.
 * Clobbers xr0, xr5, t2, t3.
 */
.macro HEVC_PEL_UNI_W_PIXELS8x2_LASX src0, dst0, w
    vldrepl.d      vr0,  \src0, 0
    add.d          t2,   \src0, a3
    vldrepl.d      vr5,  t2,   0
    xvpermi.q      xr0,  xr5,  0x02       //row0 | row1 in one xr
    xvsllwil.hu.bu xr0,  xr0,  0
    xvexth.wu.hu   xr5,  xr0
    xvsllwil.wu.hu xr0,  xr0,  0
    xvslli.w       xr0,  xr0,  6
    xvslli.w       xr5,  xr5,  6
    xvmul.w        xr0,  xr0,  xr1        //* wx
    xvmul.w        xr5,  xr5,  xr1
    xvadd.w        xr0,  xr0,  xr2        //+ offset
    xvadd.w        xr5,  xr5,  xr2
    xvsra.w        xr0,  xr0,  xr3        //>> shift
    xvsra.w        xr5,  xr5,  xr3
    xvadd.w        xr0,  xr0,  xr4        //+ ox
    xvadd.w        xr5,  xr5,  xr4
    xvssrani.h.w   xr5,  xr0,  0
    xvpermi.q      xr0,  xr5,  0x01
    xvssrani.bu.h  xr0,  xr5,  0          //both rows packed into vr0
    add.d          t3,   \dst0, a1
.if \w == 6
    vstelm.w       vr0,  \dst0, 0, 0      //row 0: 4 + 2 bytes
    vstelm.h       vr0,  \dst0, 4, 2
    vstelm.w       vr0,  t3,    0, 2      //row 1: 4 + 2 bytes
    vstelm.h       vr0,  t3,    4, 6
.else
    vstelm.d       vr0,  \dst0, 0, 0      //row 0: 8 bytes
    vstelm.d       vr0,  t3,    0, 1      //row 1: 8 bytes
.endif
.endm
|
|
|
/* Weighted copy of one row of 16 pixels, LSX.
 * Widens the 16 bytes to four groups of 4x u32 (vr5..vr8), applies the
 * uni_w formula, and saturate-packs back to 16 bytes stored at \dst0.
 * Clobbers vr0, vr5..vr8.
 */
.macro HEVC_PEL_UNI_W_PIXELS16_LSX src0, dst0
    vld           vr0,  \src0, 0
    vexth.hu.bu   vr7,  vr0               //bytes 8..15 -> u16
    vexth.wu.hu   vr8,  vr7               //lanes 12..15 -> u32
    vsllwil.wu.hu vr7,  vr7,  0           //lanes 8..11  -> u32
    vsllwil.hu.bu vr5,  vr0,  0           //bytes 0..7  -> u16
    vexth.wu.hu   vr6,  vr5               //lanes 4..7  -> u32
    vsllwil.wu.hu vr5,  vr5,  0           //lanes 0..3  -> u32
    vslli.w       vr5,  vr5,  6           //src << 6
    vslli.w       vr6,  vr6,  6
    vslli.w       vr7,  vr7,  6
    vslli.w       vr8,  vr8,  6
    vmul.w        vr5,  vr5,  vr1         //* wx
    vmul.w        vr6,  vr6,  vr1
    vmul.w        vr7,  vr7,  vr1
    vmul.w        vr8,  vr8,  vr1
    vadd.w        vr5,  vr5,  vr2         //+ offset
    vadd.w        vr6,  vr6,  vr2
    vadd.w        vr7,  vr7,  vr2
    vadd.w        vr8,  vr8,  vr2
    vsra.w        vr5,  vr5,  vr3         //>> shift
    vsra.w        vr6,  vr6,  vr3
    vsra.w        vr7,  vr7,  vr3
    vsra.w        vr8,  vr8,  vr3
    vadd.w        vr5,  vr5,  vr4         //+ ox
    vadd.w        vr6,  vr6,  vr4
    vadd.w        vr7,  vr7,  vr4
    vadd.w        vr8,  vr8,  vr4
    vssrani.h.w   vr6,  vr5,  0           //sat-pack to s16
    vssrani.h.w   vr8,  vr7,  0
    vssrani.bu.h  vr8,  vr6,  0           //sat-pack to u8 (16 bytes)
    vst           vr8,  \dst0, 0
.endm
|
|
|
/* Weighted copy of one row of 16 pixels, LASX (two 8-lane u32 groups).
 * Same arithmetic as the LSX variant but using 256-bit registers.
 * Clobbers xr0, xr5..xr7.
 */
.macro HEVC_PEL_UNI_W_PIXELS16_LASX src0, dst0
    vld            vr0,  \src0, 0
    xvpermi.d      xr0,  xr0,  0xd8       //spread low/high 8 bytes per 128-bit lane
    xvsllwil.hu.bu xr0,  xr0,  0
    xvexth.wu.hu   xr6,  xr0
    xvsllwil.wu.hu xr5,  xr0,  0
    xvslli.w       xr5,  xr5,  6          //src << 6
    xvslli.w       xr6,  xr6,  6
    xvmul.w        xr5,  xr5,  xr1        //* wx
    xvmul.w        xr6,  xr6,  xr1
    xvadd.w        xr5,  xr5,  xr2        //+ offset
    xvadd.w        xr6,  xr6,  xr2
    xvsra.w        xr5,  xr5,  xr3        //>> shift
    xvsra.w        xr6,  xr6,  xr3
    xvadd.w        xr5,  xr5,  xr4        //+ ox
    xvadd.w        xr6,  xr6,  xr4
    xvssrani.h.w   xr6,  xr5,  0
    xvpermi.q      xr7,  xr6,  0x01
    xvssrani.bu.h  xr7,  xr6,  0          //16 result bytes in vr7
    vst            vr7,  \dst0, 0
.endm
|
|
|
/* Weighted copy of 32 pixels per iteration, LASX.
 * \w == 16: processes TWO rows of 16 (second row at \src0 + a3, stored
 *           at \dst0 + a1);  \w == 24 / 32: one row of 24 or 32 pixels.
 * Clobbers xr0, xr5..xr8, t2.
 */
.macro HEVC_PEL_UNI_W_PIXELS32_LASX src0, dst0, w
.if \w == 16
    vld            vr0,  \src0, 0
    add.d          t2,   \src0, a3
    vld            vr5,  t2,   0
    xvpermi.q      xr0,  xr5,  0x02       //row0 | row1
.else //w=24/32
    xvld           xr0,  \src0, 0
.endif
    xvexth.hu.bu   xr7,  xr0              //widen u8 -> u32 in 4 groups
    xvexth.wu.hu   xr8,  xr7
    xvsllwil.wu.hu xr7,  xr7,  0
    xvsllwil.hu.bu xr5,  xr0,  0
    xvexth.wu.hu   xr6,  xr5
    xvsllwil.wu.hu xr5,  xr5,  0
    xvslli.w       xr5,  xr5,  6          //src << 6
    xvslli.w       xr6,  xr6,  6
    xvslli.w       xr7,  xr7,  6
    xvslli.w       xr8,  xr8,  6
    xvmul.w        xr5,  xr5,  xr1        //* wx
    xvmul.w        xr6,  xr6,  xr1
    xvmul.w        xr7,  xr7,  xr1
    xvmul.w        xr8,  xr8,  xr1
    xvadd.w        xr5,  xr5,  xr2        //+ offset
    xvadd.w        xr6,  xr6,  xr2
    xvadd.w        xr7,  xr7,  xr2
    xvadd.w        xr8,  xr8,  xr2
    xvsra.w        xr5,  xr5,  xr3        //>> shift
    xvsra.w        xr6,  xr6,  xr3
    xvsra.w        xr7,  xr7,  xr3
    xvsra.w        xr8,  xr8,  xr3
    xvadd.w        xr5,  xr5,  xr4        //+ ox
    xvadd.w        xr6,  xr6,  xr4
    xvadd.w        xr7,  xr7,  xr4
    xvadd.w        xr8,  xr8,  xr4
    xvssrani.h.w   xr6,  xr5,  0          //sat-pack to s16
    xvssrani.h.w   xr8,  xr7,  0
    xvssrani.bu.h  xr8,  xr6,  0          //sat-pack to u8 (32 bytes)
.if \w == 16
    vst            vr8,  \dst0, 0         //row 0
    add.d          t2,   \dst0, a1
    xvpermi.q      xr8,  xr8,  0x01
    vst            vr8,  t2,   0          //row 1
.elseif \w == 24
    vst            vr8,  \dst0, 0         //16 bytes
    xvstelm.d      xr8,  \dst0, 16, 2     //+ 8 bytes
.else
    xvst           xr8,  \dst0, 0         //32 bytes
.endif
.endm
|
|
|
/* |
|
* void FUNC(put_hevc_pel_uni_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, |
|
* const uint8_t *_src, ptrdiff_t _srcstride, |
|
* int height, int denom, int wx, int ox, |
|
* intptr_t mx, intptr_t my, int width) |
|
*/ |
|
/* put_hevc_pel_uni_w_pixels, width 4, 8-bit, LSX.
 * a0 = dst, a1 = dststride, a2 = src, a3 = srcstride, a4 = height,
 * a5 = denom, a6 = wx, a7 = ox.  Processes two rows per iteration
 * (height is assumed even for width-4 HEVC blocks).
 */
function ff_hevc_put_hevc_pel_uni_w_pixels4_8_lsx
    LOAD_VAR 128
    srli.w        t0,   a4,   1           //two rows per loop
.LOOP_PIXELS4:
    vldrepl.w     vr0,  a2,   0           //row 0 (4 bytes)
    add.d         t1,   a2,   a3
    vldrepl.w     vr5,  t1,   0           //row 1
    vsllwil.hu.bu vr0,  vr0,  0           //u8 -> u32
    vsllwil.wu.hu vr0,  vr0,  0
    vsllwil.hu.bu vr5,  vr5,  0
    vsllwil.wu.hu vr5,  vr5,  0
    vslli.w       vr0,  vr0,  6           //src << 6
    vslli.w       vr5,  vr5,  6
    vmul.w        vr0,  vr0,  vr1         //* wx
    vmul.w        vr5,  vr5,  vr1
    vadd.w        vr0,  vr0,  vr2         //+ offset
    vadd.w        vr5,  vr5,  vr2
    vsra.w        vr0,  vr0,  vr3         //>> shift
    vsra.w        vr5,  vr5,  vr3
    vadd.w        vr0,  vr0,  vr4         //+ ox
    vadd.w        vr5,  vr5,  vr4
    vssrani.h.w   vr5,  vr0,  0
    vssrani.bu.h  vr5,  vr5,  0
    fst.s         f5,   a0,   0           //store row 0
    add.d         t2,   a0,   a1
    vstelm.w      vr5,  t2,   0, 1        //store row 1
    alsl.d        a2,   a3,   a2,  1      //src += 2 * srcstride
    alsl.d        a0,   a1,   a0,  1      //dst += 2 * dststride
    addi.w        t0,   t0,   -1
    bnez          t0,   .LOOP_PIXELS4
endfunc
|
|
|
/* Width-6 weighted copy, LSX: one row per iteration. */
function ff_hevc_put_hevc_pel_uni_w_pixels6_8_lsx
    LOAD_VAR 128
.LOOP_PIXELS6:
    HEVC_PEL_UNI_W_PIXELS8_LSX a2, a0, 6
    add.d    a2,  a2,  a3
    add.d    a0,  a0,  a1
    addi.w   a4,  a4,  -1
    bnez     a4,  .LOOP_PIXELS6
endfunc

/* Width-6 weighted copy, LASX: two rows per iteration (height even). */
function ff_hevc_put_hevc_pel_uni_w_pixels6_8_lasx
    LOAD_VAR 256
    srli.w   t0,  a4,  1
.LOOP_PIXELS6_LASX:
    HEVC_PEL_UNI_W_PIXELS8x2_LASX a2, a0, 6
    alsl.d   a2,  a3,  a2,  1              //src += 2 * srcstride
    alsl.d   a0,  a1,  a0,  1              //dst += 2 * dststride
    addi.w   t0,  t0,  -1
    bnez     t0,  .LOOP_PIXELS6_LASX
endfunc

/* Width-8 weighted copy, LSX: one row per iteration. */
function ff_hevc_put_hevc_pel_uni_w_pixels8_8_lsx
    LOAD_VAR 128
.LOOP_PIXELS8:
    HEVC_PEL_UNI_W_PIXELS8_LSX a2, a0, 8
    add.d    a2,  a2,  a3
    add.d    a0,  a0,  a1
    addi.w   a4,  a4,  -1
    bnez     a4,  .LOOP_PIXELS8
endfunc

/* Width-8 weighted copy, LASX: two rows per iteration (height even). */
function ff_hevc_put_hevc_pel_uni_w_pixels8_8_lasx
    LOAD_VAR 256
    srli.w   t0,  a4,  1
.LOOP_PIXELS8_LASX:
    HEVC_PEL_UNI_W_PIXELS8x2_LASX a2, a0, 8
    alsl.d   a2,  a3,  a2,  1
    alsl.d   a0,  a1,  a0,  1
    addi.w   t0,  t0,  -1
    bnez     t0,  .LOOP_PIXELS8_LASX
endfunc
|
|
|
/* Width-12 weighted copy, LSX: widens 12 of the 16 loaded bytes to u32
 * in three groups (vr5..vr7), applies the uni_w formula and stores
 * 8 + 4 bytes per row.
 */
function ff_hevc_put_hevc_pel_uni_w_pixels12_8_lsx
    LOAD_VAR 128
.LOOP_PIXELS12:
    vld           vr0,  a2,   0
    vexth.hu.bu   vr7,  vr0               //bytes 8..15 -> u16 (only 8..11 used)
    vsllwil.wu.hu vr7,  vr7,  0           //lanes 8..11 -> u32
    vsllwil.hu.bu vr5,  vr0,  0           //bytes 0..7 -> u16
    vexth.wu.hu   vr6,  vr5               //lanes 4..7 -> u32
    vsllwil.wu.hu vr5,  vr5,  0           //lanes 0..3 -> u32
    vslli.w       vr5,  vr5,  6           //src << 6
    vslli.w       vr6,  vr6,  6
    vslli.w       vr7,  vr7,  6
    vmul.w        vr5,  vr5,  vr1         //* wx
    vmul.w        vr6,  vr6,  vr1
    vmul.w        vr7,  vr7,  vr1
    vadd.w        vr5,  vr5,  vr2         //+ offset
    vadd.w        vr6,  vr6,  vr2
    vadd.w        vr7,  vr7,  vr2
    vsra.w        vr5,  vr5,  vr3         //>> shift
    vsra.w        vr6,  vr6,  vr3
    vsra.w        vr7,  vr7,  vr3
    vadd.w        vr5,  vr5,  vr4         //+ ox
    vadd.w        vr6,  vr6,  vr4
    vadd.w        vr7,  vr7,  vr4
    vssrani.h.w   vr6,  vr5,  0
    vssrani.h.w   vr7,  vr7,  0
    vssrani.bu.h  vr7,  vr6,  0
    fst.d         f7,   a0,   0           //8 bytes
    vstelm.w      vr7,  a0,   8, 2        //+ 4 bytes
    add.d         a2,   a2,   a3
    add.d         a0,   a0,   a1
    addi.w        a4,   a4,   -1
    bnez          a4,   .LOOP_PIXELS12
endfunc

/* Width-12 weighted copy, LASX: same arithmetic on one 256-bit vector. */
function ff_hevc_put_hevc_pel_uni_w_pixels12_8_lasx
    LOAD_VAR 256
.LOOP_PIXELS12_LASX:
    vld            vr0,  a2,   0
    xvpermi.d      xr0,  xr0,  0xd8
    xvsllwil.hu.bu xr0,  xr0,  0
    xvexth.wu.hu   xr6,  xr0
    xvsllwil.wu.hu xr5,  xr0,  0
    xvslli.w       xr5,  xr5,  6          //src << 6
    xvslli.w       xr6,  xr6,  6
    xvmul.w        xr5,  xr5,  xr1        //* wx
    xvmul.w        xr6,  xr6,  xr1
    xvadd.w        xr5,  xr5,  xr2        //+ offset
    xvadd.w        xr6,  xr6,  xr2
    xvsra.w        xr5,  xr5,  xr3        //>> shift
    xvsra.w        xr6,  xr6,  xr3
    xvadd.w        xr5,  xr5,  xr4        //+ ox
    xvadd.w        xr6,  xr6,  xr4
    xvssrani.h.w   xr6,  xr5,  0
    xvpermi.q      xr7,  xr6,  0x01
    xvssrani.bu.h  xr7,  xr6,  0
    fst.d          f7,   a0,   0          //8 bytes
    vstelm.w       vr7,  a0,   8, 2       //+ 4 bytes
    add.d          a2,   a2,   a3
    add.d          a0,   a0,   a1
    addi.w         a4,   a4,   -1
    bnez           a4,   .LOOP_PIXELS12_LASX
endfunc
|
|
|
/* Width-16 weighted copy, LSX: one row per iteration. */
function ff_hevc_put_hevc_pel_uni_w_pixels16_8_lsx
    LOAD_VAR 128
.LOOP_PIXELS16:
    HEVC_PEL_UNI_W_PIXELS16_LSX a2, a0
    add.d    a2,  a2,  a3
    add.d    a0,  a0,  a1
    addi.w   a4,  a4,  -1
    bnez     a4,  .LOOP_PIXELS16
endfunc

/* Width-16 weighted copy, LASX: two rows per iteration (height even). */
function ff_hevc_put_hevc_pel_uni_w_pixels16_8_lasx
    LOAD_VAR 256
    srli.w   t0,  a4,  1
.LOOP_PIXELS16_LASX:
    HEVC_PEL_UNI_W_PIXELS32_LASX a2, a0, 16
    alsl.d   a2,  a3,  a2,  1              //src += 2 * srcstride
    alsl.d   a0,  a1,  a0,  1              //dst += 2 * dststride
    addi.w   t0,  t0,  -1
    bnez     t0,  .LOOP_PIXELS16_LASX
endfunc

/* Width-24 weighted copy, LSX: 16-pixel macro + 8-pixel macro per row. */
function ff_hevc_put_hevc_pel_uni_w_pixels24_8_lsx
    LOAD_VAR 128
.LOOP_PIXELS24:
    HEVC_PEL_UNI_W_PIXELS16_LSX a2, a0
    addi.d   t0,  a2,  16
    addi.d   t1,  a0,  16
    HEVC_PEL_UNI_W_PIXELS8_LSX t0, t1, 8
    add.d    a2,  a2,  a3
    add.d    a0,  a0,  a1
    addi.w   a4,  a4,  -1
    bnez     a4,  .LOOP_PIXELS24
endfunc

/* Width-24 weighted copy, LASX: one row per iteration. */
function ff_hevc_put_hevc_pel_uni_w_pixels24_8_lasx
    LOAD_VAR 256
.LOOP_PIXELS24_LASX:
    HEVC_PEL_UNI_W_PIXELS32_LASX a2, a0, 24
    add.d    a2,  a2,  a3
    add.d    a0,  a0,  a1
    addi.w   a4,  a4,  -1
    bnez     a4,  .LOOP_PIXELS24_LASX
endfunc
|
|
|
/* Width-32 weighted copy, LSX: two 16-pixel macro calls per row. */
function ff_hevc_put_hevc_pel_uni_w_pixels32_8_lsx
    LOAD_VAR 128
.LOOP_PIXELS32:
    HEVC_PEL_UNI_W_PIXELS16_LSX a2, a0
    addi.d   t0,  a2,  16
    addi.d   t1,  a0,  16
    HEVC_PEL_UNI_W_PIXELS16_LSX t0, t1
    add.d    a2,  a2,  a3
    add.d    a0,  a0,  a1
    addi.w   a4,  a4,  -1
    bnez     a4,  .LOOP_PIXELS32
endfunc

/* Width-32 weighted copy, LASX: one 32-pixel macro call per row. */
function ff_hevc_put_hevc_pel_uni_w_pixels32_8_lasx
    LOAD_VAR 256
.LOOP_PIXELS32_LASX:
    HEVC_PEL_UNI_W_PIXELS32_LASX a2, a0, 32
    add.d    a2,  a2,  a3
    add.d    a0,  a0,  a1
    addi.w   a4,  a4,  -1
    bnez     a4,  .LOOP_PIXELS32_LASX
endfunc

/* Width-48 weighted copy, LSX: three 16-pixel macro calls per row. */
function ff_hevc_put_hevc_pel_uni_w_pixels48_8_lsx
    LOAD_VAR 128
.LOOP_PIXELS48:
    HEVC_PEL_UNI_W_PIXELS16_LSX a2, a0
    addi.d   t0,  a2,  16
    addi.d   t1,  a0,  16
    HEVC_PEL_UNI_W_PIXELS16_LSX t0, t1
    addi.d   t0,  a2,  32
    addi.d   t1,  a0,  32
    HEVC_PEL_UNI_W_PIXELS16_LSX t0, t1
    add.d    a2,  a2,  a3
    add.d    a0,  a0,  a1
    addi.w   a4,  a4,  -1
    bnez     a4,  .LOOP_PIXELS48
endfunc

/* Width-48 weighted copy, LASX: 32-pixel + 16-pixel macro per row. */
function ff_hevc_put_hevc_pel_uni_w_pixels48_8_lasx
    LOAD_VAR 256
.LOOP_PIXELS48_LASX:
    HEVC_PEL_UNI_W_PIXELS32_LASX a2, a0, 32
    addi.d   t0,  a2,  32
    addi.d   t1,  a0,  32
    HEVC_PEL_UNI_W_PIXELS16_LASX t0, t1
    add.d    a2,  a2,  a3
    add.d    a0,  a0,  a1
    addi.w   a4,  a4,  -1
    bnez     a4,  .LOOP_PIXELS48_LASX
endfunc

/* Width-64 weighted copy, LSX: four 16-pixel macro calls per row. */
function ff_hevc_put_hevc_pel_uni_w_pixels64_8_lsx
    LOAD_VAR 128
.LOOP_PIXELS64:
    HEVC_PEL_UNI_W_PIXELS16_LSX a2, a0
    addi.d   t0,  a2,  16
    addi.d   t1,  a0,  16
    HEVC_PEL_UNI_W_PIXELS16_LSX t0, t1
    addi.d   t0,  a2,  32
    addi.d   t1,  a0,  32
    HEVC_PEL_UNI_W_PIXELS16_LSX t0, t1
    addi.d   t0,  a2,  48
    addi.d   t1,  a0,  48
    HEVC_PEL_UNI_W_PIXELS16_LSX t0, t1
    add.d    a2,  a2,  a3
    add.d    a0,  a0,  a1
    addi.w   a4,  a4,  -1
    bnez     a4,  .LOOP_PIXELS64
endfunc

/* Width-64 weighted copy, LASX: two 32-pixel macro calls per row. */
function ff_hevc_put_hevc_pel_uni_w_pixels64_8_lasx
    LOAD_VAR 256
.LOOP_PIXELS64_LASX:
    HEVC_PEL_UNI_W_PIXELS32_LASX a2, a0, 32
    addi.d   t0,  a2,  32
    addi.d   t1,  a0,  32
    HEVC_PEL_UNI_W_PIXELS32_LASX t0, t1, 32
    add.d    a2,  a2,  a3
    add.d    a0,  a0,  a1
    addi.w   a4,  a4,  -1
    bnez     a4,  .LOOP_PIXELS64_LASX
endfunc
|
|
|
/* Horizontal-reduce the 8 s16 lanes of \in0 pairwise twice, yielding two
 * s64 sums (used to finish an 8-tap vdp2 dot product).
 * NOTE: deliberately named like an instruction; the assembler expands the
 * macro instead.
 */
.macro vhaddw.d.h in0
    vhaddw.w.h   \in0, \in0, \in0
    vhaddw.d.w   \in0, \in0, \in0
.endm

/* 256-bit (LASX) variant of the reduction above: four s64 sums. */
.macro xvhaddw.d.h in0
    xvhaddw.w.h  \in0, \in0, \in0
    xvhaddw.d.w  \in0, \in0, \in0
.endm
|
|
|
/* |
|
* void FUNC(put_hevc_qpel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride, |
|
* const uint8_t *_src, ptrdiff_t _srcstride, |
|
* int height, int denom, int wx, int ox, |
|
* intptr_t mx, intptr_t my, int width) |
|
*/ |
|
/* qpel_uni_w vertical filter, width 4, 8-bit, LSX.
 * a0 = dst, a1 = dststride, a2 = src, a3 = srcstride, a4 = height,
 * a5 = denom, a6 = wx, a7 = ox, sp+8 = my (vertical fraction).
 * The 8-tap filter needs 7 rows of context: src is rewound by
 * 3 * stride, the first 7 rows are loaded and transposed into column
 * vectors, then each loop iteration inserts two new rows and produces
 * two output rows (height assumed even).
 */
function ff_hevc_put_hevc_qpel_uni_w_v4_8_lsx
    LOAD_VAR 128
    ld.d       t0,   sp,   8              //my
    addi.d     t0,   t0,   -1
    slli.w     t0,   t0,   4              //16-byte filter rows
    la.local   t1,   ff_hevc_qpel_filters
    vldx       vr5,  t1,   t0             //filter
    slli.d     t0,   a3,   1              //stride * 2
    add.d      t1,   t0,   a3             //stride * 3
    add.d      t2,   t1,   a3             //stride * 4
    sub.d      a2,   a2,   t1             //src -= stride*3
    fld.s      f6,   a2,   0              //0
    fldx.s     f7,   a2,   a3             //1
    fldx.s     f8,   a2,   t0             //2
    add.d      a2,   a2,   t1
    fld.s      f9,   a2,   0              //3
    fldx.s     f10,  a2,   a3             //4
    fldx.s     f11,  a2,   t0             //5
    fldx.s     f12,  a2,   t1             //6
    add.d      a2,   a2,   t2
    vilvl.b    vr6,  vr7,  vr6            //transpose rows -> columns
    vilvl.b    vr7,  vr9,  vr8
    vilvl.b    vr8,  vr11, vr10
    vilvl.b    vr9,  vr13, vr12
    vilvl.h    vr6,  vr7,  vr6
    vilvl.h    vr7,  vr9,  vr8
    vilvl.w    vr8,  vr7,  vr6            //columns 0,1 (7 taps each)
    vilvh.w    vr9,  vr7,  vr6            //columns 2,3
.LOOP_V4:
    fld.s      f13,  a2,   0              //7
    fldx.s     f14,  a2,   a3             //8 next loop
    add.d      a2,   a2,   t0
    vextrins.b vr8,  vr13, 0x70           //insert row 7 into each column
    vextrins.b vr8,  vr13, 0xf1
    vextrins.b vr9,  vr13, 0x72
    vextrins.b vr9,  vr13, 0xf3
    vbsrl.v    vr10, vr8,  1              //shift window for second row
    vbsrl.v    vr11, vr9,  1
    vextrins.b vr10, vr14, 0x70           //insert row 8
    vextrins.b vr10, vr14, 0xf1
    vextrins.b vr11, vr14, 0x72
    vextrins.b vr11, vr14, 0xf3
    vdp2.h.bu.b vr6, vr8,  vr5            //QPEL_FILTER(src, stride)
    vdp2.h.bu.b vr7, vr9,  vr5
    vdp2.h.bu.b vr12, vr10, vr5
    vdp2.h.bu.b vr13, vr11, vr5
    vbsrl.v    vr8,  vr10, 1              //keep 7-row context for next loop
    vbsrl.v    vr9,  vr11, 1
    vhaddw.d.h vr6                        //finish 8-tap dot products
    vhaddw.d.h vr7
    vhaddw.d.h vr12
    vhaddw.d.h vr13
    vpickev.w  vr6,  vr7,  vr6
    vpickev.w  vr12, vr13, vr12
    vmulwev.w.h vr6,  vr6,  vr1           //QPEL_FILTER(src, stride) * wx
    vmulwev.w.h vr12, vr12, vr1
    vadd.w     vr6,  vr6,  vr2            //+ offset
    vsra.w     vr6,  vr6,  vr3            //>> shift
    vadd.w     vr6,  vr6,  vr4            //+ ox
    vadd.w     vr12, vr12, vr2
    vsra.w     vr12, vr12, vr3
    vadd.w     vr12, vr12, vr4
    vssrani.h.w  vr12, vr6, 0
    vssrani.bu.h vr12, vr12, 0
    fst.s      f12,  a0,   0              //store row 0
    add.d      a0,   a0,   a1
    vstelm.w   vr12, a0,   0, 1           //store row 1
    add.d      a0,   a0,   a1
    addi.d     a4,   a4,   -2
    bnez       a4,   .LOOP_V4
endfunc
|
|
|
/* qpel_uni_w vertical filter, width 6, 8-bit, LSX.
 * Same argument layout as the width-4 variant; one output row per loop.
 * The first 7 rows are transposed so each vector holds per-column tap
 * windows; each iteration inserts the newly loaded row and filters.
 */
function ff_hevc_put_hevc_qpel_uni_w_v6_8_lsx
    LOAD_VAR 128
    ld.d       t0,   sp,   8              //my
    addi.d     t0,   t0,   -1
    slli.w     t0,   t0,   4
    la.local   t1,   ff_hevc_qpel_filters
    vldx       vr5,  t1,   t0             //filter
    slli.d     t0,   a3,   1              //stride * 2
    add.d      t1,   t0,   a3             //stride * 3
    add.d      t2,   t1,   a3             //stride * 4
    sub.d      a2,   a2,   t1             //src -= stride*3
    fld.d      f6,   a2,   0
    fldx.d     f7,   a2,   a3
    fldx.d     f8,   a2,   t0
    add.d      a2,   a2,   t1
    fld.d      f9,   a2,   0
    fldx.d     f10,  a2,   a3
    fldx.d     f11,  a2,   t0
    fldx.d     f12,  a2,   t1
    add.d      a2,   a2,   t2
    vilvl.b    vr6,  vr7,  vr6            //transpose 8x6 to 3x16
    vilvl.b    vr7,  vr9,  vr8
    vilvl.b    vr8,  vr11, vr10
    vilvl.b    vr9,  vr13, vr12
    vilvl.h    vr10, vr7,  vr6
    vilvh.h    vr11, vr7,  vr6
    vilvl.h    vr12, vr9,  vr8
    vilvh.h    vr13, vr9,  vr8
    vilvl.w    vr6,  vr12, vr10           //columns 0,1
    vilvh.w    vr7,  vr12, vr10           //columns 2,3
    vilvl.w    vr8,  vr13, vr11           //columns 4,5
.LOOP_V6:
    fld.d      f13,  a2,   0              //newest row
    add.d      a2,   a2,   a3
    vextrins.b vr6,  vr13, 0x70           //insert it into each column window
    vextrins.b vr6,  vr13, 0xf1
    vextrins.b vr7,  vr13, 0x72
    vextrins.b vr7,  vr13, 0xf3
    vextrins.b vr8,  vr13, 0x74
    vextrins.b vr8,  vr13, 0xf5
    vdp2.h.bu.b vr10, vr6, vr5            //QPEL_FILTER(src, stride)
    vdp2.h.bu.b vr11, vr7, vr5
    vdp2.h.bu.b vr12, vr8, vr5
    vbsrl.v    vr6,  vr6,  1              //slide windows for next row
    vbsrl.v    vr7,  vr7,  1
    vbsrl.v    vr8,  vr8,  1
    vhaddw.d.h vr10
    vhaddw.d.h vr11
    vhaddw.d.h vr12
    vpickev.w  vr10, vr11, vr10           //results 0..3
    vpickev.w  vr11, vr13, vr12           //results 4,5 (upper lanes unused)
    vmulwev.w.h vr10, vr10, vr1           //QPEL_FILTER(src, stride) * wx
    vmulwev.w.h vr11, vr11, vr1
    vadd.w     vr10, vr10, vr2            //+ offset
    vadd.w     vr11, vr11, vr2
    vsra.w     vr10, vr10, vr3            //>> shift
    vsra.w     vr11, vr11, vr3
    vadd.w     vr10, vr10, vr4            //+ ox
    vadd.w     vr11, vr11, vr4
    vssrani.h.w  vr11, vr10, 0
    vssrani.bu.h vr11, vr11, 0
    fst.s      f11,  a0,   0              //4 bytes
    vstelm.h   vr11, a0,   4, 2           //+ 2 bytes
    add.d      a0,   a0,   a1
    addi.d     a4,   a4,   -1
    bnez       a4,   .LOOP_V6
endfunc
|
|
|
// transpose 8x8b to 4x16b |
|
// transpose 8x8b to 4x16b
/* Transposes eight 8-byte rows (\in0..\in7, low halves) into four
 * vectors \out0..\out3, each holding two interleaved 8-tap column
 * windows.  \in4..\in7 are used as scratch and clobbered.
 */
.macro TRANSPOSE8X8B_LSX in0, in1, in2, in3, in4, in5, in6, in7, \
                         out0, out1, out2, out3
    vilvl.b    \in0,  \in1, \in0
    vilvl.b    \in1,  \in3, \in2
    vilvl.b    \in2,  \in5, \in4
    vilvl.b    \in3,  \in7, \in6
    vilvl.h    \in4,  \in1, \in0
    vilvh.h    \in5,  \in1, \in0
    vilvl.h    \in6,  \in3, \in2
    vilvh.h    \in7,  \in3, \in2
    vilvl.w    \out0, \in6, \in4
    vilvh.w    \out1, \in6, \in4
    vilvl.w    \out2, \in7, \in5
    vilvh.w    \out3, \in7, \in5
.endm
|
|
|
/* Filters 8 columns for one output row, LSX.
 * \in0..\in3 hold transposed per-column tap windows (two columns each);
 * vr13 carries the newly loaded row, inserted at byte offset \pos
 * (0 for columns 0..7, 8 for columns 8..15 of a 16-wide block).
 * Results (weighted, shifted, + ox, still s32) land in \out0/\out1;
 * the windows are slid one byte so only the next row needs inserting.
 * Clobbers vr12, vr20.
 */
.macro PUT_HEVC_QPEL_UNI_W_V8_LSX in0, in1, in2, in3, out0, out1, pos
.if \pos == 0
    vextrins.b \in0, vr13, 0x70           //insert the 8th load
    vextrins.b \in0, vr13, 0xf1
    vextrins.b \in1, vr13, 0x72
    vextrins.b \in1, vr13, 0xf3
    vextrins.b \in2, vr13, 0x74
    vextrins.b \in2, vr13, 0xf5
    vextrins.b \in3, vr13, 0x76
    vextrins.b \in3, vr13, 0xf7
.else// \pos == 8
    vextrins.b \in0, vr13, 0x78
    vextrins.b \in0, vr13, 0xf9
    vextrins.b \in1, vr13, 0x7a
    vextrins.b \in1, vr13, 0xfb
    vextrins.b \in2, vr13, 0x7c
    vextrins.b \in2, vr13, 0xfd
    vextrins.b \in3, vr13, 0x7e
    vextrins.b \in3, vr13, 0xff
.endif
    vdp2.h.bu.b \out0, \in0, vr5          //QPEL_FILTER(src, stride)
    vdp2.h.bu.b \out1, \in1, vr5
    vdp2.h.bu.b vr12,  \in2, vr5
    vdp2.h.bu.b vr20,  \in3, vr5
    vbsrl.v    \in0, \in0, 1              //Back up previous 7 loaded datas,
    vbsrl.v    \in1, \in1, 1              //so just need to insert the 8th
    vbsrl.v    \in2, \in2, 1              //load in the next loop.
    vbsrl.v    \in3, \in3, 1
    vhaddw.d.h \out0
    vhaddw.d.h \out1
    vhaddw.d.h vr12
    vhaddw.d.h vr20
    vpickev.w  \out0, \out1, \out0
    vpickev.w  \out1, vr20,  vr12
    vmulwev.w.h \out0, \out0, vr1         //QPEL_FILTER(src, stride) * wx
    vmulwev.w.h \out1, \out1, vr1
    vadd.w     \out0, \out0, vr2          //+ offset
    vadd.w     \out1, \out1, vr2
    vsra.w     \out0, \out0, vr3          //>> shift
    vsra.w     \out1, \out1, vr3
    vadd.w     \out0, \out0, vr4          //+ ox
    vadd.w     \out1, \out1, vr4
.endm
|
|
|
/* qpel_uni_w vertical filter, width 8, 8-bit, LSX.
 * Loads and transposes the first 7 rows once, then produces one output
 * row per iteration via PUT_HEVC_QPEL_UNI_W_V8_LSX.
 */
function ff_hevc_put_hevc_qpel_uni_w_v8_8_lsx
    LOAD_VAR 128
    ld.d     t0,  sp,  8                  //my
    addi.d   t0,  t0,  -1
    slli.w   t0,  t0,  4
    la.local t1,  ff_hevc_qpel_filters
    vldx     vr5, t1,  t0                 //filter
    slli.d   t0,  a3,  1                  //stride * 2
    add.d    t1,  t0,  a3                 //stride * 3
    add.d    t2,  t1,  a3                 //stride * 4
    sub.d    a2,  a2,  t1                 //src -= stride*3
    fld.d    f6,  a2,  0                  //rows 0..6
    fldx.d   f7,  a2,  a3
    fldx.d   f8,  a2,  t0
    add.d    a2,  a2,  t1
    fld.d    f9,  a2,  0
    fldx.d   f10, a2,  a3
    fldx.d   f11, a2,  t0
    fldx.d   f12, a2,  t1
    add.d    a2,  a2,  t2
    TRANSPOSE8X8B_LSX vr6, vr7, vr8, vr9, vr10, vr11, vr12, vr13, \
                      vr6, vr7, vr8, vr9
.LOOP_V8:
    fld.d    f13, a2,  0                  //the 8th load
    add.d    a2,  a2,  a3
    PUT_HEVC_QPEL_UNI_W_V8_LSX vr6, vr7, vr8, vr9, vr10, vr11, 0
    vssrani.h.w  vr11, vr10, 0
    vssrani.bu.h vr11, vr11, 0
    fst.d    f11, a0,  0
    add.d    a0,  a0,  a1
    addi.d   a4,  a4,  -1
    bnez     a4,  .LOOP_V8
endfunc
|
|
|
/* Body of the width-8 vertical qpel_uni_w filter, LASX.
 * Expects the same register setup as the LSX version (a0..a4, t0..t2,
 * filter broadcast in xr5, LOAD_VAR 256 done).  \w only disambiguates
 * the loop label so the macro can be instantiated more than once.
 */
.macro PUT_HEVC_UNI_W_V8_LASX w
    fld.d      f6,   a2,   0              //rows 0..6
    fldx.d     f7,   a2,   a3
    fldx.d     f8,   a2,   t0
    add.d      a2,   a2,   t1
    fld.d      f9,   a2,   0
    fldx.d     f10,  a2,   a3
    fldx.d     f11,  a2,   t0
    fldx.d     f12,  a2,   t1
    add.d      a2,   a2,   t2
    TRANSPOSE8X8B_LSX vr6, vr7, vr8, vr9, vr10, vr11, vr12, vr13, \
                      vr6, vr7, vr8, vr9
    xvpermi.q  xr6,  xr7,  0x02           //pack 4 LSX column vectors into 2 LASX
    xvpermi.q  xr8,  xr9,  0x02
.LOOP_V8_LASX_\w:
    fld.d      f13,  a2,   0              // 0 1 2 3 4 5 6 7 the 8th load
    add.d      a2,   a2,   a3
    vshuf4i.h  vr13, vr13, 0xd8
    vbsrl.v    vr14, vr13, 4
    xvpermi.q  xr13, xr14, 0x02           //0 1 4 5 * * * * 2 3 6 7 * * * *
    xvextrins.b xr6, xr13, 0x70           //begin to insert the 8th load
    xvextrins.b xr6, xr13, 0xf1
    xvextrins.b xr8, xr13, 0x72
    xvextrins.b xr8, xr13, 0xf3
    xvdp2.h.bu.b xr20, xr6, xr5           //QPEL_FILTER(src, stride)
    xvdp2.h.bu.b xr21, xr8, xr5
    xvbsrl.v   xr6,  xr6,  1              //slide windows for next row
    xvbsrl.v   xr8,  xr8,  1
    xvhaddw.d.h xr20
    xvhaddw.d.h xr21
    xvpickev.w xr20, xr21, xr20
    xvpermi.d  xr20, xr20, 0xd8
    xvmulwev.w.h xr20, xr20, xr1          //QPEL_FILTER(src, stride) * wx
    xvadd.w    xr20, xr20, xr2            //+ offset
    xvsra.w    xr20, xr20, xr3            //>> shift
    xvadd.w    xr10, xr20, xr4            //+ ox
    xvpermi.q  xr11, xr10, 0x01
    vssrani.h.w  vr11, vr10, 0
    vssrani.bu.h vr11, vr11, 0
    fst.d      f11,  a0,   0
    add.d      a0,   a0,   a1
    addi.d     a4,   a4,   -1
    bnez       a4,   .LOOP_V8_LASX_\w
.endm
|
|
|
/* qpel_uni_w vertical filter, width 8, 8-bit, LASX entry point. */
function ff_hevc_put_hevc_qpel_uni_w_v8_8_lasx
    LOAD_VAR 256
    ld.d       t0,  sp,  8                //my
    addi.d     t0,  t0,  -1
    slli.w     t0,  t0,  4
    la.local   t1,  ff_hevc_qpel_filters
    vldx       vr5, t1,  t0               //filter
    xvreplve0.q xr5, xr5                  //duplicate filter into both 128-bit lanes
    slli.d     t0,  a3,  1                //stride * 2
    add.d      t1,  t0,  a3               //stride * 3
    add.d      t2,  t1,  a3               //stride * 4
    sub.d      a2,  a2,  t1               //src -= stride*3
    PUT_HEVC_UNI_W_V8_LASX 8
endfunc
|
|
|
/* Body of the vertical qpel_uni_w filter for up to 16 columns, LSX.
 * \w selects how many columns are computed/stored per row:
 *   8  -> low 8 columns only; 12 -> 8 + 4 bytes; else full 16 bytes.
 * For \w > 8 the high 8 bytes of each loaded row are split into a
 * second transposed set (vr14..vr17) filtered with pos = 8.
 * Expects a0..a4, t0..t2 and vr1..vr5 prepared by the caller.
 */
.macro PUT_HEVC_QPEL_UNI_W_V16_LSX w
    vld      vr6,  a2,  0                 //rows 0..6
    vldx     vr7,  a2,  a3
    vldx     vr8,  a2,  t0
    add.d    a2,   a2,  t1
    vld      vr9,  a2,  0
    vldx     vr10, a2,  a3
    vldx     vr11, a2,  t0
    vldx     vr12, a2,  t1
    add.d    a2,   a2,  t2
.if \w > 8
    vilvh.d  vr14, vr14, vr6              //copy high 8 bytes of each row
    vilvh.d  vr15, vr15, vr7
    vilvh.d  vr16, vr16, vr8
    vilvh.d  vr17, vr17, vr9
    vilvh.d  vr18, vr18, vr10
    vilvh.d  vr19, vr19, vr11
    vilvh.d  vr20, vr20, vr12
.endif
    TRANSPOSE8X8B_LSX vr6, vr7, vr8, vr9, vr10, vr11, vr12, vr13, \
                      vr6, vr7, vr8, vr9
.if \w > 8
    TRANSPOSE8X8B_LSX vr14, vr15, vr16, vr17, vr18, vr19, vr20, vr21, \
                      vr14, vr15, vr16, vr17
.endif
.LOOP_HORI_16_\w:
    vld      vr13, a2,  0                 //newest row (16 bytes)
    add.d    a2,   a2,  a3
    PUT_HEVC_QPEL_UNI_W_V8_LSX vr6, vr7, vr8, vr9, vr10, vr11, 0
.if \w > 8
    PUT_HEVC_QPEL_UNI_W_V8_LSX vr14, vr15, vr16, vr17, vr18, vr19, 8
.endif
    vssrani.h.w  vr11, vr10, 0
.if \w > 8
    vssrani.h.w  vr19, vr18, 0
    vssrani.bu.h vr19, vr11, 0
.else
    vssrani.bu.h vr11, vr11, 0
.endif
.if \w == 8
    fst.d    f11,  a0,  0
.elseif \w == 12
    fst.d    f19,  a0,  0
    vstelm.w vr19, a0,  8, 2
.else
    vst      vr19, a0,  0
.endif
    add.d    a0,   a0,  a1
    addi.d   a4,   a4,  -1
    bnez     a4,   .LOOP_HORI_16_\w
.endm
|
|
|
/* qpel_uni_w vertical filter, width 16, 8-bit, LSX entry point. */
function ff_hevc_put_hevc_qpel_uni_w_v16_8_lsx
    LOAD_VAR 128
    ld.d     t0,  sp,  8                  //my
    addi.d   t0,  t0,  -1
    slli.w   t0,  t0,  4
    la.local t1,  ff_hevc_qpel_filters
    vldx     vr5, t1,  t0                 //filter
    slli.d   t0,  a3,  1                  //stride * 2
    add.d    t1,  t0,  a3                 //stride * 3
    add.d    t2,  t1,  a3                 //stride * 4
    sub.d    a2,  a2,  t1                 //src -= stride*3
    PUT_HEVC_QPEL_UNI_W_V16_LSX 16
endfunc
|
|
|
/* Body of the vertical qpel_uni_w filter for 16 columns, LASX.
 * \w == 12 stores 8 + 4 bytes per row, otherwise 16 bytes; it also
 * disambiguates the loop label.  The 7 initial 16-byte rows are packed
 * and transposed into xr14..xr17 (4 column windows); each iteration
 * inserts the newest row, runs the 8-tap dot products, applies
 * wx/offset/shift/ox and stores one row.
 */
.macro PUT_HEVC_QPEL_UNI_W_V16_LASX w
    vld          vr6,  a2,  0             //rows 0..6
    vldx         vr7,  a2,  a3
    vldx         vr8,  a2,  t0
    add.d        a2,   a2,  t1
    vld          vr9,  a2,  0
    vldx         vr10, a2,  a3
    vldx         vr11, a2,  t0
    vldx         vr12, a2,  t1
    add.d        a2,   a2,  t2
    xvpermi.q    xr6,  xr10, 0x02         //pack and transpose the 8x16 to 4x32 begin
    xvpermi.q    xr7,  xr11, 0x02
    xvpermi.q    xr8,  xr12, 0x02
    xvpermi.q    xr9,  xr13, 0x02
    xvilvl.b     xr14, xr7,  xr6          //0 2
    xvilvh.b     xr15, xr7,  xr6          //1 3
    xvilvl.b     xr16, xr9,  xr8          //0 2
    xvilvh.b     xr17, xr9,  xr8          //1 3
    xvpermi.d    xr14, xr14, 0xd8
    xvpermi.d    xr15, xr15, 0xd8
    xvpermi.d    xr16, xr16, 0xd8
    xvpermi.d    xr17, xr17, 0xd8
    xvilvl.h     xr6,  xr16, xr14
    xvilvh.h     xr7,  xr16, xr14
    xvilvl.h     xr8,  xr17, xr15
    xvilvh.h     xr9,  xr17, xr15
    xvilvl.w     xr14, xr7,  xr6          //0 1 4 5
    xvilvh.w     xr15, xr7,  xr6          //2 3 6 7
    xvilvl.w     xr16, xr9,  xr8          //8 9 12 13
    xvilvh.w     xr17, xr9,  xr8          //10 11 14 15 end
.LOOP_HORI_16_LASX_\w:
    vld          vr13, a2,  0             //the 8th load
    add.d        a2,   a2,  a3
    vshuf4i.w    vr13, vr13, 0xd8
    vbsrl.v      vr12, vr13, 8
    xvpermi.q    xr13, xr12, 0x02
    xvextrins.b  xr14, xr13, 0x70         //inset the 8th load
    xvextrins.b  xr14, xr13, 0xf1
    xvextrins.b  xr15, xr13, 0x72
    xvextrins.b  xr15, xr13, 0xf3
    xvextrins.b  xr16, xr13, 0x74
    xvextrins.b  xr16, xr13, 0xf5
    xvextrins.b  xr17, xr13, 0x76
    xvextrins.b  xr17, xr13, 0xf7
    xvdp2.h.bu.b xr6,  xr14, xr5          //QPEL_FILTER(src, stride)
    xvdp2.h.bu.b xr7,  xr15, xr5
    xvdp2.h.bu.b xr8,  xr16, xr5
    xvdp2.h.bu.b xr9,  xr17, xr5
    xvhaddw.d.h  xr6
    xvhaddw.d.h  xr7
    xvhaddw.d.h  xr8
    xvhaddw.d.h  xr9
    xvbsrl.v     xr14, xr14, 1            //Back up previous 7 loaded datas,
    xvbsrl.v     xr15, xr15, 1            //so just need to insert the 8th
    xvbsrl.v     xr16, xr16, 1            //load in next loop.
    xvbsrl.v     xr17, xr17, 1
    xvpickev.w   xr6,  xr7,  xr6          //0 1 2 3 4 5 6 7
    xvpickev.w   xr7,  xr9,  xr8          //8 9 10 11 12 13 14 15
    xvmulwev.w.h xr6,  xr6,  xr1          //QPEL_FILTER(src, stride) * wx
    xvmulwev.w.h xr7,  xr7,  xr1
    xvadd.w      xr6,  xr6,  xr2          //+ offset
    xvadd.w      xr7,  xr7,  xr2
    xvsra.w      xr6,  xr6,  xr3          //>> shift
    xvsra.w      xr7,  xr7,  xr3
    xvadd.w      xr6,  xr6,  xr4          //+ ox
    xvadd.w      xr7,  xr7,  xr4
    xvssrani.h.w xr7,  xr6,  0            //0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15
    xvpermi.q    xr6,  xr7,  0x01
    vssrani.bu.h vr6,  vr7,  0
    vshuf4i.w    vr6,  vr6,  0xd8         //restore pixel order
.if \w == 12
    fst.d        f6,   a0,  0
    vstelm.w     vr6,  a0,  8, 2
.else
    vst          vr6,  a0,  0
.endif
    add.d        a0,   a0,  a1
    addi.d       a4,   a4,  -1
    bnez         a4,   .LOOP_HORI_16_LASX_\w
.endm
|
|
|
/* qpel_uni_w vertical filter, width 16, 8-bit, LASX entry point. */
function ff_hevc_put_hevc_qpel_uni_w_v16_8_lasx
    LOAD_VAR 256
    ld.d       t0,  sp,  8                //my
    addi.d     t0,  t0,  -1
    slli.w     t0,  t0,  4
    la.local   t1,  ff_hevc_qpel_filters
    vldx       vr5, t1,  t0               //filter
    xvreplve0.q xr5, xr5                  //broadcast filter to both lanes
    slli.d     t0,  a3,  1                //stride * 2
    add.d      t1,  t0,  a3               //stride * 3
    add.d      t2,  t1,  a3               //stride * 4
    sub.d      a2,  a2,  t1               //src -= stride*3
    PUT_HEVC_QPEL_UNI_W_V16_LASX 16
endfunc

/* qpel_uni_w vertical filter, width 12, 8-bit, LSX entry point. */
function ff_hevc_put_hevc_qpel_uni_w_v12_8_lsx
    LOAD_VAR 128
    ld.d       t0,  sp,  8                //my
    addi.d     t0,  t0,  -1
    slli.w     t0,  t0,  4
    la.local   t1,  ff_hevc_qpel_filters
    vldx       vr5, t1,  t0               //filter
    slli.d     t0,  a3,  1                //stride * 2
    add.d      t1,  t0,  a3               //stride * 3
    add.d      t2,  t1,  a3               //stride * 4
    sub.d      a2,  a2,  t1               //src -= stride*3
    PUT_HEVC_QPEL_UNI_W_V16_LSX 12
endfunc

/* qpel_uni_w vertical filter, width 12, 8-bit, LASX entry point. */
function ff_hevc_put_hevc_qpel_uni_w_v12_8_lasx
    LOAD_VAR 256
    ld.d       t0,  sp,  8                //my
    addi.d     t0,  t0,  -1
    slli.w     t0,  t0,  4
    la.local   t1,  ff_hevc_qpel_filters
    vldx       vr5, t1,  t0               //filter
    xvreplve0.q xr5, xr5
    slli.d     t0,  a3,  1                //stride * 2
    add.d      t1,  t0,  a3               //stride * 3
    add.d      t2,  t1,  a3               //stride * 4
    sub.d      a2,  a2,  t1               //src -= stride*3
    PUT_HEVC_QPEL_UNI_W_V16_LASX 12
endfunc
|
|
|
/* qpel_uni_w vertical filter, width 24, 8-bit, LSX:
 * columns 0..15 via the 16-wide body, then columns 16..23 via the
 * 8-wide body after restoring dst/src/height from t4/t5/t6.
 */
function ff_hevc_put_hevc_qpel_uni_w_v24_8_lsx
    LOAD_VAR 128
    ld.d     t0,  sp,  8                  //my
    addi.d   t0,  t0,  -1
    slli.w   t0,  t0,  4
    la.local t1,  ff_hevc_qpel_filters
    vldx     vr5, t1,  t0                 //filter
    slli.d   t0,  a3,  1                  //stride * 2
    add.d    t1,  t0,  a3                 //stride * 3
    add.d    t2,  t1,  a3                 //stride * 4
    sub.d    a2,  a2,  t1                 //src -= stride*3
    addi.d   t4,  a0,  0                  //save dst
    addi.d   t5,  a2,  0                  //save src
    addi.d   t6,  a4,  0                  //save height
    PUT_HEVC_QPEL_UNI_W_V16_LSX 24
    addi.d   a0,  t4,  16                 //second pass: columns 16..23
    addi.d   a2,  t5,  16
    addi.d   a4,  t6,  0
    PUT_HEVC_QPEL_UNI_W_V16_LSX 8
endfunc

/* qpel_uni_w vertical filter, width 24, 8-bit, LASX:
 * 16-wide LASX body for columns 0..15, LASX 8-wide body for 16..23.
 */
function ff_hevc_put_hevc_qpel_uni_w_v24_8_lasx
    LOAD_VAR 256
    ld.d       t0,  sp,  8                //my
    addi.d     t0,  t0,  -1
    slli.w     t0,  t0,  4
    la.local   t1,  ff_hevc_qpel_filters
    vldx       vr5, t1,  t0               //filter
    xvreplve0.q xr5, xr5
    slli.d     t0,  a3,  1                //stride * 2
    add.d      t1,  t0,  a3               //stride * 3
    add.d      t2,  t1,  a3               //stride * 4
    sub.d      a2,  a2,  t1               //src -= stride*3
    addi.d     t4,  a0,  0                //save dst
    addi.d     t5,  a2,  0                //save src
    addi.d     t6,  a4,  0                //save height
    PUT_HEVC_QPEL_UNI_W_V16_LASX 24
    addi.d     a0,  t4,  16               //second pass: columns 16..23
    addi.d     a2,  t5,  16
    addi.d     a4,  t6,  0
    PUT_HEVC_UNI_W_V8_LASX 24
endfunc
|
|
|
/* qpel_uni_w vertical filter, width 32, 8-bit, LSX: two 16-column
 * passes over the block (t3 = pass counter; dst/src advance 16 bytes
 * per pass via the saved t4/t5, height restored from t6).
 * NOTE(review): t4/t5 are not advanced inside the loop here (unlike the
 * 48/64-wide versions); with exactly 2 passes the fixed +16 offset
 * covers both strips.
 */
function ff_hevc_put_hevc_qpel_uni_w_v32_8_lsx
    LOAD_VAR 128
    ld.d     t0,  sp,  8                  //my
    addi.d   t0,  t0,  -1
    slli.w   t0,  t0,  4
    la.local t1,  ff_hevc_qpel_filters
    vldx     vr5, t1,  t0                 //filter
    slli.d   t0,  a3,  1                  //stride * 2
    add.d    t1,  t0,  a3                 //stride * 3
    add.d    t2,  t1,  a3                 //stride * 4
    sub.d    a2,  a2,  t1                 //src -= stride*3
    addi.d   t3,  zero, 2                 //two 16-column passes
    addi.d   t4,  a0,  0                  //save dst
    addi.d   t5,  a2,  0                  //save src
    addi.d   t6,  a4,  0                  //save height
.LOOP_V32:
    PUT_HEVC_QPEL_UNI_W_V16_LSX 32
    addi.d   t3,  t3,  -1
    addi.d   a0,  t4,  16
    addi.d   a2,  t5,  16
    addi.d   a4,  t6,  0
    bnez     t3,  .LOOP_V32
endfunc

/* qpel_uni_w vertical filter, width 32, 8-bit, LASX: same two-pass
 * structure using the LASX 16-wide body.
 */
function ff_hevc_put_hevc_qpel_uni_w_v32_8_lasx
    LOAD_VAR 256
    ld.d       t0,  sp,  8                //my
    addi.d     t0,  t0,  -1
    slli.w     t0,  t0,  4
    la.local   t1,  ff_hevc_qpel_filters
    vldx       vr5, t1,  t0               //filter
    xvreplve0.q xr5, xr5
    slli.d     t0,  a3,  1                //stride * 2
    add.d      t1,  t0,  a3               //stride * 3
    add.d      t2,  t1,  a3               //stride * 4
    sub.d      a2,  a2,  t1               //src -= stride*3
    addi.d     t3,  zero, 2
    addi.d     t4,  a0,  0                //save dst
    addi.d     t5,  a2,  0                //save src
    addi.d     t6,  a4,  0                //save height
.LOOP_V32_LASX:
    PUT_HEVC_QPEL_UNI_W_V16_LASX 32
    addi.d     t3,  t3,  -1
    addi.d     a0,  t4,  16
    addi.d     a2,  t5,  16
    addi.d     a4,  t6,  0
    bnez       t3,  .LOOP_V32_LASX
endfunc
|
|
|
function ff_hevc_put_hevc_qpel_uni_w_v48_8_lsx |
|
LOAD_VAR 128 |
|
ld.d t0, sp, 8 //my |
|
addi.d t0, t0, -1 |
|
slli.w t0, t0, 4 |
|
la.local t1, ff_hevc_qpel_filters |
|
vldx vr5, t1, t0 //filter |
|
slli.d t0, a3, 1 //stride * 2 |
|
add.d t1, t0, a3 //stride * 3 |
|
add.d t2, t1, a3 //stride * 4 |
|
sub.d a2, a2, t1 //src -= stride*3 |
|
addi.d t3, zero, 3 |
|
addi.d t4, a0, 0 //save dst |
|
addi.d t5, a2, 0 //save src |
|
addi.d t6, a4, 0 |
|
.LOOP_V48: |
|
PUT_HEVC_QPEL_UNI_W_V16_LSX 48 |
|
addi.d t3, t3, -1 |
|
addi.d a0, t4, 16 |
|
addi.d t4, t4, 16 |
|
addi.d a2, t5, 16 |
|
addi.d t5, t5, 16 |
|
addi.d a4, t6, 0 |
|
bnez t3, .LOOP_V48 |
|
endfunc |
|
|
|
function ff_hevc_put_hevc_qpel_uni_w_v48_8_lasx |
|
LOAD_VAR 256 |
|
ld.d t0, sp, 8 //my |
|
addi.d t0, t0, -1 |
|
slli.w t0, t0, 4 |
|
la.local t1, ff_hevc_qpel_filters |
|
vldx vr5, t1, t0 //filter |
|
xvreplve0.q xr5, xr5 |
|
slli.d t0, a3, 1 //stride * 2 |
|
add.d t1, t0, a3 //stride * 3 |
|
add.d t2, t1, a3 //stride * 4 |
|
sub.d a2, a2, t1 //src -= stride*3 |
|
addi.d t3, zero, 3 |
|
addi.d t4, a0, 0 //save dst |
|
addi.d t5, a2, 0 //save src |
|
addi.d t6, a4, 0 |
|
.LOOP_V48_LASX: |
|
PUT_HEVC_QPEL_UNI_W_V16_LASX 48 |
|
addi.d t3, t3, -1 |
|
addi.d a0, t4, 16 |
|
addi.d t4, t4, 16 |
|
addi.d a2, t5, 16 |
|
addi.d t5, t5, 16 |
|
addi.d a4, t6, 0 |
|
bnez t3, .LOOP_V48_LASX |
|
endfunc |
|
|
|
function ff_hevc_put_hevc_qpel_uni_w_v64_8_lsx |
|
LOAD_VAR 128 |
|
ld.d t0, sp, 8 //my |
|
addi.d t0, t0, -1 |
|
slli.w t0, t0, 4 |
|
la.local t1, ff_hevc_qpel_filters |
|
vldx vr5, t1, t0 //filter |
|
slli.d t0, a3, 1 //stride * 2 |
|
add.d t1, t0, a3 //stride * 3 |
|
add.d t2, t1, a3 //stride * 4 |
|
sub.d a2, a2, t1 //src -= stride*3 |
|
addi.d t3, zero, 4 |
|
addi.d t4, a0, 0 //save dst |
|
addi.d t5, a2, 0 //save src |
|
addi.d t6, a4, 0 |
|
.LOOP_V64: |
|
PUT_HEVC_QPEL_UNI_W_V16_LSX 64 |
|
addi.d t3, t3, -1 |
|
addi.d a0, t4, 16 |
|
addi.d t4, t4, 16 |
|
addi.d a2, t5, 16 |
|
addi.d t5, t5, 16 |
|
addi.d a4, t6, 0 |
|
bnez t3, .LOOP_V64 |
|
endfunc |
|
|
|
/* qpel_uni_w_v64 (LASX): 64-wide vertical 8-tap weighted interpolation.
 * Same contract as the LSX variant; 4 columns of 16 pixels. */
function ff_hevc_put_hevc_qpel_uni_w_v64_8_lasx
    LOAD_VAR 256                   //wx/offset/shift/ox -> xr1..xr4
    ld.d t0, sp, 8                 //my
    addi.d t0, t0, -1
    slli.w t0, t0, 4               //(my-1) * 16: one filter row is 16 bytes
    la.local t1, ff_hevc_qpel_filters
    vldx vr5, t1, t0               //filter
    xvreplve0.q xr5, xr5           //broadcast filter to both 128-bit lanes
    slli.d t0, a3, 1               //stride * 2
    add.d t1, t0, a3               //stride * 3
    add.d t2, t1, a3               //stride * 4
    sub.d a2, a2, t1               //src -= stride*3 (3 context rows above)
    addi.d t3, zero, 4             //column counter: 4 x 16 = 64
    addi.d t4, a0, 0               //save dst
    addi.d t5, a2, 0               //save src
    addi.d t6, a4, 0               //save height
.LOOP_V64_LASX:
    PUT_HEVC_QPEL_UNI_W_V16_LASX 64
    addi.d t3, t3, -1
    addi.d a0, t4, 16              //step to next 16-pixel column
    addi.d t4, t4, 16
    addi.d a2, t5, 16
    addi.d t5, t5, 16
    addi.d a4, t6, 0               //restore height for the new column
    bnez t3, .LOOP_V64_LASX
endfunc
|
|
|
/* Horizontal 8-tap weighted filter for 8 adjacent output pixels (LSX).
 * \in0  : src bytes [0..15] (needs 15 valid bytes for 8 outputs)
 * vr5   : 8 signed filter taps (bytes), vr1..vr4 = wx/offset/shift/ox
 * \out0 : results 0..3 as 32-bit words, \out1 : results 4..7
 * Results are scaled/weighted but NOT yet clipped/narrowed — callers
 * pack with vssrani.  Clobbers vr6-vr13. */
.macro PUT_HEVC_QPEL_UNI_W_H8_LSX in0, out0, out1
    vbsrl.v vr7, \in0, 1           //8-byte windows at offsets 1..7
    vbsrl.v vr8, \in0, 2
    vbsrl.v vr9, \in0, 3
    vbsrl.v vr10, \in0, 4
    vbsrl.v vr11, \in0, 5
    vbsrl.v vr12, \in0, 6
    vbsrl.v vr13, \in0, 7
    vilvl.d vr6, vr7, \in0         //pair windows: {0,1} {2,3} {4,5} {6,7}
    vilvl.d vr7, vr9, vr8
    vilvl.d vr8, vr11, vr10
    vilvl.d vr9, vr13, vr12
    vdp2.h.bu.b vr10, vr6, vr5     //QPEL_FILTER(src, 1): u8 * s8 dot products
    vdp2.h.bu.b vr11, vr7, vr5
    vdp2.h.bu.b vr12, vr8, vr5
    vdp2.h.bu.b vr13, vr9, vr5
    vhaddw.d.h vr10                //single-operand helper macro (loongson_asm.S):
    vhaddw.d.h vr11                //reduces the 4 h-lane partial sums of each
    vhaddw.d.h vr12                //window to one 64-bit sum — TODO confirm
    vhaddw.d.h vr13
    vpickev.w vr10, vr11, vr10     //gather the 4+4 sums as 32-bit words
    vpickev.w vr11, vr13, vr12
    vmulwev.w.h vr10, vr10, vr1    //* wx
    vmulwev.w.h vr11, vr11, vr1
    vadd.w vr10, vr10, vr2         //+ offset
    vadd.w vr11, vr11, vr2
    vsra.w vr10, vr10, vr3         //>> shift
    vsra.w vr11, vr11, vr3
    vadd.w \out0, vr10, vr4        //+ ox
    vadd.w \out1, vr11, vr4
.endm
|
|
|
/* Horizontal 8-tap weighted filter for 8 pixels (LASX, one 256-bit pass).
 * \in0 : src bytes in the low 128-bit lane; xr5 = taps in both lanes;
 * xr1..xr4 = wx/offset/shift/ox.  \out0 : eight 32-bit weighted results
 * (unclipped), ordered across the two lanes.  Clobbers xr7-xr11. */
.macro PUT_HEVC_QPEL_UNI_W_H8_LASX in0, out0
    xvbsrl.v xr7, \in0, 4
    xvpermi.q xr7, \in0, 0x20      //lane0 = src[0..], lane1 = src[4..]
    xvbsrl.v xr8, xr7, 1
    xvbsrl.v xr9, xr7, 2
    xvbsrl.v xr10, xr7, 3
    xvpackev.d xr7, xr8, xr7       //pair windows {0,1}/{4,5} and {2,3}/{6,7}
    xvpackev.d xr8, xr10, xr9
    xvdp2.h.bu.b xr10, xr7, xr5    //QPEL_FILTER(src, 1)
    xvdp2.h.bu.b xr11, xr8, xr5
    xvhaddw.d.h xr10               //helper macro: per-window 64-bit sums
    xvhaddw.d.h xr11
    xvpickev.w xr10, xr11, xr10
    xvmulwev.w.h xr10, xr10, xr1   //* wx
    xvadd.w xr10, xr10, xr2        //+ offset
    xvsra.w xr10, xr10, xr3        //>> shift
    xvadd.w \out0, xr10, xr4       //+ ox
.endm
|
|
|
/* Horizontal 8-tap weighted filter for 16 pixels (LASX).
 * \in0 : src bytes [0..23] (23 valid bytes needed); xr5 = taps in both
 * lanes; xr1..xr4 = wx/offset/shift/ox.
 * \out0 : 16 clipped u8 results in the LOW 128 bits (packed at the end
 * with xvssrani.bu.h).  Clobbers xr6-xr13. */
.macro PUT_HEVC_QPEL_UNI_W_H16_LASX in0, out0
    xvpermi.d xr6, \in0, 0x94      //lane0 = src[0..15], lane1 = src[8..23]
    xvbsrl.v xr7, xr6, 1           //windows at offsets 1..7 in each lane
    xvbsrl.v xr8, xr6, 2
    xvbsrl.v xr9, xr6, 3
    xvbsrl.v xr10, xr6, 4
    xvbsrl.v xr11, xr6, 5
    xvbsrl.v xr12, xr6, 6
    xvbsrl.v xr13, xr6, 7
    xvpackev.d xr6, xr7, xr6       //pair windows per lane
    xvpackev.d xr7, xr9, xr8
    xvpackev.d xr8, xr11, xr10
    xvpackev.d xr9, xr13, xr12
    xvdp2.h.bu.b xr10, xr6, xr5    //QPEL_FILTER(src, 1)
    xvdp2.h.bu.b xr11, xr7, xr5
    xvdp2.h.bu.b xr12, xr8, xr5
    xvdp2.h.bu.b xr13, xr9, xr5
    xvhaddw.d.h xr10               //helper macro: per-window 64-bit sums
    xvhaddw.d.h xr11
    xvhaddw.d.h xr12
    xvhaddw.d.h xr13
    xvpickev.w xr10, xr11, xr10
    xvpickev.w xr11, xr13, xr12
    xvmulwev.w.h xr10, xr10, xr1   //* wx
    xvmulwev.w.h xr11, xr11, xr1
    xvadd.w xr10, xr10, xr2        //+ offset
    xvadd.w xr11, xr11, xr2
    xvsra.w xr10, xr10, xr3        //>> shift
    xvsra.w xr11, xr11, xr3
    xvadd.w xr10, xr10, xr4        //+ ox
    xvadd.w xr11, xr11, xr4
    xvssrani.h.w xr11, xr10, 0     //saturate-narrow w -> h
    xvpermi.q \out0, xr11, 0x01    //bring high lane down, then
    xvssrani.bu.h \out0, xr11, 0   //saturate-narrow h -> u8 (16 results, low 128)
.endm
|
|
|
/* |
|
* void FUNC(put_hevc_qpel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, |
|
* const uint8_t *_src, ptrdiff_t _srcstride, |
|
* int height, int denom, int wx, int ox, |
|
* intptr_t mx, intptr_t my, int width) |
|
*/ |
|
/* qpel_uni_w_h4 (LSX): 4-wide horizontal 8-tap weighted interpolation.
 * a0=dst, a1=dststride, a2=src, a3=srcstride, a4=height (even),
 * a5=denom, a6=wx, a7=ox, sp[0]=mx.  Two rows per iteration. */
function ff_hevc_put_hevc_qpel_uni_w_h4_8_lsx
    LOAD_VAR 128                   //wx/offset/shift/ox -> vr1..vr4
    ld.d t0, sp, 0                 //mx
    addi.d t0, t0, -1
    slli.w t0, t0, 4               //(mx-1) * 16: one filter row is 16 bytes
    la.local t1, ff_hevc_qpel_filters
    vldx vr5, t1, t0               //filter
    addi.d a2, a2, -3              //src -= 3
.LOOP_H4:
    vld vr18, a2, 0                //row 0
    vldx vr19, a2, a3              //row 1
    alsl.d a2, a3, a2, 1           //src += 2*stride
    vbsrl.v vr6, vr18, 1           //windows at offsets 1..3, both rows
    vbsrl.v vr7, vr18, 2
    vbsrl.v vr8, vr18, 3
    vbsrl.v vr9, vr19, 1
    vbsrl.v vr10, vr19, 2
    vbsrl.v vr11, vr19, 3
    vilvl.d vr6, vr6, vr18         //pair windows {0,1} {2,3} per row
    vilvl.d vr7, vr8, vr7
    vilvl.d vr8, vr9, vr19
    vilvl.d vr9, vr11, vr10
    vdp2.h.bu.b vr10, vr6, vr5     //QPEL_FILTER(src, 1)
    vdp2.h.bu.b vr11, vr7, vr5
    vdp2.h.bu.b vr12, vr8, vr5
    vdp2.h.bu.b vr13, vr9, vr5
    vhaddw.d.h vr10                //helper macro: per-window 64-bit sums
    vhaddw.d.h vr11
    vhaddw.d.h vr12
    vhaddw.d.h vr13
    vpickev.w vr10, vr11, vr10     //row0 results as 4 words
    vpickev.w vr11, vr13, vr12     //row1 results
    vmulwev.w.h vr10, vr10, vr1    //* wx
    vmulwev.w.h vr11, vr11, vr1
    vadd.w vr10, vr10, vr2         //+ offset
    vadd.w vr11, vr11, vr2
    vsra.w vr10, vr10, vr3         //>> shift
    vsra.w vr11, vr11, vr3
    vadd.w vr10, vr10, vr4         //+ ox
    vadd.w vr11, vr11, vr4
    vssrani.h.w vr11, vr10, 0      //narrow + clip to u8
    vssrani.bu.h vr11, vr11, 0
    fst.s f11, a0, 0               //store 4 bytes, row 0
    vbsrl.v vr11, vr11, 4
    fstx.s f11, a0, a1             //store 4 bytes, row 1
    alsl.d a0, a1, a0, 1           //dst += 2*dststride
    addi.d a4, a4, -2
    bnez a4, .LOOP_H4
endfunc
|
|
|
/* qpel_uni_w_h4 (LASX): 4-wide horizontal 8-tap weighted interpolation,
 * two rows per iteration held in the two 128-bit lanes. */
function ff_hevc_put_hevc_qpel_uni_w_h4_8_lasx
    LOAD_VAR 256                   //wx/offset/shift/ox -> xr1..xr4
    ld.d t0, sp, 0                 //mx
    addi.d t0, t0, -1
    slli.w t0, t0, 4               //(mx-1) * 16: one filter row is 16 bytes
    la.local t1, ff_hevc_qpel_filters
    vldx vr5, t1, t0               //filter
    xvreplve0.q xr5, xr5           //broadcast filter to both lanes
    addi.d a2, a2, -3              //src -= 3
.LOOP_H4_LASX:
    vld vr18, a2, 0                //row 0
    vldx vr19, a2, a3              //row 1
    alsl.d a2, a3, a2, 1           //src += 2*stride
    xvpermi.q xr18, xr19, 0x02     //lane0 = row0, lane1 = row1
    xvbsrl.v xr6, xr18, 1          //windows at offsets 1..3
    xvbsrl.v xr7, xr18, 2
    xvbsrl.v xr8, xr18, 3
    xvpackev.d xr6, xr6, xr18      //pair windows {0,1} {2,3}
    xvpackev.d xr7, xr8, xr7
    xvdp2.h.bu.b xr10, xr6, xr5    //QPEL_FILTER(src, 1)
    xvdp2.h.bu.b xr11, xr7, xr5
    xvhaddw.d.h xr10               //helper macro: per-window 64-bit sums
    xvhaddw.d.h xr11
    xvpickev.w xr10, xr11, xr10
    xvmulwev.w.h xr10, xr10, xr1   //* wx
    xvadd.w xr10, xr10, xr2        //+ offset
    xvsra.w xr10, xr10, xr3        //>> shift
    xvadd.w xr10, xr10, xr4        //+ ox
    xvpermi.q xr11, xr10, 0x01     //high lane (row 1) -> vr11
    vssrani.h.w vr11, vr10, 0      //narrow + clip to u8
    vssrani.bu.h vr11, vr11, 0
    fst.s f11, a0, 0               //store 4 bytes, row 0
    vbsrl.v vr11, vr11, 4
    fstx.s f11, a0, a1             //store 4 bytes, row 1
    alsl.d a0, a1, a0, 1           //dst += 2*dststride
    addi.d a4, a4, -2
    bnez a4, .LOOP_H4_LASX
endfunc
|
|
|
/* qpel_uni_w_h6 (LSX): 6-wide horizontal 8-tap weighted interpolation.
 * Computes 8 results per row via PUT_HEVC_QPEL_UNI_W_H8_LSX, stores 4+2. */
function ff_hevc_put_hevc_qpel_uni_w_h6_8_lsx
    LOAD_VAR 128                   //wx/offset/shift/ox -> vr1..vr4
    ld.d t0, sp, 0                 //mx
    addi.d t0, t0, -1
    slli.w t0, t0, 4               //(mx-1) * 16: one filter row is 16 bytes
    la.local t1, ff_hevc_qpel_filters
    vldx vr5, t1, t0               //filter
    addi.d a2, a2, -3              //src -= 3
.LOOP_H6:
    vld vr6, a2, 0
    add.d a2, a2, a3
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr6, vr10, vr11
    vssrani.h.w vr11, vr10, 0      //narrow + clip to u8
    vssrani.bu.h vr11, vr11, 0
    fst.s f11, a0, 0               //bytes 0..3
    vstelm.h vr11, a0, 4, 2        //bytes 4..5
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .LOOP_H6
endfunc
|
|
|
/* qpel_uni_w_h6 (LASX): 6-wide horizontal 8-tap weighted interpolation. */
function ff_hevc_put_hevc_qpel_uni_w_h6_8_lasx
    LOAD_VAR 256                   //wx/offset/shift/ox -> xr1..xr4
    ld.d t0, sp, 0                 //mx
    addi.d t0, t0, -1
    slli.w t0, t0, 4               //(mx-1) * 16: one filter row is 16 bytes
    la.local t1, ff_hevc_qpel_filters
    vldx vr5, t1, t0               //filter
    xvreplve0.q xr5, xr5           //broadcast filter to both lanes
    addi.d a2, a2, -3              //src -= 3
.LOOP_H6_LASX:
    vld vr6, a2, 0
    add.d a2, a2, a3
    PUT_HEVC_QPEL_UNI_W_H8_LASX xr6, xr10
    xvpermi.q xr11, xr10, 0x01     //high-lane results -> vr11
    vssrani.h.w vr11, vr10, 0      //narrow + clip to u8
    vssrani.bu.h vr11, vr11, 0
    fst.s f11, a0, 0               //bytes 0..3
    vstelm.h vr11, a0, 4, 2        //bytes 4..5
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .LOOP_H6_LASX
endfunc
|
|
|
/* qpel_uni_w_h8 (LSX): 8-wide horizontal 8-tap weighted interpolation. */
function ff_hevc_put_hevc_qpel_uni_w_h8_8_lsx
    LOAD_VAR 128                   //wx/offset/shift/ox -> vr1..vr4
    ld.d t0, sp, 0                 //mx
    addi.d t0, t0, -1
    slli.w t0, t0, 4               //(mx-1) * 16: one filter row is 16 bytes
    la.local t1, ff_hevc_qpel_filters
    vldx vr5, t1, t0               //filter
    addi.d a2, a2, -3              //src -= 3
.LOOP_H8:
    vld vr6, a2, 0
    add.d a2, a2, a3
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr6, vr10, vr11
    vssrani.h.w vr11, vr10, 0      //narrow + clip to u8
    vssrani.bu.h vr11, vr11, 0
    fst.d f11, a0, 0               //store 8 bytes
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .LOOP_H8
endfunc
|
|
|
/* qpel_uni_w_h8 (LASX): 8-wide horizontal 8-tap weighted interpolation. */
function ff_hevc_put_hevc_qpel_uni_w_h8_8_lasx
    LOAD_VAR 256                   //wx/offset/shift/ox -> xr1..xr4
    ld.d t0, sp, 0                 //mx
    addi.d t0, t0, -1
    slli.w t0, t0, 4               //(mx-1) * 16: one filter row is 16 bytes
    la.local t1, ff_hevc_qpel_filters
    vldx vr5, t1, t0               //filter
    xvreplve0.q xr5, xr5           //broadcast filter to both lanes
    addi.d a2, a2, -3              //src -= 3
.LOOP_H8_LASX:
    vld vr6, a2, 0
    add.d a2, a2, a3
    PUT_HEVC_QPEL_UNI_W_H8_LASX xr6, xr10
    xvpermi.q xr11, xr10, 0x01     //high-lane results -> vr11
    vssrani.h.w vr11, vr10, 0      //narrow + clip to u8
    vssrani.bu.h vr11, vr11, 0
    fst.d f11, a0, 0               //store 8 bytes
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .LOOP_H8_LASX
endfunc
|
|
|
/* qpel_uni_w_h12 (LSX): 12-wide horizontal 8-tap weighted interpolation.
 * Two 8-pixel passes per row (offsets 0 and 8); stores 8 + 4 bytes. */
function ff_hevc_put_hevc_qpel_uni_w_h12_8_lsx
    LOAD_VAR 128                   //wx/offset/shift/ox -> vr1..vr4
    ld.d t0, sp, 0                 //mx
    addi.d t0, t0, -1
    slli.w t0, t0, 4               //(mx-1) * 16: one filter row is 16 bytes
    la.local t1, ff_hevc_qpel_filters
    vldx vr5, t1, t0               //filter
    addi.d a2, a2, -3              //src -= 3
.LOOP_H12:
    vld vr6, a2, 0                 //pixels 0..7
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr6, vr14, vr15
    vld vr6, a2, 8                 //pixels 8..11 (computes 8, 4 stored)
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr6, vr16, vr17
    add.d a2, a2, a3
    vssrani.h.w vr15, vr14, 0      //narrow + clip to u8
    vssrani.h.w vr17, vr16, 0
    vssrani.bu.h vr17, vr15, 0
    fst.d f17, a0, 0               //bytes 0..7
    vbsrl.v vr17, vr17, 8
    fst.s f17, a0, 8               //bytes 8..11
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .LOOP_H12
endfunc
|
|
|
/* qpel_uni_w_h12 (LASX): 12-wide horizontal 8-tap weighted interpolation.
 * One 16-pixel pass per row; stores 8 + 4 bytes. */
function ff_hevc_put_hevc_qpel_uni_w_h12_8_lasx
    LOAD_VAR 256                   //wx/offset/shift/ox -> xr1..xr4
    ld.d t0, sp, 0                 //mx
    addi.d t0, t0, -1
    slli.w t0, t0, 4               //(mx-1) * 16: one filter row is 16 bytes
    la.local t1, ff_hevc_qpel_filters
    vldx vr5, t1, t0               //filter
    xvreplve0.q xr5, xr5           //broadcast filter to both lanes
    addi.d a2, a2, -3              //src -= 3
.LOOP_H12_LASX:
    xvld xr6, a2, 0
    add.d a2, a2, a3
    PUT_HEVC_QPEL_UNI_W_H16_LASX xr6, xr14   //16 u8 results in vr14
    fst.d f14, a0, 0               //bytes 0..7
    vstelm.w vr14, a0, 8, 2        //bytes 8..11
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .LOOP_H12_LASX
endfunc
|
|
|
/* qpel_uni_w_h16 (LSX): 16-wide horizontal 8-tap weighted interpolation.
 * Two 8-pixel passes per row. */
function ff_hevc_put_hevc_qpel_uni_w_h16_8_lsx
    LOAD_VAR 128                   //wx/offset/shift/ox -> vr1..vr4
    ld.d t0, sp, 0                 //mx
    addi.d t0, t0, -1
    slli.w t0, t0, 4               //(mx-1) * 16: one filter row is 16 bytes
    la.local t1, ff_hevc_qpel_filters
    vldx vr5, t1, t0               //filter
    addi.d a2, a2, -3              //src -= 3
.LOOP_H16:
    vld vr6, a2, 0                 //pixels 0..7
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr6, vr14, vr15
    vld vr6, a2, 8                 //pixels 8..15
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr6, vr16, vr17
    add.d a2, a2, a3
    vssrani.h.w vr15, vr14, 0      //narrow + clip to u8
    vssrani.h.w vr17, vr16, 0
    vssrani.bu.h vr17, vr15, 0
    vst vr17, a0, 0                //store 16 bytes
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .LOOP_H16
endfunc
|
|
|
/* qpel_uni_w_h16 (LASX): 16-wide horizontal 8-tap weighted interpolation. */
function ff_hevc_put_hevc_qpel_uni_w_h16_8_lasx
    LOAD_VAR 256                   //wx/offset/shift/ox -> xr1..xr4
    ld.d t0, sp, 0                 //mx
    addi.d t0, t0, -1
    slli.w t0, t0, 4               //(mx-1) * 16: one filter row is 16 bytes
    la.local t1, ff_hevc_qpel_filters
    vldx vr5, t1, t0               //filter
    xvreplve0.q xr5, xr5           //broadcast filter to both lanes
    addi.d a2, a2, -3              //src -= 3
.LOOP_H16_LASX:
    xvld xr6, a2, 0
    add.d a2, a2, a3
    PUT_HEVC_QPEL_UNI_W_H16_LASX xr6, xr10   //16 u8 results in vr10
    vst vr10, a0, 0                //store 16 bytes
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .LOOP_H16_LASX
endfunc
|
|
|
/* qpel_uni_w_h24 (LSX): 24-wide horizontal 8-tap weighted interpolation.
 * Three 8-pixel passes per row; vshuf4i.d 0x09 splices the high half of
 * one load with the low half of the next to form the window at offset 8. */
function ff_hevc_put_hevc_qpel_uni_w_h24_8_lsx
    LOAD_VAR 128                   //wx/offset/shift/ox -> vr1..vr4
    ld.d t0, sp, 0                 //mx
    addi.d t0, t0, -1
    slli.w t0, t0, 4               //(mx-1) * 16: one filter row is 16 bytes
    la.local t1, ff_hevc_qpel_filters
    vldx vr5, t1, t0               //filter
    addi.d a2, a2, -3              //src -= 3
.LOOP_H24:
    vld vr18, a2, 0                //src[0..15]
    vld vr19, a2, 16               //src[16..31]
    add.d a2, a2, a3
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr18, vr14, vr15      //pixels 0..7
    vshuf4i.d vr18, vr19, 0x09     //vr18 = src[8..23]
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr18, vr16, vr17      //pixels 8..15
    vssrani.h.w vr15, vr14, 0      //narrow + clip to u8
    vssrani.h.w vr17, vr16, 0
    vssrani.bu.h vr17, vr15, 0
    vst vr17, a0, 0                //bytes 0..15
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr19, vr14, vr15      //pixels 16..23
    vssrani.h.w vr15, vr14, 0
    vssrani.bu.h vr15, vr15, 0
    fst.d f15, a0, 16              //bytes 16..23
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .LOOP_H24
endfunc
|
|
|
/* qpel_uni_w_h24 (LASX): 24-wide horizontal 8-tap weighted interpolation.
 * One 16-pixel pass + one 8-pixel pass per row. */
function ff_hevc_put_hevc_qpel_uni_w_h24_8_lasx
    LOAD_VAR 256                   //wx/offset/shift/ox -> xr1..xr4
    ld.d t0, sp, 0                 //mx
    addi.d t0, t0, -1
    slli.w t0, t0, 4               //(mx-1) * 16: one filter row is 16 bytes
    la.local t1, ff_hevc_qpel_filters
    vldx vr5, t1, t0               //filter
    xvreplve0.q xr5, xr5           //broadcast filter to both lanes
    addi.d a2, a2, -3              //src -= 3
.LOOP_H24_LASX:
    xvld xr18, a2, 0               //src[0..31]
    add.d a2, a2, a3
    PUT_HEVC_QPEL_UNI_W_H16_LASX xr18, xr20  //pixels 0..15
    xvpermi.q xr19, xr18, 0x01     //vr19 = src[16..31]
    vst vr20, a0, 0                //bytes 0..15
    PUT_HEVC_QPEL_UNI_W_H8_LASX xr19, xr20   //pixels 16..23
    xvpermi.q xr21, xr20, 0x01     //high-lane results -> vr21
    vssrani.h.w vr21, vr20, 0      //narrow + clip to u8
    vssrani.bu.h vr21, vr21, 0
    fst.d f21, a0, 16              //bytes 16..23
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .LOOP_H24_LASX
endfunc
|
|
|
/* qpel_uni_w_h32 (LSX): 32-wide horizontal 8-tap weighted interpolation.
 * Four 8-pixel passes per row, splicing loads with vshuf4i.d 0x09. */
function ff_hevc_put_hevc_qpel_uni_w_h32_8_lsx
    LOAD_VAR 128                   //wx/offset/shift/ox -> vr1..vr4
    ld.d t0, sp, 0                 //mx
    addi.d t0, t0, -1
    slli.w t0, t0, 4               //(mx-1) * 16: one filter row is 16 bytes
    la.local t1, ff_hevc_qpel_filters
    vldx vr5, t1, t0               //filter
    addi.d a2, a2, -3              //src -= 3
.LOOP_H32:
    vld vr18, a2, 0                //src[0..15]
    vld vr19, a2, 16               //src[16..31]
    vld vr20, a2, 32               //src[32..47] (tail context)
    add.d a2, a2, a3
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr18, vr14, vr15      //pixels 0..7
    vshuf4i.d vr18, vr19, 0x09     //vr18 = src[8..23]
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr18, vr16, vr17      //pixels 8..15
    vssrani.h.w vr15, vr14, 0      //narrow + clip to u8
    vssrani.h.w vr17, vr16, 0
    vssrani.bu.h vr17, vr15, 0
    vst vr17, a0, 0                //bytes 0..15
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr19, vr14, vr15      //pixels 16..23
    vshuf4i.d vr19, vr20, 0x09     //vr19 = src[24..39]
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr19, vr16, vr17      //pixels 24..31
    vssrani.h.w vr15, vr14, 0
    vssrani.h.w vr17, vr16, 0
    vssrani.bu.h vr17, vr15, 0
    vst vr17, a0, 16               //bytes 16..31
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .LOOP_H32
endfunc
|
|
|
/* qpel_uni_w_h32 (LASX): 32-wide horizontal 8-tap weighted interpolation.
 * Two 16-pixel passes per row, recombined into one 256-bit store. */
function ff_hevc_put_hevc_qpel_uni_w_h32_8_lasx
    LOAD_VAR 256                   //wx/offset/shift/ox -> xr1..xr4
    ld.d t0, sp, 0                 //mx
    addi.d t0, t0, -1
    slli.w t0, t0, 4               //(mx-1) * 16: one filter row is 16 bytes
    la.local t1, ff_hevc_qpel_filters
    vldx vr5, t1, t0               //filter
    xvreplve0.q xr5, xr5           //broadcast filter to both lanes
    addi.d a2, a2, -3              //src -= 3
.LOOP_H32_LASX:
    xvld xr18, a2, 0               //src[0..31]
    xvld xr19, a2, 16              //src[16..47]
    add.d a2, a2, a3
    PUT_HEVC_QPEL_UNI_W_H16_LASX xr18, xr20  //pixels 0..15
    PUT_HEVC_QPEL_UNI_W_H16_LASX xr19, xr21  //pixels 16..31
    xvpermi.q xr20, xr21, 0x02     //merge both 16-byte halves
    xvst xr20, a0, 0               //store 32 bytes
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .LOOP_H32_LASX
endfunc
|
|
|
/* qpel_uni_w_h48 (LSX): 48-wide horizontal 8-tap weighted interpolation.
 * Six 8-pixel passes per row, splicing loads with vshuf4i.d 0x09. */
function ff_hevc_put_hevc_qpel_uni_w_h48_8_lsx
    LOAD_VAR 128                   //wx/offset/shift/ox -> vr1..vr4
    ld.d t0, sp, 0                 //mx
    addi.d t0, t0, -1
    slli.w t0, t0, 4               //(mx-1) * 16: one filter row is 16 bytes
    la.local t1, ff_hevc_qpel_filters
    vldx vr5, t1, t0               //filter
    addi.d a2, a2, -3              //src -= 3
.LOOP_H48:
    vld vr18, a2, 0                //src[0..15]
    vld vr19, a2, 16               //src[16..31]
    vld vr20, a2, 32               //src[32..47]
    vld vr21, a2, 48               //src[48..63] (tail context)
    add.d a2, a2, a3
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr18, vr14, vr15      //pixels 0..7
    vshuf4i.d vr18, vr19, 0x09     //vr18 = src[8..23]
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr18, vr16, vr17      //pixels 8..15
    vssrani.h.w vr15, vr14, 0      //narrow + clip to u8
    vssrani.h.w vr17, vr16, 0
    vssrani.bu.h vr17, vr15, 0
    vst vr17, a0, 0                //bytes 0..15
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr19, vr14, vr15      //pixels 16..23
    vshuf4i.d vr19, vr20, 0x09     //vr19 = src[24..39]
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr19, vr16, vr17      //pixels 24..31
    vssrani.h.w vr15, vr14, 0
    vssrani.h.w vr17, vr16, 0
    vssrani.bu.h vr17, vr15, 0
    vst vr17, a0, 16               //bytes 16..31
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr20, vr14, vr15      //pixels 32..39
    vshuf4i.d vr20, vr21, 0x09     //vr20 = src[40..55]
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr20, vr16, vr17      //pixels 40..47
    vssrani.h.w vr15, vr14, 0
    vssrani.h.w vr17, vr16, 0
    vssrani.bu.h vr17, vr15, 0
    vst vr17, a0, 32               //bytes 32..47
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .LOOP_H48
endfunc
|
|
|
/* qpel_uni_w_h48 (LASX): 48-wide horizontal 8-tap weighted interpolation.
 * Three 16-pixel passes per row. */
function ff_hevc_put_hevc_qpel_uni_w_h48_8_lasx
    LOAD_VAR 256                   //wx/offset/shift/ox -> xr1..xr4
    ld.d t0, sp, 0                 //mx
    addi.d t0, t0, -1
    slli.w t0, t0, 4               //(mx-1) * 16: one filter row is 16 bytes
    la.local t1, ff_hevc_qpel_filters
    vldx vr5, t1, t0               //filter
    xvreplve0.q xr5, xr5           //broadcast filter to both lanes
    addi.d a2, a2, -3              //src -= 3
.LOOP_H48_LASX:
    xvld xr18, a2, 0               //src[0..31]
    xvld xr19, a2, 32              //src[32..63]
    add.d a2, a2, a3
    PUT_HEVC_QPEL_UNI_W_H16_LASX xr18, xr20  //pixels 0..15
    xvpermi.q xr18, xr19, 0x03     //xr18 = src[16..47]
    PUT_HEVC_QPEL_UNI_W_H16_LASX xr18, xr21  //pixels 16..31
    xvpermi.q xr20, xr21, 0x02     //merge halves for one 32-byte store
    xvst xr20, a0, 0               //bytes 0..31
    PUT_HEVC_QPEL_UNI_W_H16_LASX xr19, xr20  //pixels 32..47
    vst vr20, a0, 32               //bytes 32..47
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .LOOP_H48_LASX
endfunc
|
|
|
/* qpel_uni_w_h64 (LSX): 64-wide horizontal 8-tap weighted interpolation.
 * Eight 8-pixel passes per row, splicing loads with vshuf4i.d 0x09. */
function ff_hevc_put_hevc_qpel_uni_w_h64_8_lsx
    LOAD_VAR 128                   //wx/offset/shift/ox -> vr1..vr4
    ld.d t0, sp, 0                 //mx
    addi.d t0, t0, -1
    slli.w t0, t0, 4               //(mx-1) * 16: one filter row is 16 bytes
    la.local t1, ff_hevc_qpel_filters
    vldx vr5, t1, t0               //filter
    addi.d a2, a2, -3              //src -= 3
.LOOP_H64:
    vld vr18, a2, 0                //src[0..15]
    vld vr19, a2, 16               //src[16..31]
    vld vr20, a2, 32               //src[32..47]
    vld vr21, a2, 48               //src[48..63]
    vld vr22, a2, 64               //src[64..79] (tail context)
    add.d a2, a2, a3
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr18, vr14, vr15      //pixels 0..7
    vshuf4i.d vr18, vr19, 0x09     //vr18 = src[8..23]
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr18, vr16, vr17      //pixels 8..15
    vssrani.h.w vr15, vr14, 0      //narrow + clip to u8
    vssrani.h.w vr17, vr16, 0
    vssrani.bu.h vr17, vr15, 0
    vst vr17, a0, 0                //bytes 0..15
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr19, vr14, vr15      //pixels 16..23
    vshuf4i.d vr19, vr20, 0x09     //vr19 = src[24..39]
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr19, vr16, vr17      //pixels 24..31
    vssrani.h.w vr15, vr14, 0
    vssrani.h.w vr17, vr16, 0
    vssrani.bu.h vr17, vr15, 0
    vst vr17, a0, 16               //bytes 16..31
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr20, vr14, vr15      //pixels 32..39
    vshuf4i.d vr20, vr21, 0x09     //vr20 = src[40..55]
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr20, vr16, vr17      //pixels 40..47
    vssrani.h.w vr15, vr14, 0
    vssrani.h.w vr17, vr16, 0
    vssrani.bu.h vr17, vr15, 0
    vst vr17, a0, 32               //bytes 32..47
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr21, vr14, vr15      //pixels 48..55
    vshuf4i.d vr21, vr22, 0x09     //vr21 = src[56..71]
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr21, vr16, vr17      //pixels 56..63
    vssrani.h.w vr15, vr14, 0
    vssrani.h.w vr17, vr16, 0
    vssrani.bu.h vr17, vr15, 0
    vst vr17, a0, 48               //bytes 48..63
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .LOOP_H64
endfunc
|
|
|
/* qpel_uni_w_h64 (LASX): 64-wide horizontal 8-tap weighted interpolation.
 * Four 16-pixel passes per row, merged into two 32-byte stores. */
function ff_hevc_put_hevc_qpel_uni_w_h64_8_lasx
    LOAD_VAR 256                   //wx/offset/shift/ox -> xr1..xr4
    ld.d t0, sp, 0                 //mx
    addi.d t0, t0, -1
    slli.w t0, t0, 4               //(mx-1) * 16: one filter row is 16 bytes
    la.local t1, ff_hevc_qpel_filters
    vldx vr5, t1, t0               //filter
    xvreplve0.q xr5, xr5           //broadcast filter to both lanes
    addi.d a2, a2, -3              //src -= 3
.LOOP_H64_LASX:
    xvld xr18, a2, 0               //src[0..31]
    xvld xr19, a2, 32              //src[32..63]
    xvld xr20, a2, 64              //src[64..95] (tail context)
    add.d a2, a2, a3
    PUT_HEVC_QPEL_UNI_W_H16_LASX xr18, xr21  //pixels 0..15
    xvpermi.q xr18, xr19, 0x03     //xr18 = src[16..47]
    PUT_HEVC_QPEL_UNI_W_H16_LASX xr18, xr22  //pixels 16..31
    xvpermi.q xr21, xr22, 0x02     //merge halves
    xvst xr21, a0, 0               //bytes 0..31
    PUT_HEVC_QPEL_UNI_W_H16_LASX xr19, xr21  //pixels 32..47
    xvpermi.q xr19, xr20, 0x03     //xr19 = src[48..79]
    PUT_HEVC_QPEL_UNI_W_H16_LASX xr19, xr22  //pixels 48..63
    xvpermi.q xr21, xr22, 0x02     //merge halves
    xvst xr21, a0, 32              //bytes 32..63
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .LOOP_H64_LASX
endfunc
|
|
|
/* Byte-shuffle masks shared by the epel/qpel horizontal paths below:
 * each group builds the sliding 4-byte (epel) or pairwise (qpel) windows
 * consumed by vshuf.b/xvshuf.b before the vdp2 dot products. */
const shufb
.byte 0,1,2,3, 1,2,3,4 ,2,3,4,5, 3,4,5,6 //mask for epel_uni_w(128-bit)
.byte 4,5,6,7, 5,6,7,8 ,6,7,8,9, 7,8,9,10 //mask for epel_uni_w(256-bit)
.byte 0,1,2,3, 4,5,6,7 ,1,2,3,4, 5,6,7,8 //mask for qpel_uni_h4
.byte 0,1,1,2, 2,3,3,4 ,4,5,5,6, 6,7,7,8 //mask for qpel_uni_h/v6/8...
.byte 0,1,2,3, 1,2,3,4 ,2,3,4,5, 3,4,5,6, 4,5,6,7, 5,6,7,8, 6,7,8,9, 7,8,9,10 //epel_uni_w_h16/24/32/48/64
.byte 0,1,1,2, 2,3,3,4 ,4,5,5,6, 6,7,7,8, 0,1,1,2, 2,3,3,4 ,4,5,5,6, 6,7,7,8 //mask for bi_epel_h16/24/32/48/64
endconst
|
|
|
/* 4-wide epel HV weighted filter (LSX): 4-tap horizontal then 4-tap
 * vertical, with a 3-row software pipeline (vr10/vr11/vr12).
 * In: a0=dst a1=dststride a2=src(already offset) a3=srcstride a4=height;
 * vr0 = shufb window mask, vr5 = horizontal taps (replicated words),
 * vr16-vr19 = vertical taps as 32-bit words, vr1..vr4 = wx/offset/shift/ox.
 * \w only parameterizes the loop label so each instantiation is unique. */
.macro PUT_HEVC_EPEL_UNI_W_HV4_LSX w
    fld.d f7, a2, 0                // start to load src
    fldx.d f8, a2, a3
    alsl.d a2, a3, a2, 1
    fld.d f9, a2, 0
    vshuf.b vr7, vr7, vr7, vr0     // 0123 1234 2345 3456
    vshuf.b vr8, vr8, vr8, vr0
    vshuf.b vr9, vr9, vr9, vr0
    vdp2.h.bu.b vr10, vr7, vr5     // EPEL_FILTER(src, 1)
    vdp2.h.bu.b vr11, vr8, vr5
    vdp2.h.bu.b vr12, vr9, vr5
    vhaddw.w.h vr10, vr10, vr10    // tmp[0/1/2/3]
    vhaddw.w.h vr11, vr11, vr11    // vr10,vr11,vr12 corresponding to EPEL_EXTRA
    vhaddw.w.h vr12, vr12, vr12
.LOOP_HV4_\w:
    add.d a2, a2, a3
    fld.d f14, a2, 0               // height loop begin
    vshuf.b vr14, vr14, vr14, vr0
    vdp2.h.bu.b vr13, vr14, vr5    // horizontal pass for the new row
    vhaddw.w.h vr13, vr13, vr13
    vmul.w vr14, vr10, vr16        // EPEL_FILTER(tmp, MAX_PB_SIZE)
    vmadd.w vr14, vr11, vr17
    vmadd.w vr14, vr12, vr18
    vmadd.w vr14, vr13, vr19
    vaddi.wu vr10, vr11, 0         //back up previous value (rotate pipeline)
    vaddi.wu vr11, vr12, 0
    vaddi.wu vr12, vr13, 0
    vsrai.w vr14, vr14, 6          // >> 6
    vmul.w vr14, vr14, vr1         // * wx
    vadd.w vr14, vr14, vr2         // + offset
    vsra.w vr14, vr14, vr3         // >> shift
    vadd.w vr14, vr14, vr4         // + ox
    vssrani.h.w vr14, vr14, 0
    vssrani.bu.h vr14, vr14, 0     // clip
    fst.s f14, a0, 0               //store 4 bytes
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .LOOP_HV4_\w
.endm
|
|
|
/* |
|
* void FUNC(put_hevc_epel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, |
|
* const uint8_t *_src, ptrdiff_t _srcstride, |
|
* int height, int denom, int wx, int ox, |
|
* intptr_t mx, intptr_t my, int width) |
|
*/ |
|
/* epel_uni_w_hv4 (LSX): loads the horizontal (mx) and vertical (my)
 * 4-tap filters, positions src one row and one column back, then runs
 * the HV4 pipeline macro over the full height. */
function ff_hevc_put_hevc_epel_uni_w_hv4_8_lsx
    LOAD_VAR 128                   //wx/offset/shift/ox -> vr1..vr4
    ld.d t0, sp, 0                 // mx
    addi.d t0, t0, -1
    slli.w t0, t0, 2               //(mx-1) * 4: one epel filter row is 4 bytes
    la.local t1, ff_hevc_epel_filters
    vldx vr5, t1, t0               // ff_hevc_epel_filters[mx - 1];
    vreplvei.w vr5, vr5, 0         //broadcast the 4 horizontal taps
    ld.d t0, sp, 8                 // my
    addi.d t0, t0, -1
    slli.w t0, t0, 2
    vldx vr6, t1, t0               // ff_hevc_epel_filters[my - 1];
    vsllwil.h.b vr6, vr6, 0        //sign-extend vertical taps b -> h -> w
    vsllwil.w.h vr6, vr6, 0
    vreplvei.w vr16, vr6, 0        //one 32-bit tap per register
    vreplvei.w vr17, vr6, 1
    vreplvei.w vr18, vr6, 2
    vreplvei.w vr19, vr6, 3
    la.local t1, shufb
    vld vr0, t1, 0                 //sliding-window shuffle mask
    sub.d a2, a2, a3               // src -= srcstride
    addi.d a2, a2, -1              //and one column left (EPEL_EXTRA_BEFORE)
    PUT_HEVC_EPEL_UNI_W_HV4_LSX 4
endfunc
|
|
|
/* 6/8-wide epel HV weighted filter (LSX).
 * Horizontal 4-tap via two shuffle masks: vr0 covers output columns 0-3,
 * vr22 (= vr0 + 4) covers columns 4-7.  Vertical 4-tap runs over a 3-row
 * pipeline kept in vr7-vr9 (cols 0-3) and vr10-vr12 (cols 4-7).
 * \w selects the store width (8 bytes if \w > 6, else 4+2) and makes the
 * loop label unique.  Registers as in the HV4 macro. */
.macro PUT_HEVC_EPEL_UNI_W_HV8_LSX w
    vld vr7, a2, 0                 // start to load src
    vldx vr8, a2, a3
    alsl.d a2, a3, a2, 1
    vld vr9, a2, 0
    vshuf.b vr10, vr7, vr7, vr0    // 0123 1234 2345 3456
    vshuf.b vr11, vr8, vr8, vr0
    vshuf.b vr12, vr9, vr9, vr0
    vshuf.b vr7, vr7, vr7, vr22    // 4567 5678 6789 78910
    vshuf.b vr8, vr8, vr8, vr22
    vshuf.b vr9, vr9, vr9, vr22
    vdp2.h.bu.b vr13, vr10, vr5    // EPEL_FILTER(src, 1)
    vdp2.h.bu.b vr14, vr11, vr5
    vdp2.h.bu.b vr15, vr12, vr5
    vdp2.h.bu.b vr23, vr7, vr5
    vdp2.h.bu.b vr20, vr8, vr5
    vdp2.h.bu.b vr21, vr9, vr5
    vhaddw.w.h vr7, vr13, vr13     //rows 0..2, cols 0-3
    vhaddw.w.h vr8, vr14, vr14
    vhaddw.w.h vr9, vr15, vr15
    vhaddw.w.h vr10, vr23, vr23    //rows 0..2, cols 4-7
    vhaddw.w.h vr11, vr20, vr20
    vhaddw.w.h vr12, vr21, vr21
.LOOP_HV8_HORI_\w:
    add.d a2, a2, a3
    vld vr15, a2, 0                //new row
    vshuf.b vr23, vr15, vr15, vr0
    vshuf.b vr15, vr15, vr15, vr22
    vdp2.h.bu.b vr13, vr23, vr5
    vdp2.h.bu.b vr14, vr15, vr5
    vhaddw.w.h vr13, vr13, vr13    //789--13
    vhaddw.w.h vr14, vr14, vr14    //101112--14
    vmul.w vr15, vr7, vr16         //EPEL_FILTER(tmp, MAX_PB_SIZE), cols 0-3
    vmadd.w vr15, vr8, vr17
    vmadd.w vr15, vr9, vr18
    vmadd.w vr15, vr13, vr19
    vmul.w vr20, vr10, vr16        //cols 4-7
    vmadd.w vr20, vr11, vr17
    vmadd.w vr20, vr12, vr18
    vmadd.w vr20, vr14, vr19
    vaddi.wu vr7, vr8, 0           //back up previous value (rotate pipeline)
    vaddi.wu vr8, vr9, 0
    vaddi.wu vr9, vr13, 0
    vaddi.wu vr10, vr11, 0
    vaddi.wu vr11, vr12, 0
    vaddi.wu vr12, vr14, 0
    vsrai.w vr15, vr15, 6          // >> 6
    vsrai.w vr20, vr20, 6
    vmul.w vr15, vr15, vr1         // * wx
    vmul.w vr20, vr20, vr1
    vadd.w vr15, vr15, vr2         // + offset
    vadd.w vr20, vr20, vr2
    vsra.w vr15, vr15, vr3         // >> shift
    vsra.w vr20, vr20, vr3
    vadd.w vr15, vr15, vr4         // + ox
    vadd.w vr20, vr20, vr4
    vssrani.h.w vr20, vr15, 0      //narrow + clip to u8
    vssrani.bu.h vr20, vr20, 0
.if \w > 6
    fst.d f20, a0, 0               //8-wide store
.else
    fst.s f20, a0, 0               //6-wide store: 4 + 2 bytes
    vstelm.h vr20, a0, 4, 2
.endif
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .LOOP_HV8_HORI_\w
.endm
|
|
|
/* 6/8-wide epel HV weighted filter (LASX).  One 256-bit shuffle (xr0 holds
 * both 128-bit masks from shufb) produces all 8 horizontal windows per row;
 * the vertical 4-tap uses a 3-row pipeline in xr7-xr9.
 * \w selects the store width (8 bytes if \w > 6, else 4+2). */
.macro PUT_HEVC_EPEL_UNI_W_HV8_LASX w
    vld vr7, a2, 0                 // start to load src
    vldx vr8, a2, a3
    alsl.d a2, a3, a2, 1
    vld vr9, a2, 0
    xvreplve0.q xr7, xr7           //copy src row into both lanes
    xvreplve0.q xr8, xr8
    xvreplve0.q xr9, xr9
    xvshuf.b xr10, xr7, xr7, xr0   // 0123 1234 2345 3456 (+ 4567.. in lane 1)
    xvshuf.b xr11, xr8, xr8, xr0
    xvshuf.b xr12, xr9, xr9, xr0
    xvdp2.h.bu.b xr13, xr10, xr5   // EPEL_FILTER(src, 1)
    xvdp2.h.bu.b xr14, xr11, xr5
    xvdp2.h.bu.b xr15, xr12, xr5
    xvhaddw.w.h xr7, xr13, xr13    //rows 0..2 of the pipeline
    xvhaddw.w.h xr8, xr14, xr14
    xvhaddw.w.h xr9, xr15, xr15
.LOOP_HV8_HORI_LASX_\w:
    add.d a2, a2, a3
    vld vr15, a2, 0                //new row
    xvreplve0.q xr15, xr15
    xvshuf.b xr23, xr15, xr15, xr0
    xvdp2.h.bu.b xr10, xr23, xr5
    xvhaddw.w.h xr10, xr10, xr10
    xvmul.w xr15, xr7, xr16        //EPEL_FILTER(tmp, MAX_PB_SIZE)
    xvmadd.w xr15, xr8, xr17
    xvmadd.w xr15, xr9, xr18
    xvmadd.w xr15, xr10, xr19
    xvaddi.wu xr7, xr8, 0          //back up previous value (rotate pipeline)
    xvaddi.wu xr8, xr9, 0
    xvaddi.wu xr9, xr10, 0
    xvsrai.w xr15, xr15, 6         // >> 6
    xvmul.w xr15, xr15, xr1        // * wx
    xvadd.w xr15, xr15, xr2        // + offset
    xvsra.w xr15, xr15, xr3        // >> shift
    xvadd.w xr15, xr15, xr4        // + ox
    xvpermi.q xr20, xr15, 0x01     //high-lane results -> vr20
    vssrani.h.w vr20, vr15, 0      //narrow + clip to u8
    vssrani.bu.h vr20, vr20, 0
.if \w > 6
    fst.d f20, a0, 0               //8-wide store
.else
    fst.s f20, a0, 0               //6-wide store: 4 + 2 bytes
    vstelm.h vr20, a0, 4, 2
.endif
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .LOOP_HV8_HORI_LASX_\w
.endm
|
|
|
/* 12/16-wide epel HV weighted filter (LASX).  Columns 0-7 are pipelined in
 * xr7-xr9 and columns 8-15 in xr10-xr12 (the xvpermi.d 0x09 view starts at
 * byte 8 of each row).  \w selects the store width: 8+4 bytes for \w < 16,
 * a full 16-byte store otherwise.  vpermi.w 0xd8 repairs the word order
 * after the cross-lane narrow. */
.macro PUT_HEVC_EPEL_UNI_W_HV16_LASX w
    xvld xr7, a2, 0                // start to load src
    xvldx xr8, a2, a3
    alsl.d a2, a3, a2, 1
    xvld xr9, a2, 0
    xvpermi.d xr10, xr7, 0x09      //8..18
    xvpermi.d xr11, xr8, 0x09
    xvpermi.d xr12, xr9, 0x09
    xvreplve0.q xr7, xr7
    xvreplve0.q xr8, xr8
    xvreplve0.q xr9, xr9
    xvshuf.b xr13, xr7, xr7, xr0   // 0123 1234 2345 3456
    xvshuf.b xr14, xr8, xr8, xr0
    xvshuf.b xr15, xr9, xr9, xr0
    xvdp2.h.bu.b xr20, xr13, xr5   // EPEL_FILTER(src, 1), cols 0-7
    xvdp2.h.bu.b xr21, xr14, xr5
    xvdp2.h.bu.b xr22, xr15, xr5
    xvhaddw.w.h xr7, xr20, xr20
    xvhaddw.w.h xr8, xr21, xr21
    xvhaddw.w.h xr9, xr22, xr22
    xvreplve0.q xr10, xr10         //same pass for cols 8-15
    xvreplve0.q xr11, xr11
    xvreplve0.q xr12, xr12
    xvshuf.b xr13, xr10, xr10, xr0
    xvshuf.b xr14, xr11, xr11, xr0
    xvshuf.b xr15, xr12, xr12, xr0
    xvdp2.h.bu.b xr20, xr13, xr5
    xvdp2.h.bu.b xr21, xr14, xr5
    xvdp2.h.bu.b xr22, xr15, xr5
    xvhaddw.w.h xr10, xr20, xr20
    xvhaddw.w.h xr11, xr21, xr21
    xvhaddw.w.h xr12, xr22, xr22
.LOOP_HV16_HORI_LASX_\w:
    add.d a2, a2, a3
    xvld xr15, a2, 0               //new row
    xvpermi.d xr20, xr15, 0x09     //8...18
    xvreplve0.q xr15, xr15
    xvreplve0.q xr20, xr20
    xvshuf.b xr21, xr15, xr15, xr0
    xvshuf.b xr22, xr20, xr20, xr0
    xvdp2.h.bu.b xr13, xr21, xr5
    xvdp2.h.bu.b xr14, xr22, xr5
    xvhaddw.w.h xr13, xr13, xr13
    xvhaddw.w.h xr14, xr14, xr14
    xvmul.w xr15, xr7, xr16        //EPEL_FILTER(tmp, MAX_PB_SIZE), cols 0-7
    xvmadd.w xr15, xr8, xr17
    xvmadd.w xr15, xr9, xr18
    xvmadd.w xr15, xr13, xr19
    xvmul.w xr20, xr10, xr16       //cols 8-15
    xvmadd.w xr20, xr11, xr17
    xvmadd.w xr20, xr12, xr18
    xvmadd.w xr20, xr14, xr19
    xvaddi.wu xr7, xr8, 0          //back up previous value (rotate pipeline)
    xvaddi.wu xr8, xr9, 0
    xvaddi.wu xr9, xr13, 0
    xvaddi.wu xr10, xr11, 0
    xvaddi.wu xr11, xr12, 0
    xvaddi.wu xr12, xr14, 0
    xvsrai.w xr15, xr15, 6         // >> 6
    xvsrai.w xr20, xr20, 6         // >> 6
    xvmul.w xr15, xr15, xr1        // * wx
    xvmul.w xr20, xr20, xr1        // * wx
    xvadd.w xr15, xr15, xr2        // + offset
    xvadd.w xr20, xr20, xr2        // + offset
    xvsra.w xr15, xr15, xr3        // >> shift
    xvsra.w xr20, xr20, xr3        // >> shift
    xvadd.w xr15, xr15, xr4        // + ox
    xvadd.w xr20, xr20, xr4        // + ox
    xvssrani.h.w xr20, xr15, 0     //narrow w -> h across lanes
    xvpermi.q xr21, xr20, 0x01
    vssrani.bu.h vr21, vr20, 0     //narrow h -> u8
    vpermi.w vr21, vr21, 0xd8      //restore column order after lane merge
.if \w < 16
    fst.d f21, a0, 0               //12-wide store: 8 + 4 bytes
    vstelm.w vr21, a0, 8, 2
.else
    vst vr21, a0, 0                //16-wide store
.endif
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .LOOP_HV16_HORI_LASX_\w
.endm
|
|
|
/* epel_uni_w_hv6 (LSX): 6-wide HV; shares the HV8 pipeline macro with
 * \w = 6 selecting the 4+2 byte store. */
function ff_hevc_put_hevc_epel_uni_w_hv6_8_lsx
    LOAD_VAR 128                   //wx/offset/shift/ox -> vr1..vr4
    ld.d t0, sp, 0                 // mx
    addi.d t0, t0, -1
    slli.w t0, t0, 2               //(mx-1) * 4: one epel filter row is 4 bytes
    la.local t1, ff_hevc_epel_filters
    vldx vr5, t1, t0               // ff_hevc_epel_filters[mx - 1];
    vreplvei.w vr5, vr5, 0         //broadcast the 4 horizontal taps
    ld.d t0, sp, 8                 // my
    addi.d t0, t0, -1
    slli.w t0, t0, 2
    vldx vr6, t1, t0               // ff_hevc_epel_filters[my - 1];
    vsllwil.h.b vr6, vr6, 0        //sign-extend vertical taps b -> h -> w
    vsllwil.w.h vr6, vr6, 0
    vreplvei.w vr16, vr6, 0        //one 32-bit tap per register
    vreplvei.w vr17, vr6, 1
    vreplvei.w vr18, vr6, 2
    vreplvei.w vr19, vr6, 3
    la.local t1, shufb
    vld vr0, t1, 0                 //window mask, cols 0-3
    vaddi.bu vr22, vr0, 4          // update shufb to get high part
    sub.d a2, a2, a3               // src -= srcstride
    addi.d a2, a2, -1
    PUT_HEVC_EPEL_UNI_W_HV8_LSX 6
endfunc
|
|
|
/* epel_uni_w_hv6 (LASX): 6-wide HV via the HV8 LASX pipeline macro. */
function ff_hevc_put_hevc_epel_uni_w_hv6_8_lasx
    LOAD_VAR 256                   //wx/offset/shift/ox -> xr1..xr4
    ld.d t0, sp, 0                 // mx
    addi.d t0, t0, -1
    slli.w t0, t0, 2               //(mx-1) * 4: one epel filter row is 4 bytes
    la.local t1, ff_hevc_epel_filters
    vldx vr5, t1, t0               // ff_hevc_epel_filters[mx - 1];
    xvreplve0.w xr5, xr5           //broadcast the 4 horizontal taps
    ld.d t0, sp, 8                 // my
    addi.d t0, t0, -1
    slli.w t0, t0, 2
    vldx vr6, t1, t0               // ff_hevc_epel_filters[my - 1];
    vsllwil.h.b vr6, vr6, 0        //sign-extend vertical taps b -> h -> w
    vsllwil.w.h vr6, vr6, 0
    xvreplve0.q xr6, xr6
    xvrepl128vei.w xr16, xr6, 0    //one 32-bit tap per register
    xvrepl128vei.w xr17, xr6, 1
    xvrepl128vei.w xr18, xr6, 2
    xvrepl128vei.w xr19, xr6, 3
    la.local t1, shufb
    xvld xr0, t1, 0                //both window masks (low/high 128 bits)
    sub.d a2, a2, a3               // src -= srcstride
    addi.d a2, a2, -1
    PUT_HEVC_EPEL_UNI_W_HV8_LASX 6
endfunc
|
|
|
/* epel_uni_w_hv8 (LSX): 8-wide HV via the HV8 pipeline macro. */
function ff_hevc_put_hevc_epel_uni_w_hv8_8_lsx
    LOAD_VAR 128                   //wx/offset/shift/ox -> vr1..vr4
    ld.d t0, sp, 0                 // mx
    addi.d t0, t0, -1
    slli.w t0, t0, 2               //(mx-1) * 4: one epel filter row is 4 bytes
    la.local t1, ff_hevc_epel_filters
    vldx vr5, t1, t0               // ff_hevc_epel_filters[mx - 1];
    vreplvei.w vr5, vr5, 0         //broadcast the 4 horizontal taps
    ld.d t0, sp, 8                 // my
    addi.d t0, t0, -1
    slli.w t0, t0, 2
    vldx vr6, t1, t0               // ff_hevc_epel_filters[my - 1];
    vsllwil.h.b vr6, vr6, 0        //sign-extend vertical taps b -> h -> w
    vsllwil.w.h vr6, vr6, 0
    vreplvei.w vr16, vr6, 0        //one 32-bit tap per register
    vreplvei.w vr17, vr6, 1
    vreplvei.w vr18, vr6, 2
    vreplvei.w vr19, vr6, 3
    la.local t1, shufb
    vld vr0, t1, 0                 //window mask, cols 0-3
    vaddi.bu vr22, vr0, 4          // update shufb to get high part
    sub.d a2, a2, a3               // src -= srcstride
    addi.d a2, a2, -1
    PUT_HEVC_EPEL_UNI_W_HV8_LSX 8
endfunc
|
|
|
/* epel_uni_w_hv8 (LASX): 8-wide HV via the HV8 LASX pipeline macro. */
function ff_hevc_put_hevc_epel_uni_w_hv8_8_lasx
    LOAD_VAR 256                   //wx/offset/shift/ox -> xr1..xr4
    ld.d t0, sp, 0                 // mx
    addi.d t0, t0, -1
    slli.w t0, t0, 2               //(mx-1) * 4: one epel filter row is 4 bytes
    la.local t1, ff_hevc_epel_filters
    vldx vr5, t1, t0               // ff_hevc_epel_filters[mx - 1];
    xvreplve0.w xr5, xr5           //broadcast the 4 horizontal taps
    ld.d t0, sp, 8                 // my
    addi.d t0, t0, -1
    slli.w t0, t0, 2
    vldx vr6, t1, t0               // ff_hevc_epel_filters[my - 1];
    vsllwil.h.b vr6, vr6, 0        //sign-extend vertical taps b -> h -> w
    vsllwil.w.h vr6, vr6, 0
    xvreplve0.q xr6, xr6
    xvrepl128vei.w xr16, xr6, 0    //one 32-bit tap per register
    xvrepl128vei.w xr17, xr6, 1
    xvrepl128vei.w xr18, xr6, 2
    xvrepl128vei.w xr19, xr6, 3
    la.local t1, shufb
    xvld xr0, t1, 0                //both window masks (low/high 128 bits)
    sub.d a2, a2, a3               // src -= srcstride
    addi.d a2, a2, -1
    PUT_HEVC_EPEL_UNI_W_HV8_LASX 8
endfunc
|
|
|
/* epel_uni_w_hv12 (LSX): 12-wide HV = one 8-wide pass + one 4-wide pass
 * at column offset 8 (dst/src/height saved in t2/t3/t4 between passes). */
function ff_hevc_put_hevc_epel_uni_w_hv12_8_lsx
    LOAD_VAR 128                   //wx/offset/shift/ox -> vr1..vr4
    ld.d t0, sp, 0                 // mx
    addi.d t0, t0, -1
    slli.w t0, t0, 2               //(mx-1) * 4: one epel filter row is 4 bytes
    la.local t1, ff_hevc_epel_filters
    vldx vr5, t1, t0               // ff_hevc_epel_filters[mx - 1];
    vreplvei.w vr5, vr5, 0         //broadcast the 4 horizontal taps
    ld.d t0, sp, 8                 // my
    addi.d t0, t0, -1
    slli.w t0, t0, 2
    vldx vr6, t1, t0               // ff_hevc_epel_filters[my - 1];
    vsllwil.h.b vr6, vr6, 0        //sign-extend vertical taps b -> h -> w
    vsllwil.w.h vr6, vr6, 0
    vreplvei.w vr16, vr6, 0        //one 32-bit tap per register
    vreplvei.w vr17, vr6, 1
    vreplvei.w vr18, vr6, 2
    vreplvei.w vr19, vr6, 3
    la.local t1, shufb
    vld vr0, t1, 0                 //window mask, cols 0-3
    vaddi.bu vr22, vr0, 4          // update shufb to get high part
    sub.d a2, a2, a3               // src -= srcstride
    addi.d a2, a2, -1
    addi.d t2, a0, 0               //save dst
    addi.d t3, a2, 0               //save src
    addi.d t4, a4, 0               //save height
    PUT_HEVC_EPEL_UNI_W_HV8_LSX 12 //columns 0..7
    addi.d a0, t2, 8               //columns 8..11
    addi.d a2, t3, 8
    addi.d a4, t4, 0
    PUT_HEVC_EPEL_UNI_W_HV4_LSX 12
endfunc
|
|
|
/* epel_uni_w_hv12 (LASX): 12-wide HV via the HV16 macro (\w = 12 selects
 * the 8+4 byte store). */
function ff_hevc_put_hevc_epel_uni_w_hv12_8_lasx
    LOAD_VAR 256                   //wx/offset/shift/ox -> xr1..xr4
    ld.d t0, sp, 0                 // mx
    addi.d t0, t0, -1
    slli.w t0, t0, 2               //(mx-1) * 4: one epel filter row is 4 bytes
    la.local t1, ff_hevc_epel_filters
    vldx vr5, t1, t0               // ff_hevc_epel_filters[mx - 1];
    xvreplve0.w xr5, xr5           //broadcast the 4 horizontal taps
    ld.d t0, sp, 8                 // my
    addi.d t0, t0, -1
    slli.w t0, t0, 2
    vldx vr6, t1, t0               // ff_hevc_epel_filters[my - 1];
    vsllwil.h.b vr6, vr6, 0        //sign-extend vertical taps b -> h -> w
    vsllwil.w.h vr6, vr6, 0
    xvreplve0.q xr6, xr6
    xvrepl128vei.w xr16, xr6, 0    //one 32-bit tap per register
    xvrepl128vei.w xr17, xr6, 1
    xvrepl128vei.w xr18, xr6, 2
    xvrepl128vei.w xr19, xr6, 3
    la.local t1, shufb
    xvld xr0, t1, 0                //both window masks (low/high 128 bits)
    sub.d a2, a2, a3               // src -= srcstride
    addi.d a2, a2, -1
    PUT_HEVC_EPEL_UNI_W_HV16_LASX 12
endfunc
|
|
|
/* epel_uni_w_hv16 (LSX): 16-wide HV = two 8-wide passes (column offsets
 * 0 and 8), looping with t5 as the column counter. */
function ff_hevc_put_hevc_epel_uni_w_hv16_8_lsx
    LOAD_VAR 128                   //wx/offset/shift/ox -> vr1..vr4
    ld.d t0, sp, 0                 // mx
    addi.d t0, t0, -1
    slli.w t0, t0, 2               //(mx-1) * 4: one epel filter row is 4 bytes
    la.local t1, ff_hevc_epel_filters
    vldx vr5, t1, t0               // ff_hevc_epel_filters[mx - 1];
    vreplvei.w vr5, vr5, 0         //broadcast the 4 horizontal taps
    ld.d t0, sp, 8                 // my
    addi.d t0, t0, -1
    slli.w t0, t0, 2
    vldx vr6, t1, t0               // ff_hevc_epel_filters[my - 1];
    vsllwil.h.b vr6, vr6, 0        //sign-extend vertical taps b -> h -> w
    vsllwil.w.h vr6, vr6, 0
    vreplvei.w vr16, vr6, 0        //one 32-bit tap per register
    vreplvei.w vr17, vr6, 1
    vreplvei.w vr18, vr6, 2
    vreplvei.w vr19, vr6, 3
    la.local t1, shufb
    vld vr0, t1, 0                 //window mask, cols 0-3
    vaddi.bu vr22, vr0, 4          // update shufb to get high part
    sub.d a2, a2, a3               // src -= srcstride
    addi.d a2, a2, -1
    addi.d t2, a0, 0               //save dst
    addi.d t3, a2, 0               //save src
    addi.d t4, a4, 0               //save height
    addi.d t5, zero, 2             //2 passes of 8 columns
.LOOP_HV16:
    PUT_HEVC_EPEL_UNI_W_HV8_LSX 16
    addi.d a0, t2, 8               //NOTE(review): t2/t3 are not advanced, so
    addi.d a2, t3, 8               //both iterations target offset 8 after the
    addi.d a4, t4, 0               //first — works because t5 reaches 0; verify
    addi.d t5, t5, -1
    bnez t5, .LOOP_HV16
endfunc
|
|
|
/* epel_uni_w_hv16 (LASX): 16-wide HV via the HV16 macro (full 16-byte
 * store per row). */
function ff_hevc_put_hevc_epel_uni_w_hv16_8_lasx
    LOAD_VAR 256                   //wx/offset/shift/ox -> xr1..xr4
    ld.d t0, sp, 0                 // mx
    addi.d t0, t0, -1
    slli.w t0, t0, 2               //(mx-1) * 4: one epel filter row is 4 bytes
    la.local t1, ff_hevc_epel_filters
    vldx vr5, t1, t0               // ff_hevc_epel_filters[mx - 1];
    xvreplve0.w xr5, xr5           //broadcast the 4 horizontal taps
    ld.d t0, sp, 8                 // my
    addi.d t0, t0, -1
    slli.w t0, t0, 2
    vldx vr6, t1, t0               // ff_hevc_epel_filters[my - 1];
    vsllwil.h.b vr6, vr6, 0        //sign-extend vertical taps b -> h -> w
    vsllwil.w.h vr6, vr6, 0
    xvreplve0.q xr6, xr6
    xvrepl128vei.w xr16, xr6, 0    //one 32-bit tap per register
    xvrepl128vei.w xr17, xr6, 1
    xvrepl128vei.w xr18, xr6, 2
    xvrepl128vei.w xr19, xr6, 3
    la.local t1, shufb
    xvld xr0, t1, 0                //both window masks (low/high 128 bits)
    sub.d a2, a2, a3               // src -= srcstride
    addi.d a2, a2, -1
    PUT_HEVC_EPEL_UNI_W_HV16_LASX 16
endfunc
|
|
|
/* 8-bit uni-directional weighted 2D 4-tap EPEL, width 24: three passes of
 * the 8-wide LSX HV kernel, stepping dst/src by 8 columns each pass. */
function ff_hevc_put_hevc_epel_uni_w_hv24_8_lsx
    LOAD_VAR 128                          // vr1..vr4 = wx / offset / shift / ox
    ld.d            t0,    sp,    0       // mx
    addi.d          t0,    t0,    -1
    slli.w          t0,    t0,    2       // each epel filter row is 4 bytes
    la.local        t1,    ff_hevc_epel_filters
    vldx            vr5,   t1,    t0      // ff_hevc_epel_filters[mx - 1];
    vreplvei.w      vr5,   vr5,   0       // broadcast the 4 horizontal taps
    ld.d            t0,    sp,    8       // my
    addi.d          t0,    t0,    -1
    slli.w          t0,    t0,    2
    vldx            vr6,   t1,    t0      // ff_hevc_epel_filters[my - 1];
    vsllwil.h.b     vr6,   vr6,   0       // widen vertical taps i8 -> i16
    vsllwil.w.h     vr6,   vr6,   0       // and i16 -> i32
    vreplvei.w      vr16,  vr6,   0       // vr16..vr19 = one vertical tap each
    vreplvei.w      vr17,  vr6,   1
    vreplvei.w      vr18,  vr6,   2
    vreplvei.w      vr19,  vr6,   3
    la.local        t1,    shufb
    vld             vr0,   t1,    0
    vaddi.bu        vr22,  vr0,   4       // update shufb to get high part
    sub.d           a2,    a2,    a3      // src -= srcstride
    addi.d          a2,    a2,    -1      // step back one column for the 4-tap h filter
    addi.d          t2,    a0,    0       // save dst/src/height between passes
    addi.d          t3,    a2,    0
    addi.d          t4,    a4,    0
    addi.d          t5,    zero,  3       // three 8-wide column passes
.LOOP_HV24:
    PUT_HEVC_EPEL_UNI_W_HV8_LSX 24
    addi.d          a0,    t2,    8       // advance to the next 8 columns
    addi.d          t2,    t2,    8
    addi.d          a2,    t3,    8
    addi.d          t3,    t3,    8
    addi.d          a4,    t4,    0       // restore height
    addi.d          t5,    t5,    -1
    bnez            t5,    .LOOP_HV24
endfunc
|
|
|
/* 8-bit uni-directional weighted 2D 4-tap EPEL, width 24 (LASX):
 * one 16-wide pass followed by one 8-wide pass at column offset 16. */
function ff_hevc_put_hevc_epel_uni_w_hv24_8_lasx
    LOAD_VAR 256                          // xr1..xr4 = wx / offset / shift / ox
    ld.d            t0,    sp,    0       // mx
    addi.d          t0,    t0,    -1
    slli.w          t0,    t0,    2       // each epel filter row is 4 bytes
    la.local        t1,    ff_hevc_epel_filters
    vldx            vr5,   t1,    t0      // ff_hevc_epel_filters[mx - 1];
    xvreplve0.w     xr5,   xr5            // broadcast horizontal taps
    ld.d            t0,    sp,    8       // my
    addi.d          t0,    t0,    -1
    slli.w          t0,    t0,    2
    vldx            vr6,   t1,    t0      // ff_hevc_epel_filters[my - 1];
    vsllwil.h.b     vr6,   vr6,   0       // widen vertical taps i8 -> i16
    vsllwil.w.h     vr6,   vr6,   0       // and i16 -> i32
    xvreplve0.q     xr6,   xr6
    xvrepl128vei.w  xr16,  xr6,   0       // xr16..xr19 = one vertical tap each
    xvrepl128vei.w  xr17,  xr6,   1
    xvrepl128vei.w  xr18,  xr6,   2
    xvrepl128vei.w  xr19,  xr6,   3
    la.local        t1,    shufb
    xvld            xr0,   t1,    0
    sub.d           a2,    a2,    a3      // src -= srcstride
    addi.d          a2,    a2,    -1      // step back one column for the 4-tap h filter
    addi.d          t2,    a0,    0       // save dst/src/height for the 8-wide tail
    addi.d          t3,    a2,    0
    addi.d          t4,    a4,    0
    PUT_HEVC_EPEL_UNI_W_HV16_LASX 24      // columns 0..15
    addi.d          a0,    t2,    16
    addi.d          a2,    t3,    16
    addi.d          a4,    t4,    0       // restore height
    PUT_HEVC_EPEL_UNI_W_HV8_LASX 24       // columns 16..23
endfunc
|
|
|
/* 8-bit uni-directional weighted 2D 4-tap EPEL, width 32: four passes of
 * the 8-wide LSX HV kernel. */
function ff_hevc_put_hevc_epel_uni_w_hv32_8_lsx
    LOAD_VAR 128                          // vr1..vr4 = wx / offset / shift / ox
    ld.d            t0,    sp,    0       // mx
    addi.d          t0,    t0,    -1
    slli.w          t0,    t0,    2       // each epel filter row is 4 bytes
    la.local        t1,    ff_hevc_epel_filters
    vldx            vr5,   t1,    t0      // ff_hevc_epel_filters[mx - 1];
    vreplvei.w      vr5,   vr5,   0       // broadcast the 4 horizontal taps
    ld.d            t0,    sp,    8       // my
    addi.d          t0,    t0,    -1
    slli.w          t0,    t0,    2
    vldx            vr6,   t1,    t0      // ff_hevc_epel_filters[my - 1];
    vsllwil.h.b     vr6,   vr6,   0       // widen vertical taps i8 -> i16
    vsllwil.w.h     vr6,   vr6,   0       // and i16 -> i32
    vreplvei.w      vr16,  vr6,   0       // vr16..vr19 = one vertical tap each
    vreplvei.w      vr17,  vr6,   1
    vreplvei.w      vr18,  vr6,   2
    vreplvei.w      vr19,  vr6,   3
    la.local        t1,    shufb
    vld             vr0,   t1,    0
    vaddi.bu        vr22,  vr0,   4       // update shufb to get high part
    sub.d           a2,    a2,    a3      // src -= srcstride
    addi.d          a2,    a2,    -1      // step back one column for the 4-tap h filter
    addi.d          t2,    a0,    0       // save dst/src/height between passes
    addi.d          t3,    a2,    0
    addi.d          t4,    a4,    0
    addi.d          t5,    zero,  4       // four 8-wide column passes
.LOOP_HV32:
    PUT_HEVC_EPEL_UNI_W_HV8_LSX 32
    addi.d          a0,    t2,    8       // advance to the next 8 columns
    addi.d          t2,    t2,    8
    addi.d          a2,    t3,    8
    addi.d          t3,    t3,    8
    addi.d          a4,    t4,    0       // restore height
    addi.d          t5,    t5,    -1
    bnez            t5,    .LOOP_HV32
endfunc
|
|
|
/* 8-bit uni-directional weighted 2D 4-tap EPEL, width 32 (LASX):
 * two passes of the 16-wide LASX HV kernel. */
function ff_hevc_put_hevc_epel_uni_w_hv32_8_lasx
    LOAD_VAR 256                          // xr1..xr4 = wx / offset / shift / ox
    ld.d            t0,    sp,    0       // mx
    addi.d          t0,    t0,    -1
    slli.w          t0,    t0,    2       // each epel filter row is 4 bytes
    la.local        t1,    ff_hevc_epel_filters
    vldx            vr5,   t1,    t0      // ff_hevc_epel_filters[mx - 1];
    xvreplve0.w     xr5,   xr5            // broadcast horizontal taps
    ld.d            t0,    sp,    8       // my
    addi.d          t0,    t0,    -1
    slli.w          t0,    t0,    2
    vldx            vr6,   t1,    t0      // ff_hevc_epel_filters[my - 1];
    vsllwil.h.b     vr6,   vr6,   0       // widen vertical taps i8 -> i16
    vsllwil.w.h     vr6,   vr6,   0       // and i16 -> i32
    xvreplve0.q     xr6,   xr6
    xvrepl128vei.w  xr16,  xr6,   0       // xr16..xr19 = one vertical tap each
    xvrepl128vei.w  xr17,  xr6,   1
    xvrepl128vei.w  xr18,  xr6,   2
    xvrepl128vei.w  xr19,  xr6,   3
    la.local        t1,    shufb
    xvld            xr0,   t1,    0
    sub.d           a2,    a2,    a3      // src -= srcstride
    addi.d          a2,    a2,    -1      // step back one column for the 4-tap h filter
    addi.d          t2,    a0,    0       // save dst/src/height between passes
    addi.d          t3,    a2,    0
    addi.d          t4,    a4,    0
    addi.d          t5,    zero,  2       // two 16-wide column passes
.LOOP_HV32_LASX:
    PUT_HEVC_EPEL_UNI_W_HV16_LASX 32
    addi.d          a0,    t2,    16      // advance to the next 16 columns
    addi.d          t2,    t2,    16
    addi.d          a2,    t3,    16
    addi.d          t3,    t3,    16
    addi.d          a4,    t4,    0       // restore height
    addi.d          t5,    t5,    -1
    bnez            t5,    .LOOP_HV32_LASX
endfunc
|
|
|
/* 8-bit uni-directional weighted 2D 4-tap EPEL, width 48: six passes of
 * the 8-wide LSX HV kernel. */
function ff_hevc_put_hevc_epel_uni_w_hv48_8_lsx
    LOAD_VAR 128                          // vr1..vr4 = wx / offset / shift / ox
    ld.d            t0,    sp,    0       // mx
    addi.d          t0,    t0,    -1
    slli.w          t0,    t0,    2       // each epel filter row is 4 bytes
    la.local        t1,    ff_hevc_epel_filters
    vldx            vr5,   t1,    t0      // ff_hevc_epel_filters[mx - 1];
    vreplvei.w      vr5,   vr5,   0       // broadcast the 4 horizontal taps
    ld.d            t0,    sp,    8       // my
    addi.d          t0,    t0,    -1
    slli.w          t0,    t0,    2
    vldx            vr6,   t1,    t0      // ff_hevc_epel_filters[my - 1];
    vsllwil.h.b     vr6,   vr6,   0       // widen vertical taps i8 -> i16
    vsllwil.w.h     vr6,   vr6,   0       // and i16 -> i32
    vreplvei.w      vr16,  vr6,   0       // vr16..vr19 = one vertical tap each
    vreplvei.w      vr17,  vr6,   1
    vreplvei.w      vr18,  vr6,   2
    vreplvei.w      vr19,  vr6,   3
    la.local        t1,    shufb
    vld             vr0,   t1,    0
    vaddi.bu        vr22,  vr0,   4       // update shufb to get high part
    sub.d           a2,    a2,    a3      // src -= srcstride
    addi.d          a2,    a2,    -1      // step back one column for the 4-tap h filter
    addi.d          t2,    a0,    0       // save dst/src/height between passes
    addi.d          t3,    a2,    0
    addi.d          t4,    a4,    0
    addi.d          t5,    zero,  6       // six 8-wide column passes
.LOOP_HV48:
    PUT_HEVC_EPEL_UNI_W_HV8_LSX 48
    addi.d          a0,    t2,    8       // advance to the next 8 columns
    addi.d          t2,    t2,    8
    addi.d          a2,    t3,    8
    addi.d          t3,    t3,    8
    addi.d          a4,    t4,    0       // restore height
    addi.d          t5,    t5,    -1
    bnez            t5,    .LOOP_HV48
endfunc
|
|
|
/* 8-bit uni-directional weighted 2D 4-tap EPEL, width 48 (LASX):
 * three passes of the 16-wide LASX HV kernel. */
function ff_hevc_put_hevc_epel_uni_w_hv48_8_lasx
    LOAD_VAR 256                          // xr1..xr4 = wx / offset / shift / ox
    ld.d            t0,    sp,    0       // mx
    addi.d          t0,    t0,    -1
    slli.w          t0,    t0,    2       // each epel filter row is 4 bytes
    la.local        t1,    ff_hevc_epel_filters
    vldx            vr5,   t1,    t0      // ff_hevc_epel_filters[mx - 1];
    xvreplve0.w     xr5,   xr5            // broadcast horizontal taps
    ld.d            t0,    sp,    8       // my
    addi.d          t0,    t0,    -1
    slli.w          t0,    t0,    2
    vldx            vr6,   t1,    t0      // ff_hevc_epel_filters[my - 1];
    vsllwil.h.b     vr6,   vr6,   0       // widen vertical taps i8 -> i16
    vsllwil.w.h     vr6,   vr6,   0       // and i16 -> i32
    xvreplve0.q     xr6,   xr6
    xvrepl128vei.w  xr16,  xr6,   0       // xr16..xr19 = one vertical tap each
    xvrepl128vei.w  xr17,  xr6,   1
    xvrepl128vei.w  xr18,  xr6,   2
    xvrepl128vei.w  xr19,  xr6,   3
    la.local        t1,    shufb
    xvld            xr0,   t1,    0
    sub.d           a2,    a2,    a3      // src -= srcstride
    addi.d          a2,    a2,    -1      // step back one column for the 4-tap h filter
    addi.d          t2,    a0,    0       // save dst/src/height between passes
    addi.d          t3,    a2,    0
    addi.d          t4,    a4,    0
    addi.d          t5,    zero,  3       // three 16-wide column passes
.LOOP_HV48_LASX:
    PUT_HEVC_EPEL_UNI_W_HV16_LASX 48
    addi.d          a0,    t2,    16      // advance to the next 16 columns
    addi.d          t2,    t2,    16
    addi.d          a2,    t3,    16
    addi.d          t3,    t3,    16
    addi.d          a4,    t4,    0       // restore height
    addi.d          t5,    t5,    -1
    bnez            t5,    .LOOP_HV48_LASX
endfunc
|
|
|
/* 8-bit uni-directional weighted 2D 4-tap EPEL, width 64: eight passes of
 * the 8-wide LSX HV kernel. */
function ff_hevc_put_hevc_epel_uni_w_hv64_8_lsx
    LOAD_VAR 128                          // vr1..vr4 = wx / offset / shift / ox
    ld.d            t0,    sp,    0       // mx
    addi.d          t0,    t0,    -1
    slli.w          t0,    t0,    2       // each epel filter row is 4 bytes
    la.local        t1,    ff_hevc_epel_filters
    vldx            vr5,   t1,    t0      // ff_hevc_epel_filters[mx - 1];
    vreplvei.w      vr5,   vr5,   0       // broadcast the 4 horizontal taps
    ld.d            t0,    sp,    8       // my
    addi.d          t0,    t0,    -1
    slli.w          t0,    t0,    2
    vldx            vr6,   t1,    t0      // ff_hevc_epel_filters[my - 1];
    vsllwil.h.b     vr6,   vr6,   0       // widen vertical taps i8 -> i16
    vsllwil.w.h     vr6,   vr6,   0       // and i16 -> i32
    vreplvei.w      vr16,  vr6,   0       // vr16..vr19 = one vertical tap each
    vreplvei.w      vr17,  vr6,   1
    vreplvei.w      vr18,  vr6,   2
    vreplvei.w      vr19,  vr6,   3
    la.local        t1,    shufb
    vld             vr0,   t1,    0
    vaddi.bu        vr22,  vr0,   4       // update shufb to get high part
    sub.d           a2,    a2,    a3      // src -= srcstride
    addi.d          a2,    a2,    -1      // step back one column for the 4-tap h filter
    addi.d          t2,    a0,    0       // save dst/src/height between passes
    addi.d          t3,    a2,    0
    addi.d          t4,    a4,    0
    addi.d          t5,    zero,  8       // eight 8-wide column passes
.LOOP_HV64:
    PUT_HEVC_EPEL_UNI_W_HV8_LSX 64
    addi.d          a0,    t2,    8       // advance to the next 8 columns
    addi.d          t2,    t2,    8
    addi.d          a2,    t3,    8
    addi.d          t3,    t3,    8
    addi.d          a4,    t4,    0       // restore height
    addi.d          t5,    t5,    -1
    bnez            t5,    .LOOP_HV64
endfunc
|
|
|
/* 8-bit uni-directional weighted 2D 4-tap EPEL, width 64 (LASX):
 * four passes of the 16-wide LASX HV kernel. */
function ff_hevc_put_hevc_epel_uni_w_hv64_8_lasx
    LOAD_VAR 256                          // xr1..xr4 = wx / offset / shift / ox
    ld.d            t0,    sp,    0       // mx
    addi.d          t0,    t0,    -1
    slli.w          t0,    t0,    2       // each epel filter row is 4 bytes
    la.local        t1,    ff_hevc_epel_filters
    vldx            vr5,   t1,    t0      // ff_hevc_epel_filters[mx - 1];
    xvreplve0.w     xr5,   xr5            // broadcast horizontal taps
    ld.d            t0,    sp,    8       // my
    addi.d          t0,    t0,    -1
    slli.w          t0,    t0,    2
    vldx            vr6,   t1,    t0      // ff_hevc_epel_filters[my - 1];
    vsllwil.h.b     vr6,   vr6,   0       // widen vertical taps i8 -> i16
    vsllwil.w.h     vr6,   vr6,   0       // and i16 -> i32
    xvreplve0.q     xr6,   xr6
    xvrepl128vei.w  xr16,  xr6,   0       // xr16..xr19 = one vertical tap each
    xvrepl128vei.w  xr17,  xr6,   1
    xvrepl128vei.w  xr18,  xr6,   2
    xvrepl128vei.w  xr19,  xr6,   3
    la.local        t1,    shufb
    xvld            xr0,   t1,    0
    sub.d           a2,    a2,    a3      // src -= srcstride
    addi.d          a2,    a2,    -1      // step back one column for the 4-tap h filter
    addi.d          t2,    a0,    0       // save dst/src/height between passes
    addi.d          t3,    a2,    0
    addi.d          t4,    a4,    0
    addi.d          t5,    zero,  4       // four 16-wide column passes
.LOOP_HV64_LASX:
    PUT_HEVC_EPEL_UNI_W_HV16_LASX 64
    addi.d          a0,    t2,    16      // advance to the next 16 columns
    addi.d          t2,    t2,    16
    addi.d          a2,    t3,    16
    addi.d          t3,    t3,    16
    addi.d          a4,    t4,    0       // restore height
    addi.d          t5,    t5,    -1
    bnez            t5,    .LOOP_HV64_LASX
endfunc
|
|
|
/* |
|
* void FUNC(put_hevc_qpel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride, |
|
* const uint8_t *_src, ptrdiff_t _srcstride, |
|
* int height, intptr_t mx, intptr_t my, |
|
* int width) |
|
*/ |
|
/* put_hevc_qpel_uni_h, width 4 (see signature comment above):
 * a0=dst a1=dststride a2=src a3=srcstride a4=height a5=mx a6=my a7=width.
 * 8-tap horizontal qpel on 4 pixels, two rows per loop iteration,
 * unweighted: result = (filter(src) + 32) >> 6, clipped to u8. */
function ff_hevc_put_hevc_uni_qpel_h4_8_lsx
    addi.d          t0,    a5,    -1
    slli.w          t0,    t0,    4       // each qpel filter row is 16 bytes
    la.local        t1,    ff_hevc_qpel_filters
    vldx            vr5,   t1,    t0      //filter
    addi.d          a2,    a2,    -3      //src -= 3
    addi.w          t1,    zero,  32
    vreplgr2vr.h    vr1,   t1             // rounding constant 32
    la.local        t1,    shufb
    vld             vr2,   t1,    32      //mask0 0 1
    vaddi.bu        vr3,   vr2,   2       //mask1 2 3
.LOOP_UNI_H4:
    vld             vr18,  a2,    0       // row 0
    vldx            vr19,  a2,    a3      // row 1
    alsl.d          a2,    a3,    a2,  1  // src += 2 * srcstride
    vshuf.b         vr6,   vr18,  vr18, vr2
    vshuf.b         vr7,   vr18,  vr18, vr3
    vshuf.b         vr8,   vr19,  vr19, vr2
    vshuf.b         vr9,   vr19,  vr19, vr3
    vdp2.h.bu.b     vr10,  vr6,   vr5    // u8 x i8 dot products with the 8 taps
    vdp2.h.bu.b     vr11,  vr7,   vr5
    vdp2.h.bu.b     vr12,  vr8,   vr5
    vdp2.h.bu.b     vr13,  vr9,   vr5
    // NOTE(review): single-operand vhaddw.d.h form -- presumably pairwise
    // widening add with source = destination; confirm against the assembler.
    vhaddw.d.h      vr10
    vhaddw.d.h      vr11
    vhaddw.d.h      vr12
    vhaddw.d.h      vr13
    vpickev.w       vr10,  vr11,  vr10   // gather the four sums per row
    vpickev.w       vr11,  vr13,  vr12
    vpickev.h       vr10,  vr11,  vr10
    vadd.h          vr10,  vr10,  vr1    // + 32
    vsrai.h         vr10,  vr10,  6      // >> 6
    vssrani.bu.h    vr10,  vr10,  0      // clip to u8
    fst.s           f10,   a0,    0      // store row 0 (4 bytes)
    vbsrl.v         vr10,  vr10,  4
    fstx.s          f10,   a0,    a1     // store row 1
    alsl.d          a0,    a1,    a0,  1 // dst += 2 * dststride
    addi.d          a4,    a4,    -2
    bnez            a4,    .LOOP_UNI_H4
endfunc
|
|
|
// Horizontal 8-tap qpel filter for 8 pixels (LSX).
// \in0  : 16 source bytes (pixel i uses bytes i..i+7)
// \out0 : 8 filtered results as i16, already rounded: (filter + 32) >> 6
// Expects: vr0..vr3 = filter tap pairs ab/cd/ef/gh (replicated per lane),
//          vr4 = 32 (rounding), vr5..vr8 = shufb masks.
// Clobbers vr10-vr13.
.macro HEVC_UNI_QPEL_H8_LSX in0, out0
    vshuf.b         vr10,  \in0,  \in0,  vr5
    vshuf.b         vr11,  \in0,  \in0,  vr6
    vshuf.b         vr12,  \in0,  \in0,  vr7
    vshuf.b         vr13,  \in0,  \in0,  vr8
    vdp2.h.bu.b     \out0, vr10,  vr0    //(QPEL_FILTER(src, 1)
    vdp2add.h.bu.b  \out0, vr11,  vr1    // accumulate the remaining tap pairs
    vdp2add.h.bu.b  \out0, vr12,  vr2
    vdp2add.h.bu.b  \out0, vr13,  vr3
    vadd.h          \out0, \out0, vr4    // + 32
    vsrai.h         \out0, \out0, 6      // >> 6
.endm
|
|
|
// Horizontal 8-tap qpel filter for 16 pixels (LASX), same scheme as the
// LSX macro but on both 128-bit halves of \in0.
// \in0  : source bytes (caller pre-permutes with xvpermi.d 0x94)
// \out0 : 16 filtered results as i16, rounded: (filter + 32) >> 6
// Expects: xr0..xr3 = tap pairs, xr4 = 32, xr5..xr8 = shufb masks.
// Clobbers xr10-xr13.
.macro HEVC_UNI_QPEL_H16_LASX in0, out0
    xvshuf.b        xr10,  \in0,  \in0,  xr5
    xvshuf.b        xr11,  \in0,  \in0,  xr6
    xvshuf.b        xr12,  \in0,  \in0,  xr7
    xvshuf.b        xr13,  \in0,  \in0,  xr8
    xvdp2.h.bu.b    \out0, xr10,  xr0    //(QPEL_FILTER(src, 1)
    xvdp2add.h.bu.b \out0, xr11,  xr1    // accumulate the remaining tap pairs
    xvdp2add.h.bu.b \out0, xr12,  xr2
    xvdp2add.h.bu.b \out0, xr13,  xr3
    xvadd.h         \out0, \out0, xr4    // + 32
    xvsrai.h        \out0, \out0, 6      // >> 6
.endm
|
|
|
/* put_hevc_qpel_uni_h, width 6: one row per iteration via the 8-wide LSX
 * kernel; stores 4 bytes + one halfword. */
function ff_hevc_put_hevc_uni_qpel_h6_8_lsx
    addi.d          t0,    a5,    -1
    slli.w          t0,    t0,    4       // each qpel filter row is 16 bytes
    la.local        t1,    ff_hevc_qpel_filters
    vldx            vr0,   t1,    t0      //filter abcdefgh
    vreplvei.h      vr1,   vr0,   1       //cd...
    vreplvei.h      vr2,   vr0,   2       //ef...
    vreplvei.h      vr3,   vr0,   3       //gh...
    vreplvei.h      vr0,   vr0,   0       //ab...
    addi.d          a2,    a2,    -3      //src -= 3
    addi.w          t1,    zero,  32
    vreplgr2vr.h    vr4,   t1             // rounding constant 32
    la.local        t1,    shufb
    vld             vr5,   t1,    48      // base shuffle mask
    vaddi.bu        vr6,   vr5,   2
    vaddi.bu        vr7,   vr5,   4
    vaddi.bu        vr8,   vr5,   6
.LOOP_UNI_H6:
    vld             vr9,   a2,    0
    add.d           a2,    a2,    a3
    HEVC_UNI_QPEL_H8_LSX vr9, vr14
    vssrani.bu.h    vr14,  vr14,  0      // clip to u8
    fst.s           f14,   a0,    0      // pixels 0..3
    vstelm.h        vr14,  a0,    4,  2  // pixels 4..5
    add.d           a0,    a0,    a1
    addi.d          a4,    a4,    -1
    bnez            a4,    .LOOP_UNI_H6
endfunc
|
|
|
/* put_hevc_qpel_uni_h, width 8: one row per iteration via the 8-wide LSX
 * kernel; stores 8 bytes. */
function ff_hevc_put_hevc_uni_qpel_h8_8_lsx
    addi.d          t0,    a5,    -1
    slli.w          t0,    t0,    4       // each qpel filter row is 16 bytes
    la.local        t1,    ff_hevc_qpel_filters
    vldx            vr0,   t1,    t0      //filter abcdefgh
    vreplvei.h      vr1,   vr0,   1       //cd...
    vreplvei.h      vr2,   vr0,   2       //ef...
    vreplvei.h      vr3,   vr0,   3       //gh...
    vreplvei.h      vr0,   vr0,   0       //ab...
    addi.d          a2,    a2,    -3      //src -= 3
    addi.w          t1,    zero,  32
    vreplgr2vr.h    vr4,   t1             // rounding constant 32
    la.local        t1,    shufb
    vld             vr5,   t1,    48      // base shuffle mask
    vaddi.bu        vr6,   vr5,   2
    vaddi.bu        vr7,   vr5,   4
    vaddi.bu        vr8,   vr5,   6
.LOOP_UNI_H8:
    vld             vr9,   a2,    0
    add.d           a2,    a2,    a3
    HEVC_UNI_QPEL_H8_LSX vr9, vr14
    vssrani.bu.h    vr14,  vr14,  0      // clip to u8
    fst.d           f14,   a0,    0
    add.d           a0,    a0,    a1
    addi.d          a4,    a4,    -1
    bnez            a4,    .LOOP_UNI_H8
endfunc
|
|
|
/* put_hevc_qpel_uni_h, width 12: two 8-wide kernel calls per row
 * (offsets 0 and 8); stores 8 bytes + one word. */
function ff_hevc_put_hevc_uni_qpel_h12_8_lsx
    addi.d          t0,    a5,    -1
    slli.w          t0,    t0,    4       // each qpel filter row is 16 bytes
    la.local        t1,    ff_hevc_qpel_filters
    vldx            vr0,   t1,    t0      //filter abcdefgh
    vreplvei.h      vr1,   vr0,   1       //cd...
    vreplvei.h      vr2,   vr0,   2       //ef...
    vreplvei.h      vr3,   vr0,   3       //gh...
    vreplvei.h      vr0,   vr0,   0       //ab...
    addi.d          a2,    a2,    -3      //src -= 3
    addi.w          t1,    zero,  32
    vreplgr2vr.h    vr4,   t1             // rounding constant 32
    la.local        t1,    shufb
    vld             vr5,   t1,    48      // base shuffle mask
    vaddi.bu        vr6,   vr5,   2
    vaddi.bu        vr7,   vr5,   4
    vaddi.bu        vr8,   vr5,   6
.LOOP_UNI_H12:
    vld             vr9,   a2,    0
    HEVC_UNI_QPEL_H8_LSX vr9, vr14       // pixels 0..7
    vld             vr9,   a2,    8
    add.d           a2,    a2,    a3
    HEVC_UNI_QPEL_H8_LSX vr9, vr15       // pixels 8..15 (only 8..11 stored)
    vssrani.bu.h    vr15,  vr14,  0      // clip both halves to u8
    fst.d           f15,   a0,    0      // pixels 0..7
    vstelm.w        vr15,  a0,    8,  2  // pixels 8..11
    add.d           a0,    a0,    a1
    addi.d          a4,    a4,    -1
    bnez            a4,    .LOOP_UNI_H12
endfunc
|
|
|
/* put_hevc_qpel_uni_h, width 12 (LASX): one 16-wide kernel call per row;
 * only the low 12 results are stored. */
function ff_hevc_put_hevc_uni_qpel_h12_8_lasx
    addi.d          t0,    a5,    -1
    slli.w          t0,    t0,    4       // each qpel filter row is 16 bytes
    la.local        t1,    ff_hevc_qpel_filters
    vldx            vr0,   t1,    t0      //filter abcdefgh
    xvreplve0.q     xr0,   xr0            // copy taps into the high 128 bits
    xvrepl128vei.h  xr1,   xr0,   1       //cd...
    xvrepl128vei.h  xr2,   xr0,   2       //ef...
    xvrepl128vei.h  xr3,   xr0,   3       //gh...
    xvrepl128vei.h  xr0,   xr0,   0       //ab...
    addi.d          a2,    a2,    -3      //src -= 3
    addi.w          t1,    zero,  32
    xvreplgr2vr.h   xr4,   t1             // rounding constant 32
    la.local        t1,    shufb
    vld             vr5,   t1,    48      // base shuffle mask
    xvreplve0.q     xr5,   xr5
    xvaddi.bu       xr6,   xr5,   2
    xvaddi.bu       xr7,   xr5,   4
    xvaddi.bu       xr8,   xr5,   6
.LOOP_UNI_H12_LASX:
    xvld            xr9,   a2,    0
    add.d           a2,    a2,    a3
    xvpermi.d       xr9,   xr9,   0x94   //rearrange data
    HEVC_UNI_QPEL_H16_LASX xr9, xr14
    xvpermi.q       xr15,  xr14,  0x01   // high 8 results -> vr15
    vssrani.bu.h    vr15,  vr14,  0      // clip to u8
    fst.d           f15,   a0,    0      // pixels 0..7
    vstelm.w        vr15,  a0,    8,  2  // pixels 8..11
    add.d           a0,    a0,    a1
    addi.d          a4,    a4,    -1
    bnez            a4,    .LOOP_UNI_H12_LASX
endfunc
|
|
|
/* put_hevc_qpel_uni_h, width 16: two 8-wide kernel calls per row. */
function ff_hevc_put_hevc_uni_qpel_h16_8_lsx
    addi.d          t0,    a5,    -1
    slli.w          t0,    t0,    4       // each qpel filter row is 16 bytes
    la.local        t1,    ff_hevc_qpel_filters
    vldx            vr0,   t1,    t0      //filter abcdefgh
    vreplvei.h      vr1,   vr0,   1       //cd...
    vreplvei.h      vr2,   vr0,   2       //ef...
    vreplvei.h      vr3,   vr0,   3       //gh...
    vreplvei.h      vr0,   vr0,   0       //ab...
    addi.d          a2,    a2,    -3      //src -= 3
    addi.w          t1,    zero,  32
    vreplgr2vr.h    vr4,   t1             // rounding constant 32
    la.local        t1,    shufb
    vld             vr5,   t1,    48      // base shuffle mask
    vaddi.bu        vr6,   vr5,   2
    vaddi.bu        vr7,   vr5,   4
    vaddi.bu        vr8,   vr5,   6
.LOOP_UNI_H16:
    vld             vr9,   a2,    0
    HEVC_UNI_QPEL_H8_LSX vr9, vr14       // pixels 0..7
    vld             vr9,   a2,    8
    add.d           a2,    a2,    a3
    HEVC_UNI_QPEL_H8_LSX vr9, vr15       // pixels 8..15
    vssrani.bu.h    vr15,  vr14,  0      // clip both halves to u8
    vst             vr15,  a0,    0
    add.d           a0,    a0,    a1
    addi.d          a4,    a4,    -1
    bnez            a4,    .LOOP_UNI_H16
endfunc
|
|
|
/* put_hevc_qpel_uni_h, width 16 (LASX): one 16-wide kernel call per row. */
function ff_hevc_put_hevc_uni_qpel_h16_8_lasx
    addi.d          t0,    a5,    -1
    slli.w          t0,    t0,    4       // each qpel filter row is 16 bytes
    la.local        t1,    ff_hevc_qpel_filters
    vldx            vr0,   t1,    t0      //filter abcdefgh
    xvreplve0.q     xr0,   xr0            // copy taps into the high 128 bits
    xvrepl128vei.h  xr1,   xr0,   1       //cd...
    xvrepl128vei.h  xr2,   xr0,   2       //ef...
    xvrepl128vei.h  xr3,   xr0,   3       //gh...
    xvrepl128vei.h  xr0,   xr0,   0       //ab...
    addi.d          a2,    a2,    -3      //src -= 3
    addi.w          t1,    zero,  32
    xvreplgr2vr.h   xr4,   t1             // rounding constant 32
    la.local        t1,    shufb
    vld             vr5,   t1,    48      // base shuffle mask
    xvreplve0.q     xr5,   xr5
    xvaddi.bu       xr6,   xr5,   2
    xvaddi.bu       xr7,   xr5,   4
    xvaddi.bu       xr8,   xr5,   6
.LOOP_UNI_H16_LASX:
    xvld            xr9,   a2,    0
    add.d           a2,    a2,    a3
    xvpermi.d       xr9,   xr9,   0x94   //rearrange data
    HEVC_UNI_QPEL_H16_LASX xr9, xr14
    xvpermi.q       xr15,  xr14,  0x01   // high 8 results -> vr15
    vssrani.bu.h    vr15,  vr14,  0      // clip to u8
    vst             vr15,  a0,    0
    add.d           a0,    a0,    a1
    addi.d          a4,    a4,    -1
    bnez            a4,    .LOOP_UNI_H16_LASX
endfunc
|
|
|
/* put_hevc_qpel_uni_h, width 24: three 8-wide kernel calls per row. */
function ff_hevc_put_hevc_uni_qpel_h24_8_lsx
    addi.d          t0,    a5,    -1
    slli.w          t0,    t0,    4       // each qpel filter row is 16 bytes
    la.local        t1,    ff_hevc_qpel_filters
    vldx            vr0,   t1,    t0      //filter abcdefgh
    vreplvei.h      vr1,   vr0,   1       //cd...
    vreplvei.h      vr2,   vr0,   2       //ef...
    vreplvei.h      vr3,   vr0,   3       //gh...
    vreplvei.h      vr0,   vr0,   0       //ab...
    addi.d          a2,    a2,    -3      //src -= 3
    addi.w          t1,    zero,  32
    vreplgr2vr.h    vr4,   t1             // rounding constant 32
    la.local        t1,    shufb
    vld             vr5,   t1,    48      // base shuffle mask
    vaddi.bu        vr6,   vr5,   2
    vaddi.bu        vr7,   vr5,   4
    vaddi.bu        vr8,   vr5,   6
.LOOP_UNI_H24:
    vld             vr9,   a2,    0
    HEVC_UNI_QPEL_H8_LSX vr9, vr14       // pixels 0..7
    vld             vr9,   a2,    8
    HEVC_UNI_QPEL_H8_LSX vr9, vr15       // pixels 8..15
    vld             vr9,   a2,    16
    add.d           a2,    a2,    a3
    HEVC_UNI_QPEL_H8_LSX vr9, vr16       // pixels 16..23
    vssrani.bu.h    vr15,  vr14,  0      // clip to u8
    vssrani.bu.h    vr16,  vr16,  0
    vst             vr15,  a0,    0
    fst.d           f16,   a0,    16
    add.d           a0,    a0,    a1
    addi.d          a4,    a4,    -1
    bnez            a4,    .LOOP_UNI_H24
endfunc
|
|
|
/* put_hevc_qpel_uni_h, width 24 (LASX): one 16-wide LASX call plus one
 * 8-wide LSX call (low lanes of the xr registers) per row. */
function ff_hevc_put_hevc_uni_qpel_h24_8_lasx
    addi.d          t0,    a5,    -1
    slli.w          t0,    t0,    4       // each qpel filter row is 16 bytes
    la.local        t1,    ff_hevc_qpel_filters
    vldx            vr0,   t1,    t0      //filter abcdefgh
    xvreplve0.q     xr0,   xr0            // copy taps into the high 128 bits
    xvrepl128vei.h  xr1,   xr0,   1       //cd...
    xvrepl128vei.h  xr2,   xr0,   2       //ef...
    xvrepl128vei.h  xr3,   xr0,   3       //gh...
    xvrepl128vei.h  xr0,   xr0,   0       //ab...
    addi.d          a2,    a2,    -3      //src -= 3
    addi.w          t1,    zero,  32
    xvreplgr2vr.h   xr4,   t1             // rounding constant 32
    la.local        t1,    shufb
    vld             vr5,   t1,    48      // base shuffle mask
    xvreplve0.q     xr5,   xr5
    xvaddi.bu       xr6,   xr5,   2
    xvaddi.bu       xr7,   xr5,   4
    xvaddi.bu       xr8,   xr5,   6
.LOOP_UNI_H24_LASX:
    xvld            xr9,   a2,    0
    xvpermi.q       xr19,  xr9,   0x01   //16...23
    add.d           a2,    a2,    a3
    xvpermi.d       xr9,   xr9,   0x94   //rearrange data
    HEVC_UNI_QPEL_H16_LASX xr9, xr14     // pixels 0..15
    xvpermi.q       xr15,  xr14,  0x01
    vssrani.bu.h    vr15,  vr14,  0      // clip to u8
    vst             vr15,  a0,    0
    HEVC_UNI_QPEL_H8_LSX vr19, vr16      // pixels 16..23 (LSX on low lanes)
    vssrani.bu.h    vr16,  vr16,  0
    fst.d           f16,   a0,    16
    add.d           a0,    a0,    a1
    addi.d          a4,    a4,    -1
    bnez            a4,    .LOOP_UNI_H24_LASX
endfunc
|
|
|
/* put_hevc_qpel_uni_h, width 32: four 8-wide kernel calls per row. */
function ff_hevc_put_hevc_uni_qpel_h32_8_lsx
    addi.d          t0,    a5,    -1
    slli.w          t0,    t0,    4       // each qpel filter row is 16 bytes
    la.local        t1,    ff_hevc_qpel_filters
    vldx            vr0,   t1,    t0      //filter abcdefgh
    vreplvei.h      vr1,   vr0,   1       //cd...
    vreplvei.h      vr2,   vr0,   2       //ef...
    vreplvei.h      vr3,   vr0,   3       //gh...
    vreplvei.h      vr0,   vr0,   0       //ab...
    addi.d          a2,    a2,    -3      //src -= 3
    addi.w          t1,    zero,  32
    vreplgr2vr.h    vr4,   t1             // rounding constant 32
    la.local        t1,    shufb
    vld             vr5,   t1,    48      // base shuffle mask
    vaddi.bu        vr6,   vr5,   2
    vaddi.bu        vr7,   vr5,   4
    vaddi.bu        vr8,   vr5,   6
.LOOP_UNI_H32:
    vld             vr9,   a2,    0
    HEVC_UNI_QPEL_H8_LSX vr9, vr14       // pixels 0..7
    vld             vr9,   a2,    8
    HEVC_UNI_QPEL_H8_LSX vr9, vr15       // pixels 8..15
    vld             vr9,   a2,    16
    HEVC_UNI_QPEL_H8_LSX vr9, vr16       // pixels 16..23
    vld             vr9,   a2,    24
    add.d           a2,    a2,    a3
    HEVC_UNI_QPEL_H8_LSX vr9, vr17       // pixels 24..31
    vssrani.bu.h    vr15,  vr14,  0      // clip to u8
    vssrani.bu.h    vr17,  vr16,  0
    vst             vr15,  a0,    0
    vst             vr17,  a0,    16
    add.d           a0,    a0,    a1
    addi.d          a4,    a4,    -1
    bnez            a4,    .LOOP_UNI_H32
endfunc
|
|
|
/* put_hevc_qpel_uni_h, width 32 (LASX): two 16-wide kernel calls per row,
 * merged and stored as one 32-byte vector. */
function ff_hevc_put_hevc_uni_qpel_h32_8_lasx
    addi.d          t0,    a5,    -1
    slli.w          t0,    t0,    4       // each qpel filter row is 16 bytes
    la.local        t1,    ff_hevc_qpel_filters
    vldx            vr0,   t1,    t0      //filter abcdefgh
    xvreplve0.q     xr0,   xr0            // copy taps into the high 128 bits
    xvrepl128vei.h  xr1,   xr0,   1       //cd...
    xvrepl128vei.h  xr2,   xr0,   2       //ef...
    xvrepl128vei.h  xr3,   xr0,   3       //gh...
    xvrepl128vei.h  xr0,   xr0,   0       //ab...
    addi.d          a2,    a2,    -3      //src -= 3
    addi.w          t1,    zero,  32
    xvreplgr2vr.h   xr4,   t1             // rounding constant 32
    la.local        t1,    shufb
    vld             vr5,   t1,    48      // base shuffle mask
    xvreplve0.q     xr5,   xr5
    xvaddi.bu       xr6,   xr5,   2
    xvaddi.bu       xr7,   xr5,   4
    xvaddi.bu       xr8,   xr5,   6
.LOOP_UNI_H32_LASX:
    xvld            xr9,   a2,    0
    xvpermi.d       xr9,   xr9,   0x94   // rearrange for the 16-wide kernel
    HEVC_UNI_QPEL_H16_LASX xr9, xr14     // pixels 0..15
    xvld            xr9,   a2,    16
    xvpermi.d       xr9,   xr9,   0x94
    HEVC_UNI_QPEL_H16_LASX xr9, xr15     // pixels 16..31
    add.d           a2,    a2,    a3
    xvssrani.bu.h   xr15,  xr14,  0      // clip to u8
    xvpermi.d       xr15,  xr15,  0xd8   // re-interleave the 128-bit lanes
    xvst            xr15,  a0,    0
    add.d           a0,    a0,    a1
    addi.d          a4,    a4,    -1
    bnez            a4,    .LOOP_UNI_H32_LASX
endfunc
|
|
|
/* put_hevc_qpel_uni_h, width 48: six 8-wide kernel calls per row. */
function ff_hevc_put_hevc_uni_qpel_h48_8_lsx
    addi.d          t0,    a5,    -1
    slli.w          t0,    t0,    4       // each qpel filter row is 16 bytes
    la.local        t1,    ff_hevc_qpel_filters
    vldx            vr0,   t1,    t0      //filter abcdefgh
    vreplvei.h      vr1,   vr0,   1       //cd...
    vreplvei.h      vr2,   vr0,   2       //ef...
    vreplvei.h      vr3,   vr0,   3       //gh...
    vreplvei.h      vr0,   vr0,   0       //ab...
    addi.d          a2,    a2,    -3      //src -= 3
    addi.w          t1,    zero,  32
    vreplgr2vr.h    vr4,   t1             // rounding constant 32
    la.local        t1,    shufb
    vld             vr5,   t1,    48      // base shuffle mask
    vaddi.bu        vr6,   vr5,   2
    vaddi.bu        vr7,   vr5,   4
    vaddi.bu        vr8,   vr5,   6
.LOOP_UNI_H48:
    vld             vr9,   a2,    0
    HEVC_UNI_QPEL_H8_LSX vr9, vr14       // pixels 0..7
    vld             vr9,   a2,    8
    HEVC_UNI_QPEL_H8_LSX vr9, vr15       // pixels 8..15
    vld             vr9,   a2,    16
    HEVC_UNI_QPEL_H8_LSX vr9, vr16       // pixels 16..23
    vld             vr9,   a2,    24
    HEVC_UNI_QPEL_H8_LSX vr9, vr17       // pixels 24..31
    vld             vr9,   a2,    32
    HEVC_UNI_QPEL_H8_LSX vr9, vr18       // pixels 32..39
    vld             vr9,   a2,    40
    add.d           a2,    a2,    a3
    HEVC_UNI_QPEL_H8_LSX vr9, vr19       // pixels 40..47
    vssrani.bu.h    vr15,  vr14,  0      // clip to u8
    vssrani.bu.h    vr17,  vr16,  0
    vssrani.bu.h    vr19,  vr18,  0
    vst             vr15,  a0,    0
    vst             vr17,  a0,    16
    vst             vr19,  a0,    32
    add.d           a0,    a0,    a1
    addi.d          a4,    a4,    -1
    bnez            a4,    .LOOP_UNI_H48
endfunc
|
|
|
/* put_hevc_qpel_uni_h, width 48 (LASX): three 16-wide kernel calls per
 * row; first two merged into a 32-byte store, third stored as 16 bytes. */
function ff_hevc_put_hevc_uni_qpel_h48_8_lasx
    addi.d          t0,    a5,    -1
    slli.w          t0,    t0,    4       // each qpel filter row is 16 bytes
    la.local        t1,    ff_hevc_qpel_filters
    vldx            vr0,   t1,    t0      //filter abcdefgh
    xvreplve0.q     xr0,   xr0            // copy taps into the high 128 bits
    xvrepl128vei.h  xr1,   xr0,   1       //cd...
    xvrepl128vei.h  xr2,   xr0,   2       //ef...
    xvrepl128vei.h  xr3,   xr0,   3       //gh...
    xvrepl128vei.h  xr0,   xr0,   0       //ab...
    addi.d          a2,    a2,    -3      //src -= 3
    addi.w          t1,    zero,  32
    xvreplgr2vr.h   xr4,   t1             // rounding constant 32
    la.local        t1,    shufb
    vld             vr5,   t1,    48      // base shuffle mask
    xvreplve0.q     xr5,   xr5
    xvaddi.bu       xr6,   xr5,   2
    xvaddi.bu       xr7,   xr5,   4
    xvaddi.bu       xr8,   xr5,   6
.LOOP_UNI_H48_LASX:
    xvld            xr9,   a2,    0
    xvpermi.d       xr9,   xr9,   0x94
    HEVC_UNI_QPEL_H16_LASX xr9, xr14     // pixels 0..15
    xvld            xr9,   a2,    16
    xvpermi.d       xr9,   xr9,   0x94
    HEVC_UNI_QPEL_H16_LASX xr9, xr15     // pixels 16..31
    xvld            xr9,   a2,    32
    xvpermi.d       xr9,   xr9,   0x94
    HEVC_UNI_QPEL_H16_LASX xr9, xr16     // pixels 32..47
    add.d           a2,    a2,    a3
    xvssrani.bu.h   xr15,  xr14,  0      // clip pixels 0..31 to u8
    xvpermi.d       xr15,  xr15,  0xd8   // re-interleave the 128-bit lanes
    xvst            xr15,  a0,    0
    xvpermi.q       xr17,  xr16,  0x01
    vssrani.bu.h    vr17,  vr16,  0      // clip pixels 32..47 to u8
    vst             vr17,  a0,    32
    add.d           a0,    a0,    a1
    addi.d          a4,    a4,    -1
    bnez            a4,    .LOOP_UNI_H48_LASX
endfunc
|
|
|
/* put_hevc_qpel_uni_h, width 64 (LASX): four 16-wide kernel calls per
 * row, merged pairwise into two 32-byte stores. */
function ff_hevc_put_hevc_uni_qpel_h64_8_lasx
    addi.d          t0,    a5,    -1
    slli.w          t0,    t0,    4       // each qpel filter row is 16 bytes
    la.local        t1,    ff_hevc_qpel_filters
    vldx            vr0,   t1,    t0      //filter abcdefgh
    xvreplve0.q     xr0,   xr0            // copy taps into the high 128 bits
    xvrepl128vei.h  xr1,   xr0,   1       //cd...
    xvrepl128vei.h  xr2,   xr0,   2       //ef...
    xvrepl128vei.h  xr3,   xr0,   3       //gh...
    xvrepl128vei.h  xr0,   xr0,   0       //ab...
    addi.d          a2,    a2,    -3      //src -= 3
    addi.w          t1,    zero,  32
    xvreplgr2vr.h   xr4,   t1             // rounding constant 32
    la.local        t1,    shufb
    vld             vr5,   t1,    48      // base shuffle mask
    xvreplve0.q     xr5,   xr5
    xvaddi.bu       xr6,   xr5,   2
    xvaddi.bu       xr7,   xr5,   4
    xvaddi.bu       xr8,   xr5,   6
.LOOP_UNI_H64_LASX:
    xvld            xr9,   a2,    0
    xvpermi.d       xr9,   xr9,   0x94
    HEVC_UNI_QPEL_H16_LASX xr9, xr14     // pixels 0..15
    xvld            xr9,   a2,    16
    xvpermi.d       xr9,   xr9,   0x94
    HEVC_UNI_QPEL_H16_LASX xr9, xr15     // pixels 16..31
    xvld            xr9,   a2,    32
    xvpermi.d       xr9,   xr9,   0x94
    HEVC_UNI_QPEL_H16_LASX xr9, xr16     // pixels 32..47
    xvld            xr9,   a2,    48
    xvpermi.d       xr9,   xr9,   0x94
    HEVC_UNI_QPEL_H16_LASX xr9, xr17     // pixels 48..63
    add.d           a2,    a2,    a3
    xvssrani.bu.h   xr15,  xr14,  0      // clip pixels 0..31 to u8
    xvpermi.d       xr15,  xr15,  0xd8   // re-interleave the 128-bit lanes
    xvst            xr15,  a0,    0
    xvssrani.bu.h   xr17,  xr16,  0      // clip pixels 32..63 to u8
    xvpermi.d       xr17,  xr17,  0xd8
    xvst            xr17,  a0,    32
    add.d           a0,    a0,    a1
    addi.d          a4,    a4,    -1
    bnez            a4,    .LOOP_UNI_H64_LASX
endfunc
|
|
|
/* |
|
* void FUNC(put_hevc_epel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride, |
|
* const uint8_t *_src, ptrdiff_t _srcstride, |
|
* int height, int denom, int wx, int ox, |
|
* intptr_t mx, intptr_t my, int width) |
|
*/ |
|
/* put_hevc_epel_uni_w_v (see signature comment above), width 4:
 * vertical 4-tap EPEL with uni-directional weighting, two output rows per
 * loop iteration.  vr6 holds a byte-interleaved sliding window of the
 * four source rows each output pixel needs; new rows are inserted with
 * vextrins and the window slides with vbsrl. */
function ff_hevc_put_hevc_epel_uni_w_v4_8_lsx
    LOAD_VAR 128                          // vr1..vr4 = wx / offset / shift / ox
    ld.d            t0,    sp,    8      //my
    addi.d          t0,    t0,    -1
    slli.w          t0,    t0,    2      // each epel filter row is 4 bytes
    la.local        t1,    ff_hevc_epel_filters
    vldx            vr0,   t1,    t0     //filter
    slli.d          t0,    a3,    1      //stride * 2
    add.d           t1,    t0,    a3     //stride * 3
    sub.d           a2,    a2,    a3     //src -= stride
    fld.s           f6,    a2,    0      //0
    fldx.s          f7,    a2,    a3     //1
    fldx.s          f8,    a2,    t0     //2
    add.d           a2,    a2,    t1
    vilvl.b         vr6,   vr7,   vr6    // interleave rows 0/1
    vilvl.b         vr7,   vr8,   vr8    // duplicate row 2 (placeholder pairs)
    vilvl.h         vr6,   vr7,   vr6    // vr6 = per-pixel column quads
    vreplvei.w      vr0,   vr0,   0      // broadcast the 4 vertical taps
.LOOP_UNI_V4:
    fld.s           f9,    a2,    0      //3
    fldx.s          f10,   a2,    a3     //4
    add.d           a2,    a2,    t0     // src += 2 * stride
    vextrins.b      vr6,   vr9,   0x30   //insert the 3rd load
    vextrins.b      vr6,   vr9,   0x71
    vextrins.b      vr6,   vr9,   0xb2
    vextrins.b      vr6,   vr9,   0xf3
    vbsrl.v         vr7,   vr6,   1      // slide window down one row
    vextrins.b      vr7,   vr10,  0x30   //insert the 4th load
    vextrins.b      vr7,   vr10,  0x71
    vextrins.b      vr7,   vr10,  0xb2
    vextrins.b      vr7,   vr10,  0xf3
    vdp2.h.bu.b     vr8,   vr6,   vr0    //EPEL_FILTER(src, stride)
    vdp2.h.bu.b     vr9,   vr7,   vr0
    vhaddw.w.h      vr10,  vr8,   vr8    // finish the 4-tap sums as i32
    vhaddw.w.h      vr11,  vr9,   vr9
    vmulwev.w.h     vr10,  vr10,  vr1    //EPEL_FILTER(src, stride) * wx
    vmulwev.w.h     vr11,  vr11,  vr1
    vadd.w          vr10,  vr10,  vr2    // + offset
    vadd.w          vr11,  vr11,  vr2
    vsra.w          vr10,  vr10,  vr3    // >> shift
    vsra.w          vr11,  vr11,  vr3
    vadd.w          vr10,  vr10,  vr4    // + ox
    vadd.w          vr11,  vr11,  vr4
    vssrani.h.w     vr11,  vr10,  0      // narrow i32 -> i16
    vssrani.bu.h    vr10,  vr11,  0      // clip to u8
    vbsrl.v         vr6,   vr7,   1      // slide window for next iteration
    fst.s           f10,   a0,    0      // store row 0
    vbsrl.v         vr10,  vr10,  4
    fstx.s          f10,   a0,    a1     // store row 1
    alsl.d          a0,    a1,    a0,  1 // dst += 2 * dststride
    addi.d          a4,    a4,    -2
    bnez            a4,    .LOOP_UNI_V4
endfunc
|
|
|
// Weighted vertical 4-tap EPEL arithmetic for 8 pixels (LSX).
// In : vr10/vr11 = byte-interleaved row pairs (rows 0/1 and 2/3),
//      vr0/vr5  = tap pairs, vr1..vr4 = wx / offset / shift / ox.
// Out: \out0 = low 4 results (i32), \out1 = high 4 results (i32).
// Clobbers vr12, vr13.
.macro CALC_EPEL_FILTER_LSX out0, out1
    vdp2.h.bu.b     vr12,  vr10,  vr0    //EPEL_FILTER(src, stride)
    vdp2add.h.bu.b  vr12,  vr11,  vr5
    vexth.w.h       vr13,  vr12          // widen high half i16 -> i32
    vsllwil.w.h     vr12,  vr12,  0      // widen low half
    vmulwev.w.h     vr12,  vr12,  vr1    //EPEL_FILTER(src, stride) * wx
    vmulwev.w.h     vr13,  vr13,  vr1    //EPEL_FILTER(src, stride) * wx
    vadd.w          vr12,  vr12,  vr2    // + offset
    vadd.w          vr13,  vr13,  vr2
    vsra.w          vr12,  vr12,  vr3    // >> shift
    vsra.w          vr13,  vr13,  vr3
    vadd.w          \out0, vr12,  vr4    // + ox
    vadd.w          \out1, vr13,  vr4
.endm
|
|
|
// Weighted vertical 4-tap EPEL arithmetic for 8 pixels (LASX).
// In : xr12 = interleaved source window, xr0 = taps,
//      xr1..xr4 = wx / offset / shift / ox.
// Out: \out0 = 8 results (i32).  Clobbers xr11, xr12.
.macro CALC_EPEL_FILTER_LASX out0
    xvdp2.h.bu.b    xr11,  xr12,  xr0    //EPEL_FILTER(src, stride)
    xvhaddw.w.h     xr12,  xr11,  xr11   // finish the 4-tap sums as i32
    xvmulwev.w.h    xr12,  xr12,  xr1    //EPEL_FILTER(src, stride) * wx
    xvadd.w         xr12,  xr12,  xr2    // + offset
    xvsra.w         xr12,  xr12,  xr3    // >> shift
    xvadd.w         \out0, xr12,  xr4    // + ox
.endm
|
|
|
//w is a label, also can be used as a condition for ".if" statement.
// Vertical weighted 4-tap EPEL loop for up to 8 columns (LSX).
// Expects: a2=src (already src -= stride), t0 = 2*stride, t1 = 3*stride,
//          vr0/vr5 = tap pairs, vr1..vr4 = wx/offset/shift/ox, a4 = height.
// Keeps the last three source rows in vr6..vr8 between iterations.
.macro PUT_HEVC_EPEL_UNI_W_V8_LSX w
    fld.d           f6,    a2,    0      //0
    fldx.d          f7,    a2,    a3     //1
    fldx.d          f8,    a2,    t0     //2
    add.d           a2,    a2,    t1
.LOOP_UNI_V8_\w:
    fld.d           f9,    a2,    0      // 3
    add.d           a2,    a2,    a3
    vilvl.b         vr10,  vr7,   vr6    // interleave rows 0/1
    vilvl.b         vr11,  vr9,   vr8    // interleave rows 2/3
    vaddi.bu        vr6,   vr7,   0      //back up previous value
    vaddi.bu        vr7,   vr8,   0
    vaddi.bu        vr8,   vr9,   0
    CALC_EPEL_FILTER_LSX vr12, vr13
    vssrani.h.w     vr13,  vr12,  0      // narrow i32 -> i16
    vssrani.bu.h    vr13,  vr13,  0      // clip to u8
.if \w < 8
    fst.s           f13,   a0,    0      // 4 bytes + 1 halfword for w=6
    vstelm.h        vr13,  a0,    4,  2
.else
    fst.d           f13,   a0,    0
.endif
    add.d           a0,    a0,    a1
    addi.d          a4,    a4,    -1
    bnez            a4,    .LOOP_UNI_V8_\w
.endm
|
|
|
//w is a label, also can be used as a condition for ".if" statement.
// Vertical weighted 4-tap EPEL loop for up to 8 columns (LASX).
// Same contract as the LSX variant but all 8 results are computed in one
// 256-bit CALC_EPEL_FILTER_LASX call; xr0 holds all four taps.
.macro PUT_HEVC_EPEL_UNI_W_V8_LASX w
    fld.d           f6,    a2,    0      //0
    fldx.d          f7,    a2,    a3     //1
    fldx.d          f8,    a2,    t0     //2
    add.d           a2,    a2,    t1
.LOOP_UNI_V8_LASX_\w:
    fld.d           f9,    a2,    0      // 3
    add.d           a2,    a2,    a3
    vilvl.b         vr10,  vr7,   vr6    // interleave rows 0/1
    vilvl.b         vr11,  vr9,   vr8    // interleave rows 2/3
    xvilvl.h        xr12,  xr11,  xr10   // build per-pixel column quads
    xvilvh.h        xr13,  xr11,  xr10
    xvpermi.q       xr12,  xr13,  0x02
    vaddi.bu        vr6,   vr7,   0      //back up previous value
    vaddi.bu        vr7,   vr8,   0
    vaddi.bu        vr8,   vr9,   0
    CALC_EPEL_FILTER_LASX xr12
    xvpermi.q       xr13,  xr12,  0x01   // high 4 results -> vr13
    vssrani.h.w     vr13,  vr12,  0      // narrow i32 -> i16
    vssrani.bu.h    vr13,  vr13,  0      // clip to u8
.if \w < 8
    fst.s           f13,   a0,    0      // 4 bytes + 1 halfword for w=6
    vstelm.h        vr13,  a0,    4,  2
.else
    fst.d           f13,   a0,    0
.endif
    add.d           a0,    a0,    a1
    addi.d          a4,    a4,    -1
    bnez            a4,    .LOOP_UNI_V8_LASX_\w
.endm
|
|
|
/* put_hevc_epel_uni_w_v, width 6 (LSX): 8-wide vertical kernel, partial
 * store selected by the \w < 8 branch inside the macro. */
function ff_hevc_put_hevc_epel_uni_w_v6_8_lsx
    LOAD_VAR 128                          // vr1..vr4 = wx / offset / shift / ox
    ld.d            t0,    sp,    8      //my
    addi.d          t0,    t0,    -1
    slli.w          t0,    t0,    2      // each epel filter row is 4 bytes
    la.local        t1,    ff_hevc_epel_filters
    vldx            vr0,   t1,    t0     //filter
    slli.d          t0,    a3,    1      //stride * 2
    add.d           t1,    t0,    a3     //stride * 3
    sub.d           a2,    a2,    a3     //src -= stride
    vreplvei.h      vr5,   vr0,   1      // tap pair cd
    vreplvei.h      vr0,   vr0,   0      // tap pair ab
    PUT_HEVC_EPEL_UNI_W_V8_LSX 6
endfunc
|
|
|
/* put_hevc_epel_uni_w_v, width 6 (LASX): 8-wide vertical LASX kernel with
 * all four taps broadcast in xr0. */
function ff_hevc_put_hevc_epel_uni_w_v6_8_lasx
    LOAD_VAR 256                          // xr1..xr4 = wx / offset / shift / ox
    ld.d            t0,    sp,    8      //my
    addi.d          t0,    t0,    -1
    slli.w          t0,    t0,    2      // each epel filter row is 4 bytes
    la.local        t1,    ff_hevc_epel_filters
    vldx            vr0,   t1,    t0     //filter
    xvreplve0.w     xr0,   xr0           // broadcast all 4 taps to each lane
    slli.d          t0,    a3,    1      //stride * 2
    add.d           t1,    t0,    a3     //stride * 3
    sub.d           a2,    a2,    a3     //src -= stride
    PUT_HEVC_EPEL_UNI_W_V8_LASX 6
endfunc
|
|
|
/* put_hevc_epel_uni_w_v, width 8 (LSX): full 8-byte store per row. */
function ff_hevc_put_hevc_epel_uni_w_v8_8_lsx
    LOAD_VAR 128                          // vr1..vr4 = wx / offset / shift / ox
    ld.d            t0,    sp,    8      //my
    addi.d          t0,    t0,    -1
    slli.w          t0,    t0,    2      // each epel filter row is 4 bytes
    la.local        t1,    ff_hevc_epel_filters
    vldx            vr0,   t1,    t0     //filter
    slli.d          t0,    a3,    1      //stride * 2
    add.d           t1,    t0,    a3     //stride * 3
    sub.d           a2,    a2,    a3     //src -= stride
    vreplvei.h      vr5,   vr0,   1      // tap pair cd
    vreplvei.h      vr0,   vr0,   0      // tap pair ab
    PUT_HEVC_EPEL_UNI_W_V8_LSX 8
endfunc
|
|
|
/* put_hevc_epel_uni_w_v, width 8 (LASX): full 8-byte store per row. */
function ff_hevc_put_hevc_epel_uni_w_v8_8_lasx
    LOAD_VAR 256                          // xr1..xr4 = wx / offset / shift / ox
    ld.d            t0,    sp,    8      //my
    addi.d          t0,    t0,    -1
    slli.w          t0,    t0,    2      // each epel filter row is 4 bytes
    la.local        t1,    ff_hevc_epel_filters
    vldx            vr0,   t1,    t0     //filter
    xvreplve0.w     xr0,   xr0           // broadcast all 4 taps to each lane
    slli.d          t0,    a3,    1      //stride * 2
    add.d           t1,    t0,    a3     //stride * 3
    sub.d           a2,    a2,    a3     //src -= stride
    PUT_HEVC_EPEL_UNI_W_V8_LASX 8
endfunc
|
|
|
//w is a label, also can be used as a condition for ".if" statement.
// Vertical weighted 4-tap EPEL loop for up to 16 columns (LSX): low and
// high byte halves each go through CALC_EPEL_FILTER_LSX.
// Same register contract as PUT_HEVC_EPEL_UNI_W_V8_LSX.
.macro PUT_HEVC_EPEL_UNI_W_V16_LSX w
    vld             vr6,   a2,    0      //0
    vldx            vr7,   a2,    a3     //1
    vldx            vr8,   a2,    t0     //2
    add.d           a2,    a2,    t1
.LOOP_UNI_V16_\w:
    vld             vr9,   a2,    0      //3
    add.d           a2,    a2,    a3
    vilvl.b         vr10,  vr7,   vr6    // low 8 columns, rows 0/1
    vilvl.b         vr11,  vr9,   vr8    // low 8 columns, rows 2/3
    CALC_EPEL_FILTER_LSX vr14, vr15
    vilvh.b         vr10,  vr7,   vr6    // high 8 columns
    vilvh.b         vr11,  vr9,   vr8
    CALC_EPEL_FILTER_LSX vr16, vr17
    vssrani.h.w     vr15,  vr14,  0      // narrow i32 -> i16
    vssrani.h.w     vr17,  vr16,  0
    vssrani.bu.h    vr17,  vr15,  0      // clip all 16 to u8
    vaddi.bu        vr6,   vr7,   0      //back up previous value
    vaddi.bu        vr7,   vr8,   0
    vaddi.bu        vr8,   vr9,   0
.if \w < 16
    fst.d           f17,   a0,    0      // 8 bytes + 1 word for w=12
    vstelm.w        vr17,  a0,    8,  2
.else
    vst             vr17,  a0,    0
.endif
    add.d           a0,    a0,    a1
    addi.d          a4,    a4,    -1
    bnez            a4,    .LOOP_UNI_V16_\w
.endm
|
|
|
//w is a label, also can be used as a condition for ".if" statement.
// Vertical weighted 4-tap EPEL loop for up to 16 columns (LASX).
// Expects xr0/xr5 = tap pairs (per 128-bit lane), xr1..xr4 params;
// t0 = 2*stride, t1 = 3*stride.  Inlines the weighting arithmetic.
.macro PUT_HEVC_EPEL_UNI_W_V16_LASX w
    vld             vr6,   a2,    0      //0
    vldx            vr7,   a2,    a3     //1
    vldx            vr8,   a2,    t0     //2
    add.d           a2,    a2,    t1
.LOOP_UNI_V16_LASX_\w:
    vld             vr9,   a2,    0      //3
    add.d           a2,    a2,    a3
    xvilvl.b        xr10,  xr7,   xr6    // interleave rows 0/1 (low/high)
    xvilvh.b        xr11,  xr7,   xr6
    xvpermi.q       xr11,  xr10,  0x20
    xvilvl.b        xr12,  xr9,   xr8    // interleave rows 2/3
    xvilvh.b        xr13,  xr9,   xr8
    xvpermi.q       xr13,  xr12,  0x20
    xvdp2.h.bu.b    xr10,  xr11,  xr0    //EPEL_FILTER(src, stride)
    xvdp2add.h.bu.b xr10,  xr13,  xr5
    xvexth.w.h      xr11,  xr10          // widen high half i16 -> i32
    xvsllwil.w.h    xr10,  xr10,  0      // widen low half
    xvmulwev.w.h    xr10,  xr10,  xr1    //EPEL_FILTER(src, stride) * wx
    xvmulwev.w.h    xr11,  xr11,  xr1
    xvadd.w         xr10,  xr10,  xr2    // + offset
    xvadd.w         xr11,  xr11,  xr2
    xvsra.w         xr10,  xr10,  xr3    // >> shift
    xvsra.w         xr11,  xr11,  xr3
    xvadd.w         xr10,  xr10,  xr4    // + ox
    xvadd.w         xr11,  xr11,  xr4
    xvssrani.h.w    xr11,  xr10,  0      // narrow i32 -> i16
    xvpermi.q       xr10,  xr11,  0x01
    vssrani.bu.h    vr10,  vr11,  0      // clip to u8
    vaddi.bu        vr6,   vr7,   0      //back up previous value
    vaddi.bu        vr7,   vr8,   0
    vaddi.bu        vr8,   vr9,   0
.if \w < 16
    fst.d           f10,   a0,    0      // 8 bytes + 1 word for w=12
    vstelm.w        vr10,  a0,    8,  2
.else
    vst             vr10,  a0,    0
.endif
    add.d           a0,    a0,    a1
    addi.d          a4,    a4,    -1
    bnez            a4,    .LOOP_UNI_V16_LASX_\w
.endm
|
|
|
/* put_hevc_epel_uni_w_v, width 12 (LSX): 16-wide vertical kernel with the
 * partial 12-byte store branch. */
function ff_hevc_put_hevc_epel_uni_w_v12_8_lsx
    LOAD_VAR 128                          // vr1..vr4 = wx / offset / shift / ox
    ld.d            t0,    sp,    8      //my
    addi.d          t0,    t0,    -1
    slli.w          t0,    t0,    2      // each epel filter row is 4 bytes
    la.local        t1,    ff_hevc_epel_filters
    vldx            vr0,   t1,    t0     //filter
    slli.d          t0,    a3,    1      //stride * 2
    add.d           t1,    t0,    a3     //stride * 3
    sub.d           a2,    a2,    a3     //src -= stride
    vreplvei.h      vr5,   vr0,   1      // tap pair cd
    vreplvei.h      vr0,   vr0,   0      // tap pair ab
    PUT_HEVC_EPEL_UNI_W_V16_LSX 12
endfunc
|
|
|
/* put_hevc_epel_uni_w_v, width 12 (LASX): 16-wide vertical kernel with
 * the partial 12-byte store branch. */
function ff_hevc_put_hevc_epel_uni_w_v12_8_lasx
    LOAD_VAR 256                          // xr1..xr4 = wx / offset / shift / ox
    ld.d            t0,    sp,    8      //my
    addi.d          t0,    t0,    -1
    slli.w          t0,    t0,    2      // each epel filter row is 4 bytes
    la.local        t1,    ff_hevc_epel_filters
    vldx            vr0,   t1,    t0     //filter
    xvreplve0.q     xr0,   xr0           // copy taps into the high 128 bits
    slli.d          t0,    a3,    1      //stride * 2
    add.d           t1,    t0,    a3     //stride * 3
    sub.d           a2,    a2,    a3     //src -= stride
    xvrepl128vei.h  xr5,   xr0,   1      // tap pair cd
    xvrepl128vei.h  xr0,   xr0,   0      // tap pair ab
    PUT_HEVC_EPEL_UNI_W_V16_LASX 12
endfunc
|
|
|
/* put_hevc_epel_uni_w_v, width 16 (LSX): full 16-byte store per row. */
function ff_hevc_put_hevc_epel_uni_w_v16_8_lsx
    LOAD_VAR 128                          // vr1..vr4 = wx / offset / shift / ox
    ld.d            t0,    sp,    8      //my
    addi.d          t0,    t0,    -1
    slli.w          t0,    t0,    2      // each epel filter row is 4 bytes
    la.local        t1,    ff_hevc_epel_filters
    vldx            vr0,   t1,    t0     //filter
    slli.d          t0,    a3,    1      //stride * 2
    add.d           t1,    t0,    a3     //stride * 3
    sub.d           a2,    a2,    a3     //src -= stride
    vreplvei.h      vr5,   vr0,   1      // tap pair cd
    vreplvei.h      vr0,   vr0,   0      // tap pair ab
    PUT_HEVC_EPEL_UNI_W_V16_LSX 16
endfunc
|
|
|
/* put_hevc_epel_uni_w_v, width 16 (LASX): full 16-byte store per row. */
function ff_hevc_put_hevc_epel_uni_w_v16_8_lasx
    LOAD_VAR 256                          // xr1..xr4 = wx / offset / shift / ox
    ld.d            t0,    sp,    8      //my
    addi.d          t0,    t0,    -1
    slli.w          t0,    t0,    2      // each epel filter row is 4 bytes
    la.local        t1,    ff_hevc_epel_filters
    vldx            vr0,   t1,    t0     //filter
    xvreplve0.q     xr0,   xr0           // copy taps into the high 128 bits
    slli.d          t0,    a3,    1      //stride * 2
    add.d           t1,    t0,    a3     //stride * 3
    sub.d           a2,    a2,    a3     //src -= stride
    xvrepl128vei.h  xr5,   xr0,   1      // tap pair cd
    xvrepl128vei.h  xr0,   xr0,   0      // tap pair ab
    PUT_HEVC_EPEL_UNI_W_V16_LASX 16
endfunc
|
|
|
/* Uni-directional weighted 4-tap vertical EPEL, 24 pixels wide, 8-bit, LSX.
 * Processed as one 16-wide column pass plus one 8-wide column pass. */
function ff_hevc_put_hevc_epel_uni_w_v24_8_lsx
    LOAD_VAR 128                          // vr1=wx, vr2=rounding offset, vr3=shift, vr4=ox
    ld.d t0, sp, 8 //my
    addi.d t0, t0, -1                     // filter table index = my - 1
    slli.w t0, t0, 2                      // 4 bytes per 4-tap filter entry
    la.local t1, ff_hevc_epel_filters
    vldx vr0, t1, t0 //filter
    slli.d t0, a3, 1 //stride * 2
    add.d t1, t0, a3 //stride * 3
    sub.d a2, a2, a3 //src -= stride
    vreplvei.h vr5, vr0, 1                // taps {2,3}
    vreplvei.h vr0, vr0, 0                // taps {0,1}
    addi.d t2, a0, 0 //save init          (dst, src, height for the second pass)
    addi.d t3, a2, 0
    addi.d t4, a4, 0
    PUT_HEVC_EPEL_UNI_W_V16_LSX 24        // columns 0..15
    addi.d a0, t2, 16 //increase step     (advance to columns 16..23)
    addi.d a2, t3, 16
    addi.d a4, t4, 0
    PUT_HEVC_EPEL_UNI_W_V8_LSX 24         // columns 16..23
endfunc
|
|
|
/* Uni-directional weighted 4-tap vertical EPEL, 24 pixels wide, 8-bit, LASX.
 * One 16-wide pass plus one 8-wide pass over the same rows. */
function ff_hevc_put_hevc_epel_uni_w_v24_8_lasx
    LOAD_VAR 256                          // xr1=wx, xr2=rounding offset, xr3=shift, xr4=ox
    ld.d t0, sp, 8 //my
    addi.d t0, t0, -1                     // filter table index = my - 1
    slli.w t0, t0, 2                      // 4 bytes per 4-tap filter entry
    la.local t1, ff_hevc_epel_filters
    vldx vr0, t1, t0 //filter
    xvreplve0.w xr20, xr0 //save xr0      (word-splatted filter kept for the V8 pass)
    xvreplve0.q xr0, xr0
    slli.d t0, a3, 1 //stride * 2
    add.d t1, t0, a3 //stride * 3
    sub.d a2, a2, a3 //src -= stride
    xvrepl128vei.h xr5, xr0, 1            // taps {2,3}
    xvrepl128vei.h xr0, xr0, 0            // taps {0,1}
    addi.d t2, a0, 0 //save init          (dst, src, height for the second pass)
    addi.d t3, a2, 0
    addi.d t4, a4, 0
    PUT_HEVC_EPEL_UNI_W_V16_LASX 24       // columns 0..15 (clobbers xr0)
    addi.d a0, t2, 16 //increase step
    addi.d a2, t3, 16
    addi.d a4, t4, 0
    xvaddi.bu xr0, xr20, 0                // restore filter layout the V8 macro expects
    PUT_HEVC_EPEL_UNI_W_V8_LASX 24        // columns 16..23
endfunc
|
|
|
/* Uni-directional weighted 4-tap vertical EPEL, 32 pixels wide, 8-bit, LSX.
 * Two 16-wide passes; the second macro argument (33) only needs to be >=16 and
 * distinct from the first so the macro's internal loop labels stay unique. */
function ff_hevc_put_hevc_epel_uni_w_v32_8_lsx
    LOAD_VAR 128                          // vr1=wx, vr2=rounding offset, vr3=shift, vr4=ox
    ld.d t0, sp, 8 //my
    addi.d t0, t0, -1                     // filter table index = my - 1
    slli.w t0, t0, 2                      // 4 bytes per 4-tap filter entry
    la.local t1, ff_hevc_epel_filters
    vldx vr0, t1, t0 //filter
    slli.d t0, a3, 1 //stride * 2
    add.d t1, t0, a3 //stride * 3
    sub.d a2, a2, a3 //src -= stride
    vreplvei.h vr5, vr0, 1                // taps {2,3}
    vreplvei.h vr0, vr0, 0                // taps {0,1}
    addi.d t2, a0, 0                      // save dst/src/height for the second pass
    addi.d t3, a2, 0
    addi.d t4, a4, 0
    PUT_HEVC_EPEL_UNI_W_V16_LSX 32        // columns 0..15
    addi.d a0, t2, 16
    addi.d a2, t3, 16
    addi.d a4, t4, 0
    PUT_HEVC_EPEL_UNI_W_V16_LSX 33        // columns 16..31
endfunc
|
|
|
/* Uni-directional weighted 4-tap vertical EPEL, 32 pixels wide, 8-bit, LASX.
 * Two 16-wide passes; argument 33 is only a unique-label suffix (any value >=16). */
function ff_hevc_put_hevc_epel_uni_w_v32_8_lasx
    LOAD_VAR 256                          // xr1=wx, xr2=rounding offset, xr3=shift, xr4=ox
    ld.d t0, sp, 8 //my
    addi.d t0, t0, -1                     // filter table index = my - 1
    slli.w t0, t0, 2                      // 4 bytes per 4-tap filter entry
    la.local t1, ff_hevc_epel_filters
    vldx vr0, t1, t0 //filter
    xvreplve0.q xr0, xr0                  // copy the filter into both 128-bit lanes
    slli.d t0, a3, 1 //stride * 2
    add.d t1, t0, a3 //stride * 3
    sub.d a2, a2, a3 //src -= stride
    xvrepl128vei.h xr5, xr0, 1            // taps {2,3}
    xvrepl128vei.h xr0, xr0, 0            // taps {0,1}
    addi.d t2, a0, 0                      // save dst/src/height for the second pass
    addi.d t3, a2, 0
    addi.d t4, a4, 0
    PUT_HEVC_EPEL_UNI_W_V16_LASX 32       // columns 0..15
    addi.d a0, t2, 16
    addi.d a2, t3, 16
    addi.d a4, t4, 0
    PUT_HEVC_EPEL_UNI_W_V16_LASX 33       // columns 16..31
endfunc
|
|
|
/* Uni-directional weighted 4-tap vertical EPEL, 48 pixels wide, 8-bit, LSX.
 * Three 16-wide passes; arguments 48/49/50 keep macro-internal labels unique. */
function ff_hevc_put_hevc_epel_uni_w_v48_8_lsx
    LOAD_VAR 128                          // vr1=wx, vr2=rounding offset, vr3=shift, vr4=ox
    ld.d t0, sp, 8 //my
    addi.d t0, t0, -1                     // filter table index = my - 1
    slli.w t0, t0, 2                      // 4 bytes per 4-tap filter entry
    la.local t1, ff_hevc_epel_filters
    vldx vr0, t1, t0 //filter
    slli.d t0, a3, 1 //stride * 2
    add.d t1, t0, a3 //stride * 3
    sub.d a2, a2, a3 //src -= stride
    vreplvei.h vr5, vr0, 1                // taps {2,3}
    vreplvei.h vr0, vr0, 0                // taps {0,1}
    addi.d t2, a0, 0                      // save dst/src/height for later passes
    addi.d t3, a2, 0
    addi.d t4, a4, 0
    PUT_HEVC_EPEL_UNI_W_V16_LSX 48        // columns 0..15
    addi.d a0, t2, 16
    addi.d a2, t3, 16
    addi.d a4, t4, 0
    PUT_HEVC_EPEL_UNI_W_V16_LSX 49        // columns 16..31
    addi.d a0, t2, 32
    addi.d a2, t3, 32
    addi.d a4, t4, 0
    PUT_HEVC_EPEL_UNI_W_V16_LSX 50        // columns 32..47
endfunc
|
|
|
/* Uni-directional weighted 4-tap vertical EPEL, 48 pixels wide, 8-bit, LASX.
 * Three 16-wide passes; arguments 48/49/50 keep macro-internal labels unique. */
function ff_hevc_put_hevc_epel_uni_w_v48_8_lasx
    LOAD_VAR 256                          // xr1=wx, xr2=rounding offset, xr3=shift, xr4=ox
    ld.d t0, sp, 8 //my
    addi.d t0, t0, -1                     // filter table index = my - 1
    slli.w t0, t0, 2                      // 4 bytes per 4-tap filter entry
    la.local t1, ff_hevc_epel_filters
    vldx vr0, t1, t0 //filter
    xvreplve0.q xr0, xr0                  // copy the filter into both 128-bit lanes
    slli.d t0, a3, 1 //stride * 2
    add.d t1, t0, a3 //stride * 3
    sub.d a2, a2, a3 //src -= stride
    xvrepl128vei.h xr5, xr0, 1            // taps {2,3}
    xvrepl128vei.h xr0, xr0, 0            // taps {0,1}
    addi.d t2, a0, 0                      // save dst/src/height for later passes
    addi.d t3, a2, 0
    addi.d t4, a4, 0
    PUT_HEVC_EPEL_UNI_W_V16_LASX 48       // columns 0..15
    addi.d a0, t2, 16
    addi.d a2, t3, 16
    addi.d a4, t4, 0
    PUT_HEVC_EPEL_UNI_W_V16_LASX 49       // columns 16..31
    addi.d a0, t2, 32
    addi.d a2, t3, 32
    addi.d a4, t4, 0
    PUT_HEVC_EPEL_UNI_W_V16_LASX 50       // columns 32..47
endfunc
|
|
|
/* Uni-directional weighted 4-tap vertical EPEL, 64 pixels wide, 8-bit, LSX.
 * Four 16-wide passes; arguments 64..67 keep macro-internal labels unique. */
function ff_hevc_put_hevc_epel_uni_w_v64_8_lsx
    LOAD_VAR 128                          // vr1=wx, vr2=rounding offset, vr3=shift, vr4=ox
    ld.d t0, sp, 8 //my
    addi.d t0, t0, -1                     // filter table index = my - 1
    slli.w t0, t0, 2                      // 4 bytes per 4-tap filter entry
    la.local t1, ff_hevc_epel_filters
    vldx vr0, t1, t0 //filter
    slli.d t0, a3, 1 //stride * 2
    add.d t1, t0, a3 //stride * 3
    sub.d a2, a2, a3 //src -= stride
    vreplvei.h vr5, vr0, 1                // taps {2,3}
    vreplvei.h vr0, vr0, 0                // taps {0,1}
    addi.d t2, a0, 0                      // save dst/src/height for later passes
    addi.d t3, a2, 0
    addi.d t4, a4, 0
    PUT_HEVC_EPEL_UNI_W_V16_LSX 64        // columns 0..15
    addi.d a0, t2, 16
    addi.d a2, t3, 16
    addi.d a4, t4, 0
    PUT_HEVC_EPEL_UNI_W_V16_LSX 65        // columns 16..31
    addi.d a0, t2, 32
    addi.d a2, t3, 32
    addi.d a4, t4, 0
    PUT_HEVC_EPEL_UNI_W_V16_LSX 66        // columns 32..47
    addi.d a0, t2, 48
    addi.d a2, t3, 48
    addi.d a4, t4, 0
    PUT_HEVC_EPEL_UNI_W_V16_LSX 67        // columns 48..63
endfunc
|
|
|
/* Uni-directional weighted 4-tap vertical EPEL, 64 pixels wide, 8-bit, LASX.
 * Four 16-wide passes; arguments 64..67 keep macro-internal labels unique. */
function ff_hevc_put_hevc_epel_uni_w_v64_8_lasx
    LOAD_VAR 256                          // xr1=wx, xr2=rounding offset, xr3=shift, xr4=ox
    ld.d t0, sp, 8 //my
    addi.d t0, t0, -1                     // filter table index = my - 1
    slli.w t0, t0, 2                      // 4 bytes per 4-tap filter entry
    la.local t1, ff_hevc_epel_filters
    vldx vr0, t1, t0 //filter
    xvreplve0.q xr0, xr0                  // copy the filter into both 128-bit lanes
    slli.d t0, a3, 1 //stride * 2
    add.d t1, t0, a3 //stride * 3
    sub.d a2, a2, a3 //src -= stride
    xvrepl128vei.h xr5, xr0, 1            // taps {2,3}
    xvrepl128vei.h xr0, xr0, 0            // taps {0,1}
    addi.d t2, a0, 0                      // save dst/src/height for later passes
    addi.d t3, a2, 0
    addi.d t4, a4, 0
    PUT_HEVC_EPEL_UNI_W_V16_LASX 64       // columns 0..15
    addi.d a0, t2, 16
    addi.d a2, t3, 16
    addi.d a4, t4, 0
    PUT_HEVC_EPEL_UNI_W_V16_LASX 65       // columns 16..31
    addi.d a0, t2, 32
    addi.d a2, t3, 32
    addi.d a4, t4, 0
    PUT_HEVC_EPEL_UNI_W_V16_LASX 66       // columns 32..47
    addi.d a0, t2, 48
    addi.d a2, t3, 48
    addi.d a4, t4, 0
    PUT_HEVC_EPEL_UNI_W_V16_LASX 67       // columns 48..63
endfunc
|
|
|
/* |
|
* void FUNC(put_hevc_epel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, |
|
* const uint8_t *_src, ptrdiff_t _srcstride, |
|
* int height, int denom, int wx, int ox, |
|
* intptr_t mx, intptr_t my, int width) |
|
*/ |
|
/* Uni-directional weighted 4-tap horizontal EPEL, 4 pixels wide, 8-bit, LSX.
 * a0=dst, a1=dst stride, a2=src, a3=src stride, a4=height,
 * a5=denom, a6=wx, a7=ox; mx (horizontal filter phase) at sp+0.
 * Per pixel: dot-product of 4 neighbours with the tap vector, then
 * (v * wx + offset) >> shift + ox, saturated and narrowed to u8. */
function ff_hevc_put_hevc_epel_uni_w_h4_8_lsx
    LOAD_VAR 128                          // vr1=wx, vr2=rounding offset, vr3=shift, vr4=ox
    ld.d t0, sp, 0 //mx
    addi.d t0, t0, -1                     // filter table index = mx - 1
    slli.w t0, t0, 2                      // 4 bytes per 4-tap filter entry
    la.local t1, ff_hevc_epel_filters
    vldx vr0, t1, t0 //filter
    vreplvei.w vr0, vr0, 0                // all 4 taps splatted per 32-bit lane
    la.local t1, shufb
    vld vr5, t1, 0                        // mask: four overlapping 4-byte windows
    slli.d t0, a3, 1 //stride * 2         (t0/t1 not used by this one-row loop)
    add.d t1, t0, a3 //stride * 3
    addi.d a2, a2, -1 //src -= 1          (filter window starts one pixel left)
.LOOP_UNI_W_H4:
    fld.d f6, a2, 0                       // 8 source bytes cover all 4 windows
    add.d a2, a2, a3
    vshuf.b vr6, vr6, vr6, vr5            // gather the 4 windows
    vdp2.h.bu.b vr7, vr6, vr0             // u8 * s8 tap products, pairwise to h
    vhaddw.w.h vr7, vr7, vr7              // finish the 4-tap sums as 32-bit
    vmulwev.w.h vr7, vr7, vr1             // * wx
    vadd.w vr7, vr7, vr2                  // + offset
    vsra.w vr7, vr7, vr3                  // >> shift
    vadd.w vr7, vr7, vr4                  // + ox
    vssrani.h.w vr7, vr7, 0               // saturate w -> h
    vssrani.bu.h vr7, vr7, 0              // saturate h -> u8
    fst.s f7, a0, 0                       // store 4 pixels
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .LOOP_UNI_W_H4
endfunc
|
|
|
/* Uni-directional weighted 4-tap horizontal EPEL, 6 pixels wide, 8-bit, LSX. */
function ff_hevc_put_hevc_epel_uni_w_h6_8_lsx
    LOAD_VAR 128                          // vr1=wx, vr2=rounding offset, vr3=shift, vr4=ox
    ld.d t0, sp, 0 //mx
    addi.d t0, t0, -1                     // filter table index = mx - 1
    slli.w t0, t0, 2                      // 4 bytes per 4-tap filter entry
    la.local t1, ff_hevc_epel_filters
    vldx vr0, t1, t0 //filter
    vreplvei.w vr0, vr0, 0
    la.local t1, shufb
    vld vr6, t1, 48                       // mask for tap-pair {0,1} windows
    vaddi.bu vr7, vr6, 2                  // mask for tap-pair {2,3} windows
    slli.d t0, a3, 1 //stride * 2         (t0/t1 not used by this one-row loop)
    add.d t1, t0, a3 //stride * 3
    addi.d a2, a2, -1 //src -= 1
    vreplvei.h vr5, vr0, 1                // taps {2,3} splatted per 16-bit lane
    vreplvei.h vr0, vr0, 0                // taps {0,1} splatted per 16-bit lane
.LOOP_UNI_W_H6:
    vld vr8, a2, 0
    add.d a2, a2, a3
    vshuf.b vr10, vr8, vr8, vr6           // pixel pairs for taps {0,1}
    vshuf.b vr11, vr8, vr8, vr7           // pixel pairs for taps {2,3}
    CALC_EPEL_FILTER_LSX vr14, vr15       // filter + weight -> 32-bit results
    vssrani.h.w vr15, vr14, 0             // saturate w -> h
    vssrani.bu.h vr15, vr15, 0            // saturate h -> u8
    fst.s f15, a0, 0                      // pixels 0..3
    vstelm.h vr15, a0, 4, 2               // pixels 4..5
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .LOOP_UNI_W_H6
endfunc
|
|
|
/* Uni-directional weighted 4-tap horizontal EPEL, 6 pixels wide, 8-bit, LASX. */
function ff_hevc_put_hevc_epel_uni_w_h6_8_lasx
    LOAD_VAR 256                          // xr1=wx, xr2=rounding offset, xr3=shift, xr4=ox
    ld.d t0, sp, 0 //mx
    addi.d t0, t0, -1                     // filter table index = mx - 1
    slli.w t0, t0, 2                      // 4 bytes per 4-tap filter entry
    la.local t1, ff_hevc_epel_filters
    vldx vr0, t1, t0 //filter
    xvreplve0.w xr0, xr0                  // all 4 taps splatted per 32-bit lane
    la.local t1, shufb
    xvld xr6, t1, 64                      // 256-bit window-gather mask
    slli.d t0, a3, 1 //stride * 2         (t0/t1 not used by this one-row loop)
    add.d t1, t0, a3 //stride * 3
    addi.d a2, a2, -1 //src -= 1
.LOOP_UNI_W_H6_LASX:
    vld vr8, a2, 0
    xvreplve0.q xr8, xr8                  // same 16 source bytes in both lanes
    add.d a2, a2, a3
    xvshuf.b xr12, xr8, xr8, xr6          // gather all filter windows
    CALC_EPEL_FILTER_LASX xr14            // filter + weight -> 32-bit results
    xvpermi.q xr15, xr14, 0x01            // high lane down for narrowing
    vssrani.h.w vr15, vr14, 0             // saturate w -> h
    vssrani.bu.h vr15, vr15, 0            // saturate h -> u8
    fst.s f15, a0, 0                      // pixels 0..3
    vstelm.h vr15, a0, 4, 2               // pixels 4..5
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .LOOP_UNI_W_H6_LASX
endfunc
|
|
|
/* Uni-directional weighted 4-tap horizontal EPEL, 8 pixels wide, 8-bit, LSX. */
function ff_hevc_put_hevc_epel_uni_w_h8_8_lsx
    LOAD_VAR 128                          // vr1=wx, vr2=rounding offset, vr3=shift, vr4=ox
    ld.d t0, sp, 0 //mx
    addi.d t0, t0, -1                     // filter table index = mx - 1
    slli.w t0, t0, 2                      // 4 bytes per 4-tap filter entry
    la.local t1, ff_hevc_epel_filters
    vldx vr0, t1, t0 //filter
    vreplvei.w vr0, vr0, 0
    la.local t1, shufb
    vld vr6, t1, 48                       // mask for tap-pair {0,1} windows
    vaddi.bu vr7, vr6, 2                  // mask for tap-pair {2,3} windows
    slli.d t0, a3, 1 //stride * 2         (t0/t1 not used by this one-row loop)
    add.d t1, t0, a3 //stride * 3
    addi.d a2, a2, -1 //src -= 1
    vreplvei.h vr5, vr0, 1                // taps {2,3}
    vreplvei.h vr0, vr0, 0                // taps {0,1}
.LOOP_UNI_W_H8:
    vld vr8, a2, 0
    add.d a2, a2, a3
    vshuf.b vr10, vr8, vr8, vr6
    vshuf.b vr11, vr8, vr8, vr7
    CALC_EPEL_FILTER_LSX vr14, vr15       // filter + weight -> 32-bit results
    vssrani.h.w vr15, vr14, 0
    vssrani.bu.h vr15, vr15, 0
    fst.d f15, a0, 0                      // store 8 pixels
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .LOOP_UNI_W_H8
endfunc
|
|
|
/* Uni-directional weighted 4-tap horizontal EPEL, 8 pixels wide, 8-bit, LASX. */
function ff_hevc_put_hevc_epel_uni_w_h8_8_lasx
    LOAD_VAR 256                          // xr1=wx, xr2=rounding offset, xr3=shift, xr4=ox
    ld.d t0, sp, 0 //mx
    addi.d t0, t0, -1                     // filter table index = mx - 1
    slli.w t0, t0, 2                      // 4 bytes per 4-tap filter entry
    la.local t1, ff_hevc_epel_filters
    vldx vr0, t1, t0 //filter
    xvreplve0.w xr0, xr0
    la.local t1, shufb
    xvld xr6, t1, 64                      // 256-bit window-gather mask
    slli.d t0, a3, 1 //stride * 2         (t0/t1 not used by this one-row loop)
    add.d t1, t0, a3 //stride * 3
    addi.d a2, a2, -1 //src -= 1
.LOOP_UNI_W_H8_LASX:
    vld vr8, a2, 0
    xvreplve0.q xr8, xr8                  // same 16 source bytes in both lanes
    add.d a2, a2, a3
    xvshuf.b xr12, xr8, xr8, xr6
    CALC_EPEL_FILTER_LASX xr14            // filter + weight -> 32-bit results
    xvpermi.q xr15, xr14, 0x01            // high lane down for narrowing
    vssrani.h.w vr15, vr14, 0
    vssrani.bu.h vr15, vr15, 0
    fst.d f15, a0, 0                      // store 8 pixels
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .LOOP_UNI_W_H8_LASX
endfunc
|
|
|
/* One row of 16 uni-weighted horizontal EPEL pixels, LSX.
 * Two 8-pixel halves loaded from a2+\idx0 and a2+\idx1 (overlapping by the
 * 3-pixel filter margin), filtered via CALC_EPEL_FILTER_LSX, then saturated
 * down to 16 u8 and stored at a0+\idx2. Clobbers vr8, vr10/11, vr14..vr17. */
.macro EPEL_UNI_W_H16_LOOP_LSX idx0, idx1, idx2
    vld vr8, a2, \idx0                    // first 8 output pixels' source
    vshuf.b vr10, vr8, vr8, vr6
    vshuf.b vr11, vr8, vr8, vr7
    CALC_EPEL_FILTER_LSX vr14, vr15
    vld vr8, a2, \idx1                    // second 8 output pixels' source
    vshuf.b vr10, vr8, vr8, vr6
    vshuf.b vr11, vr8, vr8, vr7
    CALC_EPEL_FILTER_LSX vr16, vr17
    vssrani.h.w vr15, vr14, 0             // w -> h, pixels 0..7
    vssrani.h.w vr17, vr16, 0             // w -> h, pixels 8..15
    vssrani.bu.h vr17, vr15, 0            // h -> u8, all 16
    vst vr17, a0, \idx2
.endm
|
|
|
/* One row of 16 (or 12 when \w == 12) uni-weighted horizontal EPEL pixels, LASX.
 * Loads 32 bytes at a2+\idx0; xr9 = bytes 8.. (xvpermi.d 0x09 pulls 64-bit
 * elements 1,2 down) for the second 8 pixels. Results are narrowed across
 * lanes and re-ordered (vpermi.w 0xd8) before the store at a0+\idx2. */
.macro EPEL_UNI_W_H16_LOOP_LASX idx0, idx2, w
    xvld xr8, a2, \idx0
    xvpermi.d xr9, xr8, 0x09              // source for pixels 8..15
    xvreplve0.q xr8, xr8
    xvshuf.b xr12, xr8, xr8, xr6
    CALC_EPEL_FILTER_LASX xr14            // pixels 0..7
    xvreplve0.q xr8, xr9
    xvshuf.b xr12, xr8, xr8, xr6
    CALC_EPEL_FILTER_LASX xr16            // pixels 8..15
    xvssrani.h.w xr16, xr14, 0            // w -> h, lane-interleaved
    xvpermi.q xr17, xr16, 0x01
    vssrani.bu.h vr17, vr16, 0            // h -> u8
    vpermi.w vr17, vr17, 0xd8             // undo lane interleave
    .if \w == 12
    fst.d f17, a0, 0                      // pixels 0..7
    vstelm.w vr17, a0, 8, 2               // pixels 8..11
    .else
    vst vr17, a0, \idx2
    .endif
.endm
|
|
|
/* Uni-directional weighted 4-tap horizontal EPEL, 12 pixels wide, 8-bit, LSX.
 * Computes 16 pixels per row (two 8-pixel halves) but stores only 12. */
function ff_hevc_put_hevc_epel_uni_w_h12_8_lsx
    LOAD_VAR 128                          // vr1=wx, vr2=rounding offset, vr3=shift, vr4=ox
    ld.d t0, sp, 0 //mx
    addi.d t0, t0, -1                     // filter table index = mx - 1
    slli.w t0, t0, 2                      // 4 bytes per 4-tap filter entry
    la.local t1, ff_hevc_epel_filters
    vldx vr0, t1, t0 //filter
    vreplvei.w vr0, vr0, 0
    la.local t1, shufb
    vld vr6, t1, 48                       // mask for tap-pair {0,1} windows
    vaddi.bu vr7, vr6, 2                  // mask for tap-pair {2,3} windows
    slli.d t0, a3, 1 //stride * 2         (t0/t1 not used by this one-row loop)
    add.d t1, t0, a3 //stride * 3
    addi.d a2, a2, -1 //src -= 1
    vreplvei.h vr5, vr0, 1                // taps {2,3}
    vreplvei.h vr0, vr0, 0                // taps {0,1}
.LOOP_UNI_W_H12:
    vld vr8, a2, 0
    vshuf.b vr10, vr8, vr8, vr6
    vshuf.b vr11, vr8, vr8, vr7
    CALC_EPEL_FILTER_LSX vr14, vr15       // pixels 0..7
    vld vr8, a2, 8
    vshuf.b vr10, vr8, vr8, vr6
    vshuf.b vr11, vr8, vr8, vr7
    CALC_EPEL_FILTER_LSX vr16, vr17       // pixels 8..15
    vssrani.h.w vr15, vr14, 0
    vssrani.h.w vr17, vr16, 0
    vssrani.bu.h vr17, vr15, 0            // all 16 as u8
    fst.d f17, a0, 0                      // pixels 0..7
    vstelm.w vr17, a0, 8, 2               // pixels 8..11
    add.d a2, a2, a3
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .LOOP_UNI_W_H12
endfunc
|
|
|
/* Uni-directional weighted 4-tap horizontal EPEL, 12 pixels wide, 8-bit, LASX. */
function ff_hevc_put_hevc_epel_uni_w_h12_8_lasx
    LOAD_VAR 256                          // xr1=wx, xr2=rounding offset, xr3=shift, xr4=ox
    ld.d t0, sp, 0 //mx
    addi.d t0, t0, -1                     // filter table index = mx - 1
    slli.w t0, t0, 2                      // 4 bytes per 4-tap filter entry
    la.local t1, ff_hevc_epel_filters
    vldx vr0, t1, t0 //filter
    xvreplve0.w xr0, xr0
    la.local t1, shufb
    xvld xr6, t1, 64                      // 256-bit window-gather mask
    slli.d t0, a3, 1 //stride * 2         (t0/t1 not used by this one-row loop)
    add.d t1, t0, a3 //stride * 3
    addi.d a2, a2, -1 //src -= 1
.LOOP_UNI_W_H12_LASX:
    EPEL_UNI_W_H16_LOOP_LASX 0, 0, 12     // w=12 path stores 8+4 bytes
    add.d a2, a2, a3
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .LOOP_UNI_W_H12_LASX
endfunc
|
|
|
/* Uni-directional weighted 4-tap horizontal EPEL, 16 pixels wide, 8-bit, LSX. */
function ff_hevc_put_hevc_epel_uni_w_h16_8_lsx
    LOAD_VAR 128                          // vr1=wx, vr2=rounding offset, vr3=shift, vr4=ox
    ld.d t0, sp, 0 //mx
    addi.d t0, t0, -1                     // filter table index = mx - 1
    slli.w t0, t0, 2                      // 4 bytes per 4-tap filter entry
    la.local t1, ff_hevc_epel_filters
    vldx vr0, t1, t0 //filter
    vreplvei.w vr0, vr0, 0
    la.local t1, shufb
    vld vr6, t1, 48                       // mask for tap-pair {0,1} windows
    vaddi.bu vr7, vr6, 2                  // mask for tap-pair {2,3} windows
    slli.d t0, a3, 1 //stride * 2         (t0/t1 not used by this one-row loop)
    add.d t1, t0, a3 //stride * 3
    addi.d a2, a2, -1 //src -= 1
    vreplvei.h vr5, vr0, 1                // taps {2,3}
    vreplvei.h vr0, vr0, 0                // taps {0,1}
.LOOP_UNI_W_H16:
    EPEL_UNI_W_H16_LOOP_LSX 0, 8, 0       // 16 pixels from src+0 / src+8
    add.d a2, a2, a3
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .LOOP_UNI_W_H16
endfunc
|
|
|
/* Uni-directional weighted 4-tap horizontal EPEL, 16 pixels wide, 8-bit, LASX. */
function ff_hevc_put_hevc_epel_uni_w_h16_8_lasx
    LOAD_VAR 256                          // xr1=wx, xr2=rounding offset, xr3=shift, xr4=ox
    ld.d t0, sp, 0 //mx
    addi.d t0, t0, -1                     // filter table index = mx - 1
    slli.w t0, t0, 2                      // 4 bytes per 4-tap filter entry
    la.local t1, ff_hevc_epel_filters
    vldx vr0, t1, t0 //filter
    xvreplve0.w xr0, xr0
    la.local t1, shufb
    xvld xr6, t1, 64                      // 256-bit window-gather mask
    slli.d t0, a3, 1 //stride * 2         (t0/t1 not used by this one-row loop)
    add.d t1, t0, a3 //stride * 3
    addi.d a2, a2, -1 //src -= 1
.LOOP_UNI_W_H16_LASX:
    EPEL_UNI_W_H16_LOOP_LASX 0, 0, 16
    add.d a2, a2, a3
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .LOOP_UNI_W_H16_LASX
endfunc
|
|
|
/* Uni-directional weighted 4-tap horizontal EPEL, 24 pixels wide, 8-bit, LSX.
 * 16 pixels via the shared loop macro, then 8 more inline. */
function ff_hevc_put_hevc_epel_uni_w_h24_8_lsx
    LOAD_VAR 128                          // vr1=wx, vr2=rounding offset, vr3=shift, vr4=ox
    ld.d t0, sp, 0 //mx
    addi.d t0, t0, -1                     // filter table index = mx - 1
    slli.w t0, t0, 2                      // 4 bytes per 4-tap filter entry
    la.local t1, ff_hevc_epel_filters
    vldx vr0, t1, t0 //filter
    vreplvei.w vr0, vr0, 0
    la.local t1, shufb
    vld vr6, t1, 48                       // mask for tap-pair {0,1} windows
    vaddi.bu vr7, vr6, 2                  // mask for tap-pair {2,3} windows
    slli.d t0, a3, 1 //stride * 2         (t0/t1 not used by this one-row loop)
    add.d t1, t0, a3 //stride * 3
    addi.d a2, a2, -1 //src -= 1
    vreplvei.h vr5, vr0, 1                // taps {2,3}
    vreplvei.h vr0, vr0, 0                // taps {0,1}
.LOOP_UNI_W_H24:
    EPEL_UNI_W_H16_LOOP_LSX 0, 8, 0       // pixels 0..15
    vld vr8, a2, 16                       // pixels 16..23
    add.d a2, a2, a3
    vshuf.b vr10, vr8, vr8, vr6
    vshuf.b vr11, vr8, vr8, vr7
    CALC_EPEL_FILTER_LSX vr18, vr19
    vssrani.h.w vr19, vr18, 0
    vssrani.bu.h vr19, vr19, 0
    fst.d f19, a0, 16                     // store pixels 16..23
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .LOOP_UNI_W_H24
endfunc
|
|
|
/* Uni-directional weighted 4-tap horizontal EPEL, 24 pixels wide, 8-bit, LASX. */
function ff_hevc_put_hevc_epel_uni_w_h24_8_lasx
    LOAD_VAR 256                          // xr1=wx, xr2=rounding offset, xr3=shift, xr4=ox
    ld.d t0, sp, 0 //mx
    addi.d t0, t0, -1                     // filter table index = mx - 1
    slli.w t0, t0, 2                      // 4 bytes per 4-tap filter entry
    la.local t1, ff_hevc_epel_filters
    vldx vr0, t1, t0 //filter
    xvreplve0.w xr0, xr0
    la.local t1, shufb
    xvld xr6, t1, 64                      // 256-bit window-gather mask
    slli.d t0, a3, 1 //stride * 2         (t0/t1 not used by this one-row loop)
    add.d t1, t0, a3 //stride * 3
    addi.d a2, a2, -1 //src -= 1
.LOOP_UNI_W_H24_LASX:
    EPEL_UNI_W_H16_LOOP_LASX 0, 0, 24     // pixels 0..15
    vld vr8, a2, 16                       // pixels 16..23
    add.d a2, a2, a3
    xvreplve0.q xr8, xr8
    xvshuf.b xr12, xr8, xr8, xr6
    CALC_EPEL_FILTER_LASX xr14
    xvpermi.q xr15, xr14, 0x01
    vssrani.h.w vr15, vr14, 0
    vssrani.bu.h vr15, vr15, 0
    fst.d f15, a0, 16                     // store pixels 16..23
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .LOOP_UNI_W_H24_LASX
endfunc
|
|
|
/* Uni-directional weighted 4-tap horizontal EPEL, 32 pixels wide, 8-bit, LSX. */
function ff_hevc_put_hevc_epel_uni_w_h32_8_lsx
    LOAD_VAR 128                          // vr1=wx, vr2=rounding offset, vr3=shift, vr4=ox
    ld.d t0, sp, 0 //mx
    addi.d t0, t0, -1                     // filter table index = mx - 1
    slli.w t0, t0, 2                      // 4 bytes per 4-tap filter entry
    la.local t1, ff_hevc_epel_filters
    vldx vr0, t1, t0 //filter
    vreplvei.w vr0, vr0, 0
    la.local t1, shufb
    vld vr6, t1, 48                       // mask for tap-pair {0,1} windows
    vaddi.bu vr7, vr6, 2                  // mask for tap-pair {2,3} windows
    slli.d t0, a3, 1 //stride * 2         (t0/t1 not used by this one-row loop)
    add.d t1, t0, a3 //stride * 3
    addi.d a2, a2, -1 //src -= 1
    vreplvei.h vr5, vr0, 1                // taps {2,3}
    vreplvei.h vr0, vr0, 0                // taps {0,1}
.LOOP_UNI_W_H32:
    EPEL_UNI_W_H16_LOOP_LSX 0, 8, 0       // pixels 0..15
    EPEL_UNI_W_H16_LOOP_LSX 16, 24, 16    // pixels 16..31
    add.d a2, a2, a3
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .LOOP_UNI_W_H32
endfunc
|
|
|
/* Uni-directional weighted 4-tap horizontal EPEL, 32 pixels wide, 8-bit, LASX. */
function ff_hevc_put_hevc_epel_uni_w_h32_8_lasx
    LOAD_VAR 256                          // xr1=wx, xr2=rounding offset, xr3=shift, xr4=ox
    ld.d t0, sp, 0 //mx
    addi.d t0, t0, -1                     // filter table index = mx - 1
    slli.w t0, t0, 2                      // 4 bytes per 4-tap filter entry
    la.local t1, ff_hevc_epel_filters
    vldx vr0, t1, t0 //filter
    xvreplve0.w xr0, xr0
    la.local t1, shufb
    xvld xr6, t1, 64                      // 256-bit window-gather mask
    slli.d t0, a3, 1 //stride * 2         (t0/t1 not used by this one-row loop)
    add.d t1, t0, a3 //stride * 3
    addi.d a2, a2, -1 //src -= 1
.LOOP_UNI_W_H32_LASX:
    EPEL_UNI_W_H16_LOOP_LASX 0, 0, 32     // pixels 0..15
    EPEL_UNI_W_H16_LOOP_LASX 16, 16, 32   // pixels 16..31
    add.d a2, a2, a3
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .LOOP_UNI_W_H32_LASX
endfunc
|
|
|
/* Uni-directional weighted 4-tap horizontal EPEL, 48 pixels wide, 8-bit, LSX. */
function ff_hevc_put_hevc_epel_uni_w_h48_8_lsx
    LOAD_VAR 128                          // vr1=wx, vr2=rounding offset, vr3=shift, vr4=ox
    ld.d t0, sp, 0 //mx
    addi.d t0, t0, -1                     // filter table index = mx - 1
    slli.w t0, t0, 2                      // 4 bytes per 4-tap filter entry
    la.local t1, ff_hevc_epel_filters
    vldx vr0, t1, t0 //filter
    vreplvei.w vr0, vr0, 0
    la.local t1, shufb
    vld vr6, t1, 48                       // mask for tap-pair {0,1} windows
    vaddi.bu vr7, vr6, 2                  // mask for tap-pair {2,3} windows
    slli.d t0, a3, 1 //stride * 2         (t0/t1 not used by this one-row loop)
    add.d t1, t0, a3 //stride * 3
    addi.d a2, a2, -1 //src -= 1
    vreplvei.h vr5, vr0, 1                // taps {2,3}
    vreplvei.h vr0, vr0, 0                // taps {0,1}
.LOOP_UNI_W_H48:
    EPEL_UNI_W_H16_LOOP_LSX 0, 8, 0       // pixels 0..15
    EPEL_UNI_W_H16_LOOP_LSX 16, 24, 16    // pixels 16..31
    EPEL_UNI_W_H16_LOOP_LSX 32, 40, 32    // pixels 32..47
    add.d a2, a2, a3
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .LOOP_UNI_W_H48
endfunc
|
|
|
/* Uni-directional weighted 4-tap horizontal EPEL, 48 pixels wide, 8-bit, LASX. */
function ff_hevc_put_hevc_epel_uni_w_h48_8_lasx
    LOAD_VAR 256                          // xr1=wx, xr2=rounding offset, xr3=shift, xr4=ox
    ld.d t0, sp, 0 //mx
    addi.d t0, t0, -1                     // filter table index = mx - 1
    slli.w t0, t0, 2                      // 4 bytes per 4-tap filter entry
    la.local t1, ff_hevc_epel_filters
    vldx vr0, t1, t0 //filter
    xvreplve0.w xr0, xr0
    la.local t1, shufb
    xvld xr6, t1, 64                      // 256-bit window-gather mask
    slli.d t0, a3, 1 //stride * 2         (t0/t1 not used by this one-row loop)
    add.d t1, t0, a3 //stride * 3
    addi.d a2, a2, -1 //src -= 1
.LOOP_UNI_W_H48_LASX:
    EPEL_UNI_W_H16_LOOP_LASX 0, 0, 48     // pixels 0..15
    EPEL_UNI_W_H16_LOOP_LASX 16, 16, 48   // pixels 16..31
    EPEL_UNI_W_H16_LOOP_LASX 32, 32, 48   // pixels 32..47
    add.d a2, a2, a3
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .LOOP_UNI_W_H48_LASX
endfunc
|
|
|
/* Uni-directional weighted 4-tap horizontal EPEL, 64 pixels wide, 8-bit, LSX. */
function ff_hevc_put_hevc_epel_uni_w_h64_8_lsx
    LOAD_VAR 128                          // vr1=wx, vr2=rounding offset, vr3=shift, vr4=ox
    ld.d t0, sp, 0 //mx
    addi.d t0, t0, -1                     // filter table index = mx - 1
    slli.w t0, t0, 2                      // 4 bytes per 4-tap filter entry
    la.local t1, ff_hevc_epel_filters
    vldx vr0, t1, t0 //filter
    vreplvei.w vr0, vr0, 0
    la.local t1, shufb
    vld vr6, t1, 48                       // mask for tap-pair {0,1} windows
    vaddi.bu vr7, vr6, 2                  // mask for tap-pair {2,3} windows
    slli.d t0, a3, 1 //stride * 2         (t0/t1 not used by this one-row loop)
    add.d t1, t0, a3 //stride * 3
    addi.d a2, a2, -1 //src -= 1
    vreplvei.h vr5, vr0, 1                // taps {2,3}
    vreplvei.h vr0, vr0, 0                // taps {0,1}
.LOOP_UNI_W_H64:
    EPEL_UNI_W_H16_LOOP_LSX 0, 8, 0       // pixels 0..15
    EPEL_UNI_W_H16_LOOP_LSX 16, 24, 16    // pixels 16..31
    EPEL_UNI_W_H16_LOOP_LSX 32, 40, 32    // pixels 32..47
    EPEL_UNI_W_H16_LOOP_LSX 48, 56, 48    // pixels 48..63
    add.d a2, a2, a3
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .LOOP_UNI_W_H64
endfunc
|
|
|
/* Uni-directional weighted 4-tap horizontal EPEL, 64 pixels wide, 8-bit, LASX. */
function ff_hevc_put_hevc_epel_uni_w_h64_8_lasx
    LOAD_VAR 256                          // xr1=wx, xr2=rounding offset, xr3=shift, xr4=ox
    ld.d t0, sp, 0 //mx
    addi.d t0, t0, -1                     // filter table index = mx - 1
    slli.w t0, t0, 2                      // 4 bytes per 4-tap filter entry
    la.local t1, ff_hevc_epel_filters
    vldx vr0, t1, t0 //filter
    xvreplve0.w xr0, xr0
    la.local t1, shufb
    xvld xr6, t1, 64                      // 256-bit window-gather mask
    slli.d t0, a3, 1 //stride * 2         (t0/t1 not used by this one-row loop)
    add.d t1, t0, a3 //stride * 3
    addi.d a2, a2, -1 //src -= 1
.LOOP_UNI_W_H64_LASX:
    EPEL_UNI_W_H16_LOOP_LASX 0, 0, 64     // pixels 0..15
    EPEL_UNI_W_H16_LOOP_LASX 16, 16, 64   // pixels 16..31
    EPEL_UNI_W_H16_LOOP_LASX 32, 32, 64   // pixels 32..47
    EPEL_UNI_W_H16_LOOP_LASX 48, 48, 64   // pixels 48..63
    add.d a2, a2, a3
    add.d a0, a0, a1
    addi.d a4, a4, -1
    bnez a4, .LOOP_UNI_W_H64_LASX
endfunc
|
|
|
/* |
|
* void FUNC(put_hevc_epel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, |
|
* const uint8_t *_src, ptrdiff_t _srcstride, |
|
* const int16_t *src2, int height, intptr_t mx, |
|
* intptr_t my, int width) |
|
*/ |
|
/* Bi-prediction 4-tap horizontal EPEL, 4 pixels wide, 8-bit, LSX.
 * a0=dst, a1=dst stride, a2=src, a3=src stride, a4=src2 (int16 plane),
 * a5=height, a6=mx; src2 advances 128 bytes per row.
 * dst = sat_u8((filter(src) + src2[x] + round) >> 7). */
function ff_hevc_put_hevc_bi_epel_h4_8_lsx
    addi.d a6, a6, -1                     // filter table index = mx - 1
    slli.w a6, a6, 2                      // 4 bytes per 4-tap filter entry
    la.local t0, ff_hevc_epel_filters
    vldx vr0, t0, a6 // filter
    vreplvei.w vr0, vr0, 0                // all 4 taps splatted per 32-bit lane
    la.local t0, shufb
    vld vr1, t0, 0 // mask                (four overlapping 4-byte windows)
    addi.d a2, a2, -1 // src -= 1
.LOOP_BI_EPEL_H4:
    vld vr4, a4, 0 // src2
    vld vr5, a2, 0
    add.d a2, a2, a3
    addi.d a4, a4, 128                    // next src2 row
    vshuf.b vr5, vr5, vr5, vr1
    vdp2.h.bu.b vr6, vr5, vr0 // EPEL_FILTER(src, 1)
    vsllwil.w.h vr4, vr4, 0               // widen src2 to 32-bit
    vhaddw.w.h vr6, vr6, vr6              // finish the 4-tap sums
    vadd.w vr6, vr6, vr4 // src2[x]
    vssrani.h.w vr6, vr6, 0
    vssrarni.bu.h vr6, vr6, 7             // round, >>7, saturate to u8
    fst.s f6, a0, 0                       // store 4 pixels
    add.d a0, a0, a1
    addi.d a5, a5, -1
    bnez a5, .LOOP_BI_EPEL_H4
endfunc
|
|
|
/* 8 pixels of 4-tap horizontal EPEL plus src2, LSX.
 * \in0:\in1 = adjacent source registers, \in2/\in3 = shuffle masks selecting
 * the pixel pairs for taps {0,1} and {2,3}; vr0/vr1 = splatted tap pairs;
 * vr4 = 8 x int16 from src2. \out0 = 16-bit sums (not yet rounded/narrowed).
 * Clobbers vr6..vr8. */
.macro PUT_HEVC_BI_EPEL_H8_LSX in0, in1, in2, in3, out0
    vshuf.b vr6, \in1, \in0, \in2
    vshuf.b vr7, \in1, \in0, \in3
    vdp2.h.bu.b vr8, vr6, vr0 // EPEL_FILTER(src, 1)
    vdp2add.h.bu.b vr8, vr7, vr1 // EPEL_FILTER(src, 1)
    vsadd.h \out0, vr8, vr4 // src2[x]
.endm
|
|
|
/* 16 pixels of 4-tap horizontal EPEL plus src2, LASX (256-bit analogue of
 * PUT_HEVC_BI_EPEL_H8_LSX). xr0/xr1 = splatted tap pairs, xr4 = 16 x int16
 * from src2. \out0 = 16-bit sums. Clobbers xr6..xr8. */
.macro PUT_HEVC_BI_EPEL_H16_LASX in0, in1, in2, in3, out0
    xvshuf.b xr6, \in1, \in0, \in2
    xvshuf.b xr7, \in1, \in0, \in3
    xvdp2.h.bu.b xr8, xr6, xr0 // EPEL_FILTER(src, 1)
    xvdp2add.h.bu.b xr8, xr7, xr1 // EPEL_FILTER(src, 1)
    xvsadd.h \out0, xr8, xr4 // src2[x]
.endm
|
|
|
/* Bi-prediction 4-tap horizontal EPEL, 6 pixels wide, 8-bit, LSX. */
function ff_hevc_put_hevc_bi_epel_h6_8_lsx
    addi.d a6, a6, -1                     // filter table index = mx - 1
    slli.w a6, a6, 2                      // 4 bytes per 4-tap filter entry
    la.local t0, ff_hevc_epel_filters
    vldx vr0, t0, a6 // filter
    vreplvei.h vr1, vr0, 1                // taps {2,3}
    vreplvei.h vr0, vr0, 0                // taps {0,1}
    la.local t0, shufb
    vld vr2, t0, 48// mask
    vaddi.bu vr3, vr2, 2                  // mask shifted by 2 for taps {2,3}
    addi.d a2, a2, -1 // src -= 1
.LOOP_BI_EPEL_H6:
    vld vr4, a4, 0 // src2
    vld vr5, a2, 0
    add.d a2, a2, a3
    addi.d a4, a4, 128                    // next src2 row
    PUT_HEVC_BI_EPEL_H8_LSX vr5, vr5, vr2, vr3, vr7
    vssrarni.bu.h vr7, vr7, 7             // round, >>7, saturate to u8
    fst.s f7, a0, 0                       // pixels 0..3
    vstelm.h vr7, a0, 4, 2                // pixels 4..5
    add.d a0, a0, a1
    addi.d a5, a5, -1
    bnez a5, .LOOP_BI_EPEL_H6
endfunc
|
|
|
/* Bi-prediction 4-tap horizontal EPEL, 8 pixels wide, 8-bit, LSX. */
function ff_hevc_put_hevc_bi_epel_h8_8_lsx
    addi.d a6, a6, -1                     // filter table index = mx - 1
    slli.w a6, a6, 2                      // 4 bytes per 4-tap filter entry
    la.local t0, ff_hevc_epel_filters
    vldx vr0, t0, a6 // filter
    vreplvei.h vr1, vr0, 1                // taps {2,3}
    vreplvei.h vr0, vr0, 0                // taps {0,1}
    la.local t0, shufb
    vld vr2, t0, 48// mask
    vaddi.bu vr3, vr2, 2
    addi.d a2, a2, -1 // src -= 1
.LOOP_BI_EPEL_H8:
    vld vr4, a4, 0 // src2
    vld vr5, a2, 0
    add.d a2, a2, a3
    addi.d a4, a4, 128                    // next src2 row
    PUT_HEVC_BI_EPEL_H8_LSX vr5, vr5, vr2, vr3, vr7
    vssrarni.bu.h vr7, vr7, 7             // round, >>7, saturate to u8
    fst.d f7, a0, 0                       // store 8 pixels
    add.d a0, a0, a1
    addi.d a5, a5, -1
    bnez a5, .LOOP_BI_EPEL_H8
endfunc
|
|
|
/* Bi-prediction 4-tap horizontal EPEL, 12 pixels wide, 8-bit, LSX.
 * Computes 16, stores 12 (8 bytes + one word). */
function ff_hevc_put_hevc_bi_epel_h12_8_lsx
    addi.d a6, a6, -1                     // filter table index = mx - 1
    slli.w a6, a6, 2                      // 4 bytes per 4-tap filter entry
    la.local t0, ff_hevc_epel_filters
    vldx vr0, t0, a6 // filter
    vreplvei.h vr1, vr0, 1                // taps {2,3}
    vreplvei.h vr0, vr0, 0                // taps {0,1}
    la.local t0, shufb
    vld vr2, t0, 48// mask
    vaddi.bu vr3, vr2, 2
    addi.d a2, a2, -1 // src -= 1
.LOOP_BI_EPEL_H12:
    vld vr4, a4, 0 // src2
    vld vr5, a2, 0
    PUT_HEVC_BI_EPEL_H8_LSX vr5, vr5, vr2, vr3, vr11   // pixels 0..7
    vld vr5, a2, 8
    vld vr4, a4, 16                       // src2[8..15]
    PUT_HEVC_BI_EPEL_H8_LSX vr5, vr5, vr2, vr3, vr12   // pixels 8..15
    vssrarni.bu.h vr12, vr11, 7           // round, >>7, saturate to u8
    fst.d f12, a0, 0                      // pixels 0..7
    vstelm.w vr12, a0, 8, 2               // pixels 8..11
    add.d a2, a2, a3
    addi.d a4, a4, 128                    // next src2 row
    add.d a0, a0, a1
    addi.d a5, a5, -1
    bnez a5, .LOOP_BI_EPEL_H12
endfunc
|
|
|
/* Bi-prediction 4-tap horizontal EPEL, 12 pixels wide, 8-bit, LASX. */
function ff_hevc_put_hevc_bi_epel_h12_8_lasx
    addi.d a6, a6, -1                     // filter table index = mx - 1
    slli.w a6, a6, 2                      // 4 bytes per 4-tap filter entry
    la.local t0, ff_hevc_epel_filters
    vldx vr0, t0, a6 // filter
    xvreplve0.q xr0, xr0                  // filter into both 128-bit lanes
    xvrepl128vei.h xr1, xr0, 1            // taps {2,3}
    xvrepl128vei.h xr0, xr0, 0            // taps {0,1}
    la.local t0, shufb
    xvld xr2, t0, 96// mask
    xvaddi.bu xr3, xr2, 2
    addi.d a2, a2, -1 // src -= 1
.LOOP_BI_EPEL_H12_LASX:
    xvld xr4, a4, 0 // src2
    xvld xr5, a2, 0
    xvpermi.d xr5, xr5, 0x94              // bytes 0..15 / 8..23 per lane
    PUT_HEVC_BI_EPEL_H16_LASX xr5, xr5, xr2, xr3, xr9
    xvpermi.q xr10, xr9, 0x01             // high lane down for narrowing
    vssrarni.bu.h vr10, vr9, 7            // round, >>7, saturate to u8
    fst.d f10, a0, 0                      // pixels 0..7
    vstelm.w vr10, a0, 8, 2               // pixels 8..11
    add.d a2, a2, a3
    addi.d a4, a4, 128                    // next src2 row
    add.d a0, a0, a1
    addi.d a5, a5, -1
    bnez a5, .LOOP_BI_EPEL_H12_LASX
endfunc
|
|
|
/* Bi-prediction 4-tap horizontal EPEL, 16 pixels wide, 8-bit, LSX. */
function ff_hevc_put_hevc_bi_epel_h16_8_lsx
    addi.d a6, a6, -1                     // filter table index = mx - 1
    slli.w a6, a6, 2                      // 4 bytes per 4-tap filter entry
    la.local t0, ff_hevc_epel_filters
    vldx vr0, t0, a6 // filter
    vreplvei.h vr1, vr0, 1                // taps {2,3}
    vreplvei.h vr0, vr0, 0                // taps {0,1}
    la.local t0, shufb
    vld vr2, t0, 48// mask
    vaddi.bu vr3, vr2, 2
    addi.d a2, a2, -1 // src -= 1
.LOOP_BI_EPEL_H16:
    vld vr4, a4, 0 // src2
    vld vr5, a2, 0
    PUT_HEVC_BI_EPEL_H8_LSX vr5, vr5, vr2, vr3, vr11   // pixels 0..7
    vld vr5, a2, 8
    vld vr4, a4, 16                       // src2[8..15]
    PUT_HEVC_BI_EPEL_H8_LSX vr5, vr5, vr2, vr3, vr12   // pixels 8..15
    vssrarni.bu.h vr12, vr11, 7           // round, >>7, saturate to u8
    vst vr12, a0, 0                       // store 16 pixels
    add.d a2, a2, a3
    addi.d a4, a4, 128                    // next src2 row
    add.d a0, a0, a1
    addi.d a5, a5, -1
    bnez a5, .LOOP_BI_EPEL_H16
endfunc
|
|
|
/* Bi-prediction 4-tap horizontal EPEL, 16 pixels wide, 8-bit, LASX. */
function ff_hevc_put_hevc_bi_epel_h16_8_lasx
    addi.d a6, a6, -1                     // filter table index = mx - 1
    slli.w a6, a6, 2                      // 4 bytes per 4-tap filter entry
    la.local t0, ff_hevc_epel_filters
    vldx vr0, t0, a6 // filter
    xvreplve0.q xr0, xr0                  // filter into both 128-bit lanes
    xvrepl128vei.h xr1, xr0, 1            // taps {2,3}
    xvrepl128vei.h xr0, xr0, 0            // taps {0,1}
    la.local t0, shufb
    xvld xr2, t0, 96// mask
    xvaddi.bu xr3, xr2, 2
    addi.d a2, a2, -1 // src -= 1
.LOOP_BI_EPEL_H16_LASX:
    xvld xr4, a4, 0 // src2
    xvld xr5, a2, 0
    xvpermi.d xr5, xr5, 0x94              // bytes 0..15 / 8..23 per lane
    PUT_HEVC_BI_EPEL_H16_LASX xr5, xr5, xr2, xr3, xr9
    xvpermi.q xr10, xr9, 0x01             // high lane down for narrowing
    vssrarni.bu.h vr10, vr9, 7            // round, >>7, saturate to u8
    vst vr10, a0, 0                       // store 16 pixels
    add.d a2, a2, a3
    addi.d a4, a4, 128                    // next src2 row
    add.d a0, a0, a1
    addi.d a5, a5, -1
    bnez a5, .LOOP_BI_EPEL_H16_LASX
endfunc
|
|
|
/* Bi-prediction 4-tap horizontal EPEL, 32 pixels wide, 8-bit, LASX.
 * a0=dst, a1=dst stride, a2=src, a3=src stride, a4=src2 (int16 plane),
 * a5=height, a6=mx; src2 advances 128 bytes per row.
 * Fix: removed a dead "xvpermi.q xr15, xr5, 0x01" — xr15 was unconditionally
 * reloaded from a2+16 before any use, so the shuffle result was never read. */
function ff_hevc_put_hevc_bi_epel_h32_8_lasx
    addi.d a6, a6, -1                     // filter table index = mx - 1
    slli.w a6, a6, 2                      // 4 bytes per 4-tap filter entry
    la.local t0, ff_hevc_epel_filters
    vldx vr0, t0, a6 // filter
    xvreplve0.q xr0, xr0                  // filter into both 128-bit lanes
    xvrepl128vei.h xr1, xr0, 1            // taps {2,3}
    xvrepl128vei.h xr0, xr0, 0            // taps {0,1}
    la.local t0, shufb
    xvld xr2, t0, 96// mask
    xvaddi.bu xr3, xr2, 2
    addi.d a2, a2, -1 // src -= 1
.LOOP_BI_EPEL_H32_LASX:
    xvld xr4, a4, 0 // src2
    xvld xr5, a2, 0
    xvpermi.d xr5, xr5, 0x94              // bytes 0..15 / 8..23 per lane
    PUT_HEVC_BI_EPEL_H16_LASX xr5, xr5, xr2, xr3, xr9    // pixels 0..15
    xvld xr4, a4, 32                      // src2[16..31]
    xvld xr15, a2, 16
    xvpermi.d xr15, xr15, 0x94            // bytes 16..31 / 24..39 per lane
    PUT_HEVC_BI_EPEL_H16_LASX xr15, xr15, xr2, xr3, xr11 // pixels 16..31
    xvssrarni.bu.h xr11, xr9, 7           // round, >>7, saturate to u8
    xvpermi.d xr11, xr11, 0xd8            // undo lane interleave
    xvst xr11, a0, 0                      // store 32 pixels
    add.d a2, a2, a3
    addi.d a4, a4, 128                    // next src2 row
    add.d a0, a0, a1
    addi.d a5, a5, -1
    bnez a5, .LOOP_BI_EPEL_H32_LASX
endfunc
|
|
|
/* Bi-prediction 4-tap horizontal EPEL, 48 pixels wide, 8-bit, LSX.
 * Six 8-pixel groups per row; vr21/vr22 are the base masks offset by 8/10 so
 * windows straddling two source registers can be gathered from a reg pair. */
function ff_hevc_put_hevc_bi_epel_h48_8_lsx
    addi.d a6, a6, -1                     // filter table index = mx - 1
    slli.w a6, a6, 2                      // 4 bytes per 4-tap filter entry
    la.local t0, ff_hevc_epel_filters
    vldx vr0, t0, a6// filter
    vreplvei.h vr1, vr0, 1                // taps {2,3}
    vreplvei.h vr0, vr0, 0                // taps {0,1}
    la.local t0, shufb
    vld vr2, t0, 48// mask
    vaddi.bu vr3, vr2, 2
    vaddi.bu vr21, vr2, 8                 // cross-register masks
    vaddi.bu vr22, vr2, 10
    addi.d a2, a2, -1 // src -= 1
.LOOP_BI_EPEL_H48:
    vld vr4, a4, 0 // src2
    vld vr5, a2, 0
    vld vr9, a2, 16
    vld vr10, a2, 32
    vld vr11, a2, 48
    PUT_HEVC_BI_EPEL_H8_LSX vr5, vr5, vr2, vr3, vr12      // pixels 0..7
    vld vr4, a4, 16
    PUT_HEVC_BI_EPEL_H8_LSX vr5, vr9, vr21, vr22, vr13    // pixels 8..15
    vld vr4, a4, 32
    PUT_HEVC_BI_EPEL_H8_LSX vr9, vr9, vr2, vr3, vr14      // pixels 16..23
    vld vr4, a4, 48
    PUT_HEVC_BI_EPEL_H8_LSX vr9, vr10, vr21, vr22, vr15   // pixels 24..31
    vld vr4, a4, 64
    PUT_HEVC_BI_EPEL_H8_LSX vr10, vr10, vr2, vr3, vr16    // pixels 32..39
    vld vr4, a4, 80
    PUT_HEVC_BI_EPEL_H8_LSX vr10, vr11, vr21, vr22, vr17  // pixels 40..47
    vssrarni.bu.h vr13, vr12, 7           // round, >>7, saturate to u8
    vssrarni.bu.h vr15, vr14, 7
    vssrarni.bu.h vr17, vr16, 7
    vst vr13, a0, 0
    vst vr15, a0, 16
    vst vr17, a0, 32
    add.d a2, a2, a3
    addi.d a4, a4, 128                    // next src2 row
    add.d a0, a0, a1
    addi.d a5, a5, -1
    bnez a5, .LOOP_BI_EPEL_H48
endfunc
|
|
|
/* Bi-prediction 4-tap horizontal EPEL, 48 pixels wide, 8-bit, LASX.
 * Three 16-pixel groups built from two 32-byte loads; xr9 is the middle
 * 16-pixel group spliced from the halves of xr5 and xr9 (xvpermi.q 0x21). */
function ff_hevc_put_hevc_bi_epel_h48_8_lasx
    addi.d a6, a6, -1                     // filter table index = mx - 1
    slli.w a6, a6, 2                      // 4 bytes per 4-tap filter entry
    la.local t0, ff_hevc_epel_filters
    vldx vr0, t0, a6 // filter
    xvreplve0.q xr0, xr0                  // filter into both 128-bit lanes
    xvrepl128vei.h xr1, xr0, 1            // taps {2,3}
    xvrepl128vei.h xr0, xr0, 0            // taps {0,1}
    la.local t0, shufb
    xvld xr2, t0, 96// mask
    xvaddi.bu xr3, xr2, 2
    addi.d a2, a2, -1 // src -= 1
.LOOP_BI_EPEL_H48_LASX:
    xvld xr4, a4, 0 // src2
    xvld xr5, a2, 0                       // bytes 0..31
    xvld xr9, a2, 32                      // bytes 32..63
    xvpermi.d xr10, xr9, 0x94             // group 2 source (32..47 / 40..55)
    xvpermi.q xr9, xr5, 0x21              // group 1 source = bytes 16..47
    xvpermi.d xr9, xr9, 0x94
    xvpermi.d xr5, xr5, 0x94              // group 0 source (0..15 / 8..23)
    PUT_HEVC_BI_EPEL_H16_LASX xr5, xr5, xr2, xr3, xr11    // pixels 0..15
    xvld xr4, a4, 32
    PUT_HEVC_BI_EPEL_H16_LASX xr9, xr9, xr2, xr3, xr12    // pixels 16..31
    xvld xr4, a4, 64
    PUT_HEVC_BI_EPEL_H16_LASX xr10, xr10, xr2, xr3, xr13  // pixels 32..47
    xvssrarni.bu.h xr12, xr11, 7          // round, >>7, saturate to u8
    xvpermi.d xr12, xr12, 0xd8            // undo lane interleave
    xvpermi.q xr14, xr13, 0x01
    vssrarni.bu.h vr14, vr13, 7
    xvst xr12, a0, 0                      // pixels 0..31
    vst vr14, a0, 32                      // pixels 32..47
    add.d a2, a2, a3
    addi.d a4, a4, 128                    // next src2 row
    add.d a0, a0, a1
    addi.d a5, a5, -1
    bnez a5, .LOOP_BI_EPEL_H48_LASX
endfunc
|
|
|
/* Bi-prediction 4-tap horizontal EPEL, 64 pixels wide, 8-bit, LSX.
 * Eight 8-pixel groups per row; vr21/vr22 gather windows straddling two
 * source registers (base masks offset by 8/10). */
function ff_hevc_put_hevc_bi_epel_h64_8_lsx
    addi.d a6, a6, -1                     // filter table index = mx - 1
    slli.w a6, a6, 2                      // 4 bytes per 4-tap filter entry
    la.local t0, ff_hevc_epel_filters
    vldx vr0, t0, a6// filter
    vreplvei.h vr1, vr0, 1                // taps {2,3}
    vreplvei.h vr0, vr0, 0                // taps {0,1}
    la.local t0, shufb
    vld vr2, t0, 48// mask
    vaddi.bu vr3, vr2, 2
    vaddi.bu vr21, vr2, 8                 // cross-register masks
    vaddi.bu vr22, vr2, 10
    addi.d a2, a2, -1 // src -= 1
.LOOP_BI_EPEL_H64:
    vld vr4, a4, 0 // src2
    vld vr5, a2, 0
    vld vr9, a2, 16
    vld vr10, a2, 32
    vld vr11, a2, 48
    vld vr12, a2, 64
    PUT_HEVC_BI_EPEL_H8_LSX vr5, vr5, vr2, vr3, vr13      // pixels 0..7
    vld vr4, a4, 16
    PUT_HEVC_BI_EPEL_H8_LSX vr5, vr9, vr21, vr22, vr14    // pixels 8..15
    vld vr4, a4, 32
    PUT_HEVC_BI_EPEL_H8_LSX vr9, vr9, vr2, vr3, vr15      // pixels 16..23
    vld vr4, a4, 48
    PUT_HEVC_BI_EPEL_H8_LSX vr9, vr10, vr21, vr22, vr16   // pixels 24..31
    vld vr4, a4, 64
    PUT_HEVC_BI_EPEL_H8_LSX vr10, vr10, vr2, vr3, vr17    // pixels 32..39
    vld vr4, a4, 80
    PUT_HEVC_BI_EPEL_H8_LSX vr10, vr11, vr21, vr22, vr18  // pixels 40..47
    vld vr4, a4, 96
    PUT_HEVC_BI_EPEL_H8_LSX vr11, vr11, vr2, vr3, vr19    // pixels 48..55
    vld vr4, a4, 112
    PUT_HEVC_BI_EPEL_H8_LSX vr11, vr12, vr21, vr22, vr20  // pixels 56..63
    vssrarni.bu.h vr14, vr13, 7           // round, >>7, saturate to u8
    vssrarni.bu.h vr16, vr15, 7
    vssrarni.bu.h vr18, vr17, 7
    vssrarni.bu.h vr20, vr19, 7
    vst vr14, a0, 0
    vst vr16, a0, 16
    vst vr18, a0, 32
    vst vr20, a0, 48
    add.d a2, a2, a3
    addi.d a4, a4, 128                    // next src2 row
    add.d a0, a0, a1
    addi.d a5, a5, -1
    bnez a5, .LOOP_BI_EPEL_H64
endfunc
|
|
|
/* Bi-prediction 4-tap horizontal EPEL, 64 pixels wide, 8-bit, LASX.
 * Four 16-pixel groups; the second group is spliced from the halves of two
 * 32-byte loads (xvpermi.q 0x21), the others come straight from memory. */
function ff_hevc_put_hevc_bi_epel_h64_8_lasx
    addi.d a6, a6, -1                     // filter table index = mx - 1
    slli.w a6, a6, 2                      // 4 bytes per 4-tap filter entry
    la.local t0, ff_hevc_epel_filters
    vldx vr0, t0, a6 // filter
    xvreplve0.q xr0, xr0                  // filter into both 128-bit lanes
    xvrepl128vei.h xr1, xr0, 1            // taps {2,3}
    xvrepl128vei.h xr0, xr0, 0            // taps {0,1}
    la.local t0, shufb
    xvld xr2, t0, 96// mask
    xvaddi.bu xr3, xr2, 2
    addi.d a2, a2, -1 // src -= 1
.LOOP_BI_EPEL_H64_LASX:
    xvld xr4, a4, 0 // src2
    xvld xr5, a2, 0                       // bytes 0..31
    xvld xr9, a2, 32                      // bytes 32..63
    xvld xr11, a2, 48                     // bytes 48..79
    xvpermi.d xr11, xr11, 0x94            // group 3 source (48..63 / 56..71)
    xvpermi.d xr10, xr9, 0x94             // group 2 source (32..47 / 40..55)
    xvpermi.q xr9, xr5, 0x21              // group 1 source = bytes 16..47
    xvpermi.d xr9, xr9, 0x94
    xvpermi.d xr5, xr5, 0x94              // group 0 source (0..15 / 8..23)
    PUT_HEVC_BI_EPEL_H16_LASX xr5, xr5, xr2, xr3, xr12    // pixels 0..15
    xvld xr4, a4, 32
    PUT_HEVC_BI_EPEL_H16_LASX xr9, xr9, xr2, xr3, xr13    // pixels 16..31
    xvld xr4, a4, 64
    PUT_HEVC_BI_EPEL_H16_LASX xr10, xr10, xr2, xr3, xr14  // pixels 32..47
    xvld xr4, a4, 96
    PUT_HEVC_BI_EPEL_H16_LASX xr11, xr11, xr2, xr3, xr15  // pixels 48..63
    xvssrarni.bu.h xr13, xr12, 7          // round, >>7, saturate to u8
    xvssrarni.bu.h xr15, xr14, 7
    xvpermi.d xr13, xr13, 0xd8            // undo lane interleave
    xvpermi.d xr15, xr15, 0xd8
    xvst xr13, a0, 0                      // pixels 0..31
    xvst xr15, a0, 32                     // pixels 32..63
    add.d a2, a2, a3
    addi.d a4, a4, 128                    // next src2 row
    add.d a0, a0, a1
    addi.d a5, a5, -1
    bnez a5, .LOOP_BI_EPEL_H64_LASX
endfunc
|
|
|